In [1]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from sklearn.datasets import load_iris, load_boston
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")

In [2]:
def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Compute accuracy separately for every class present in ``y_true``.

    For a class ``c`` the value is the fraction of samples whose true label
    is ``c`` that were also predicted as ``c`` (i.e. per-class recall).

    y_true - ground-truth class labels (array-like, e.g. ndarray or Series)
    y_pred - predicted class labels, same shape as ``y_true``

    Returns a dict: key - class label taken from ``y_true``,
    value - accuracy for that class.

    Raises ValueError if the two inputs do not have the same shape.
    """
    # Convert to plain arrays so samples are paired by POSITION. Pairing via a
    # DataFrame would align two pandas Series by index label instead, which
    # silently produces NaN rows when the indices differ.
    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)
    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            'y_true and y_pred must have the same shape, got {} and {}'.format(
                true_labels.shape, pred_labels.shape))
    # Resulting dictionary
    res = dict()
    # Iterate over the distinct class labels found in the ground truth
    for c in np.unique(true_labels):
        # Samples whose true label is the current class
        mask = true_labels == c
        # Share of those samples that were predicted as the same class
        res[c] = float(np.mean(pred_labels[mask] == c))
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    """
    Print a two-column table with the accuracy of every class.

    y_true - ground-truth class labels
    y_pred - predicted class labels

    Nothing is printed when no classes are found.
    """
    per_class = accuracy_score_for_classes(y_true, y_pred)
    # No classes -> no header and no rows
    if not per_class:
        return
    print('Метка \t Accuracy')
    for label, acc in per_class.items():
        print('{} \t {}'.format(label, acc))

In [8]:
# Load the review dataset from a CSV file located next to the notebook
df = pd.read_csv('imdb_sup.csv')
# Display the loaded frame (pandas truncates the rendering of long frames)
df

Unnamed: 0,Review,Rating,Sentiment
0,"Kurt Russell's chameleon-like performance, cou...",10,1
1,It was extremely low budget(it some scenes it ...,8,1
2,James Cagney is best known for his tough chara...,8,1
3,"Following the brilliant ""Goyôkiba"" (aka. ""Hanz...",8,1
4,One of the last classics of the French New Wav...,10,1
...,...,...,...
49995,(spoiler) it could be the one the worst movie ...,4,0
49996,"So, you've seen the Romero movies, yes? And yo...",1,0
49997,Just listen to the Broadway cast album and to ...,3,0
49998,I have been a fan of the Carpenters for a long...,3,0


In [32]:
# Working sample: rows 37000..37999 of the full dataset. reset_index() keeps
# the original row number in an 'index' column; 'Rating' is dropped so that
# only the review text and the Sentiment label remain.
text_df = (
    df.iloc[37000:38000]
    .reset_index()
    .drop(columns='Rating')
)
text_df

Unnamed: 0,index,Review,Sentiment
0,37000,Nicely done evil little comedy pitting the FBI...,1
1,37001,"Dead Man Walking, absolutely brilliant, in tea...",1
2,37002,"Alfred Hitchcock's remake of ""The Man Who Who ...",1
3,37003,"Exclusively for Coop's lovers, though Clint Ea...",1
4,37004,"Minor Spoilers<br /><br />In Chicago, Grace Be...",1
...,...,...,...
995,37995,"Firstly, I am a huge fan of crap films. B grad...",0
996,37996,"Alright, we start in the office of a shrink, a...",0
997,37997,"Decent but overrated dramatic thriller, film a...",0
998,37998,Old bat transforms to younger OK looking girl ...,0


In [33]:
# (rows, columns) of the working sample
text_df.shape

(1000, 3)

In [34]:
# Distinct values of the target column
text_df['Sentiment'].unique()

array([1, 0])

Целевой признак — столбец Sentiment, принимающий всего два значения: 1 — отзыв о фильме положительный, 0 — отзыв о фильме отрицательный.

In [39]:
# Plain Python list of review texts; used below to fit the vectorizers
vocab_list = text_df['Review'].tolist()

In [40]:
# Fit a bag-of-words vectorizer on the review texts (fit() returns the fitted
# vectorizer itself) and keep its token -> column-index mapping.
vocabVect = CountVectorizer().fit(vocab_list)
corpusVocab = vocabVect.vocabulary_
n_features = len(corpusVocab)
print('Количество сформированных признаков - {}'.format(n_features))

Количество сформированных признаков - 17754


In [41]:
# Show the first ten vocabulary entries as token=column_index
for token, column in list(corpusVocab.items())[:10]:
    print('{}={}'.format(token, column))

nicely=10722
done=4738
evil=5503
little=9293
comedy=3148
pitting=11790
the=15853
fbi=5846
against=543
organized=11129


# Векторизация признаков на основе CountVectorizer
Для каждого слова словаря подсчитывает, сколько раз оно встречается в данном тексте.

In [42]:
# Transform the same texts the vectorizer was fitted on into a count matrix.
# NOTE(review): despite the name, this is not a held-out test set.
test_features = vocabVect.transform(vocab_list)

In [43]:
# Summary of the sparse matrix (shape, dtype, number of stored elements)
test_features

<1000x17754 sparse matrix of type '<class 'numpy.int64'>'
	with 137483 stored elements in Compressed Sparse Row format>

In [44]:
# Dense view for illustration only: todense() materialises every cell of the
# sparse matrix in memory, zeros included.
test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [45]:
# Length of row 0 (= number of feature columns). Read it from the sparse
# matrix shape instead of densifying the whole matrix just to count columns.
test_features.shape[1]

17754

In [46]:
# Non-zero values of row 0. Only that single row is converted to a dense
# array, rather than the whole matrix.
[i for i in test_features[0].toarray().ravel() if i>0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 7,
 1,
 1,
 1,
 1,
 2]

# Векторизация признаков на основе TfidfVectorizer
Вычисляет для каждого слова (или n-граммы) вес TF-IDF, отражающий его специфичность для данного текста относительно всего корпуса текстов.

In [47]:
# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1,2))
tfidfv = TfidfVectorizer(ngram_range=(1,2))
# Fit on the review texts and transform them in one step
tfidf_ngram_features = tfidfv.fit_transform(vocab_list)
# Summary of the resulting sparse matrix
tfidf_ngram_features

<1000x136925 sparse matrix of type '<class 'numpy.float64'>'
	with 350007 stored elements in Compressed Sparse Row format>

In [48]:
# Dense view for illustration only.
# NOTE(review): a dense 1000x136925 float64 matrix needs roughly 1.1 GB of
# memory; consider displaying a small slice instead.
tfidf_ngram_features.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

# Оценка качества классификации

In [49]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    """
    Cross-validate every vectorizer/classifier combination and print the result.

    vectorizers_list - iterable of text vectorizers
    classifiers_list - iterable of classifiers

    Each pair is wrapped in a Pipeline and scored with 3-fold cross-validation
    (accuracy) on the module-level ``text_df``: 'Review' is the input text and
    'Sentiment' the target. The mean score is printed; nothing is returned.
    """
    for vectorizer in vectorizers_list:
        for classifier in classifiers_list:
            model = Pipeline(steps=[
                ("vectorizer", vectorizer),
                ("classifier", classifier),
            ])
            fold_scores = cross_val_score(
                model,
                text_df['Review'],
                text_df['Sentiment'],
                scoring='accuracy',
                cv=3,
            )
            # Emit the four report lines for this combination in one call
            report_lines = [
                'Векторизация - {}'.format(vectorizer),
                'Модель для классификации - {}'.format(classifier),
                'Accuracy = {}'.format(fold_scores.mean()),
                '===========================',
            ]
            print('\n'.join(report_lines))

In [50]:
# Both vectorizers are restricted to the vocabulary built earlier (corpusVocab).
# NOTE(review): that vocabulary was fitted on all reviews, including the rows
# that end up in the validation folds during cross-validation.
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab), TfidfVectorizer(vocabulary = corpusVocab)]
# Classifiers to compare
classifiers_list = [ComplementNB(), KNeighborsClassifier()]
# Cross-validate every vectorizer/classifier pair and print the scores
VectorizeAndClassify(vectorizers_list, classifiers_list)

Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '00000001': 2, '000dm': 3,
                            '007': 4, '00am': 5, '0148': 6, '04': 7, '05': 8,
                            '07': 9, '10': 10, '100': 11, '100s': 12,
                            '100th': 13, '101': 14, '101st': 15, '102': 16,
                            '103': 17, '105': 18, '106': 19, '10yo': 20,
                            '11': 21, '117': 22, '11706': 23, '11th': 24,
                            '12': 25, '123': 26, '125': 27, '12th': 28,
                            '13': 29, ...})
Модель для классификации - ComplementNB()
Accuracy = 0.7810025594456732
Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '00000001': 2, '000dm': 3,
                            '007': 4, '00am': 5, '0148': 6, '04': 7, '05': 8,
                            '07': 9, '10': 10, '100': 11, '100s': 12,
                            '100th': 13, '101': 14, '101st': 15, '102': 16,
                            '103': 17, 

Наибольшая точность (accuracy ≈ 0.781 при кросс-валидации на 3 фолдах) получилась при использовании CountVectorizer и ComplementNB().