In [116]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import  KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import random

In [None]:
df = pd.read_csv('IMDB Dataset.csv')
print(df.keys())

В силу того, что датасет очень большой, оставим в нем всего 2000 наблюдений, чтобы быстрее увидеть результат.

In [126]:
sample_index = random.sample(population=range(len(df)),k=2000)

In [129]:
drop_index = filter(lambda i: i not in sample_index, range(len(df)))
new_df = df.drop(drop_index)

In [135]:
Reviews = new_df['review']
y = new_df['sentiment'].map(lambda sent: int(sent=='positive'))

In [None]:
#import nltk
#nltk.download()

In [None]:
#from nltk.corpus import stopwords
#sw_list = list(stopwords.words('english'))

In [162]:
import string

def clean(text):
    text = text.replace('<', ' ')
    text = text.replace('.', ' ')
    clean_text = ''
    for word in text.lower().split():
        if word != 'br':
            clean_text+= ' ' + word
    clean_text = clean_text.translate(str.maketrans('', '', string.punctuation))
    return clean_text

clean_Reviews = Reviews.map(clean)
clean_Reviews[sample_index[0]]

' tremendous fun both as a film and as an excuse to sit back and play the oh thats whassis name game every star of the golden age of english films seems to be in this one and it was a joy to see them and the greatest of them all richard wattis was as tremendous as ever  there is actually a plot that trundles along very nicely theres also some splendid jokes and comedic moments but the key to this films triumph is the characters within it alastair sim is magnificent and somehow convinces you that a six foot bigboned scotsman could be the headmistress of a girls boarding school george cole beryl reid and irene handl all have their moments but with alastair and richard the star of the show is joyce grenfell she is an absolute oneoff and brings a smile whenever shes on the screen her rollingwalk and plum accent done to perfection  and for those playing the  thats whassis name game you can even spot arthur mullard barbara windsor and ronald searle if you look carefully'

In [163]:
print(Reviews[sample_index[0]])

Tremendous fun both as a film and as an excuse to sit back and play the 'oh, that's whassis name' game. Every star of the golden age of English films seems to be in this one and it was a joy to see them. And the greatest of them all, Richard Wattis, was as tremendous as ever. <br /><br />There is actually a plot that trundles along very nicely, there's also some splendid jokes and comedic moments, but the key to this films triumph is the characters within it. Alastair Sim is magnificent and somehow convinces you that a six foot, big-boned Scotsman could be the headmistress of a girl's boarding school. George Cole, Beryl Reid and Irene Handl all have their moments but, with Alastair and Richard, the star of the show is Joyce Grenfell. She is an absolute one-off and brings a smile whenever she's on the screen...her rolling-walk and plum accent done to perfection. <br /><br />And for those playing the , that's whassis name' game, you can even spot Arthur Mullard, Barbara Windsor and Ronal

In [139]:
count_vec = CountVectorizer()
count_Reviews = count_vec.fit_transform(clean_Reviews)

In [140]:
tfidf_vec = TfidfVectorizer()
tfidf_Reviews = tfidf_vec.fit_transform(clean_Reviews)

In [151]:
random_state = 241
grid = {'C': np.power(10.0, np.arange(-5, 5))}
cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
clf = SVC(kernel='linear', random_state=random_state)
gs = sklearn.model_selection.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(tfidf_Reviews, y)


GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=241, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [152]:
C = gs.best_params_['C']
print(gs.best_params_)

{'C': 1.0}


In [153]:
svm = SVC(C=C, kernel='linear', random_state=random_state)
svm.fit(tfidf_Reviews,y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=241, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
def nmax(lst, n):
    res = []
    count = 0
    while count < n:
        current_max = 0
        indmax = 0
        for k in range(len(lst)):
            if lst[k] > current_max:
                current_max = lst[k]
                indmax = k
        res.append(indmax)
        lst = np.delete(lst,indmax)
        count+=1
    return res

Выведем первые 20 положительных и 20 отрицательных ключевых слов

In [182]:
coefs = svm.coef_.toarray()
max_20 = nmax(coefs[0],20)
min_20 = nmax(-coefs[0],20)
feature_names = tfidf_vec.get_feature_names()
print('Positive key-words:',sorted([feature_names[i] for i in max_20]))
print('Negative key-words:',sorted([feature_names[i] for i in min_20]))

Positive key-words: ['always', 'ancientextinct', 'beaucoup', 'best', 'bishops', 'comedic', 'esoterically', 'excellent', 'fauxmetal', 'great', 'humiliated', 'joan', 'lifesized', 'likability', 'lovable', 'lovecraftand', 'perception', 'showas', 'tmo', 'yorkshire']
Negative key-words: ['annoying', 'awful', 'bad', 'bores', 'edinburgh', 'eve', 'interceptors', 'lookalike', 'nivola', 'notes', 'oneword', 'orgdetailsthiefinthenight', 'pooljacuzzi', 'stunned', 'termination', 'washingtons', 'waybackwhen', 'wholesale', 'worst', 'worstcase']
