In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from nltk.stem.snowball import SnowballStemmer

# Загрузка данных

In [2]:
folder = 'aclImdb'
labels = {'pos':1, 'neg':0}
cur_dir = os.path.dirname(os.path.realpath('__file__'))
folder_path = os.path.join(cur_dir, folder)

In [3]:
def create_dataset_file(folder_name):
    rev_list = []
    label_list = []
    for label in labels.keys():
        path_ls = os.path.join(folder_path, folder_name, label)
        path_list = os.listdir(path_ls)
        for i in range(0, len(path_list)):
            path = os.path.join(path_ls, path_list[i])
            f = open(path, 'r')
            rev_list.append(f.read())
            if (label == 'pos'):
                label_list.append(1)
            else:
                label_list.append(0)
    dataset_list = list(zip(label_list, rev_list))
    df = pd.DataFrame(dataset_list, columns = ["label", "review"])
    df = df.sample(frac=1).reset_index(drop=True)
    df.to_csv(f'data/imdb_{folder_name}.csv', index=False, encoding='utf-8')

In [4]:
create_dataset_file('train')
create_dataset_file('test')

In [5]:
df_train = pd.read_csv('data/imdb_train.csv', encoding='utf-8')

In [6]:
df_train.head(10)

Unnamed: 0,label,review
0,1,This was usually producer Alexander Korda's ad...
1,1,I love this movie/short thing. Jason Steele is...
2,0,How good is Gwyneth Paltrow! This is the right...
3,0,I was expecting a B-Movie French musical. Afte...
4,1,This film stands head and shoulders above the ...
5,0,I guess I was attracted to this film both beca...
6,0,"This is the worst movie I have ever seen, and ..."
7,1,Personal taste rules when it comes to talking ...
8,1,The Woman in Black (1989) is a TV adaptation o...
9,1,"It was life-changing, IT REALLY WAS!!!The Man ..."


In [7]:
df_train.tail(10)

Unnamed: 0,label,review
24990,1,This movie was a fairly entertaining comedy ab...
24991,1,"... It even beats the nasty ""raw"". Almost twen..."
24992,1,Very possibly one of the funniest movies in th...
24993,1,"""Dressed to Kill"" is Brian DePalma's best film..."
24994,1,I have seen the freebird movie and think its g...
24995,0,The teasers for Tree of Palme try to pass it o...
24996,0,I got all excited when I saw the ads for this ...
24997,1,The late 30s and early 40s were a golden age f...
24998,0,Be warned! <br /><br />This is crap that other...
24999,1,"This, like Murder She Wrote, is one of those s..."


In [8]:
df_test = pd.read_csv('data/imdb_test.csv', encoding='utf-8')

In [9]:
df_test.head(10)

Unnamed: 0,label,review
0,1,"From the start of ""The Edge Of Love"", the view..."
1,1,this is a TV movie based on the murder of Mart...
2,1,If this film doesn't at least be selected for ...
3,1,The DVD jacket in which this movie came descri...
4,1,(WARNING - CONTAINS MILD SPOILER) A movie almo...
5,0,"After some quite OK Dutch action flicks, like ..."
6,0,"Incredibly, ""Vampire Assassin"" is significantl..."
7,1,"""D.O.A"" is an involving and entertaining littl..."
8,0,I was REALLY disappointed with this movie. I h...
9,0,"OK, so this is horror? I get horror - but I do..."


In [10]:
df_test.tail(10)

Unnamed: 0,label,review
24990,1,This was one of my favorites as a child. My fa...
24991,0,I had been wanting to see An American Werewolf...
24992,1,A film that deserved theatrical release. This ...
24993,1,Fun movie! Great for the kids - they found it ...
24994,1,are you crazy or what? this movie has talent w...
24995,0,"To be fair, I didn't see a lot of this show. P..."
24996,1,The genius that is Stephen Sondheim was never ...
24997,0,I still wonder why I sat through this entire t...
24998,1,I had heard interesting critics on this movie....
24999,0,Rented this tonite from my local video store. ...


# Подготовка данных и функций

In [11]:
f = open('data/stopwords.txt', 'r')
stopwords_list = f.readlines()
for i in range(len(stopwords_list)):
    stopwords_list[i] = stopwords_list[i].replace('\n', '')
stopwords = set(stopwords_list)

In [12]:
stemmer = SnowballStemmer("english")
def preprocess(txt):
    txt = re.sub('<[^>]*>', '', txt)
    emots = re.findall(r'(?::|;|=) (?:-)?(?:\)|\(|D|P)', txt)
    txt = (re.sub(r'[\W]+', ' ', txt.lower()) + ' '.join(emots).replace('-', ''))

    txt = ' '.join([stemmer.stem(word) for word in txt.split()])

    txt = ' '.join([word for word in txt.split() if word not in stopwords])
    
    return txt

In [13]:
df_train['review'] = df_train['review'].apply(preprocess)
df_test['review'] = df_test['review'].apply(preprocess)

In [14]:
x_train = df_train['review']
y_train = df_train['label']
x_test = df_test['review']
y_test = df_test['label']

In [15]:
tfidf = TfidfVectorizer(
    strip_accents=None, 
    lowercase=False, 
    preprocessor=None, 
    ngram_range = (1, 1), 
    stop_words = None, 
    tokenizer = None
)

x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

In [16]:
filename_tfidf = 'models/tfidf_vectorizer.pkl'
pickle.dump(tfidf, open(filename_tfidf, 'wb')) 

# Logistic Regression

In [17]:
LogisticRegression().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [18]:
param_grid_lr = {
    'penalty': ['l2'],
    'C': [1.0, 10.0, 100.0],
}

In [19]:
lr= LogisticRegression(random_state=0)
lr_gs = GridSearchCV(estimator=lr, param_grid=param_grid_lr, cv=5, verbose=2, n_jobs=-1)
lr_gs.fit(x_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [20]:
print(f'Лучший набор параметров: {lr_gs.best_params_}')
print(f'Правильность при перекрестной проверке: {lr_gs.best_score_}')
lr_model = lr_gs.best_estimator_
print(f'Правильность на тесте: {lr_model.score(x_test, y_test)}')

Лучший набор параметров: {'C': 10.0, 'penalty': 'l2'}
Правильность при перекрестной проверке: 0.8866799999999999
Правильность на тесте: 0.8692


In [21]:
lr_model.fit(x_train, y_train)

In [22]:
cv_scores_lr = cross_val_score(lr_model, x_train, y_train)
print("Средняя точность кросс-валидации:", np.mean(cv_scores_lr))

Средняя точность кросс-валидации: 0.8866799999999999


In [23]:
predictions_lr = lr_model.predict(x_test)
accuracy_lr = accuracy_score(y_test, predictions_lr)
print("Точность на тестовом множестве:", accuracy_lr)

Точность на тестовом множестве: 0.8692


In [24]:
print(classification_report(y_test, predictions_lr))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87     12500
           1       0.88      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [25]:
filename_lr = 'models/lr_model.pkl'
pickle.dump(lr_model, open(filename_lr, 'wb')) 

In [26]:
loaded_lr_model = pickle.load(open(filename_lr, 'rb'))

In [27]:
review_text = 'Worst film ever!'
review_text = preprocess(review_text)
review_vector = tfidf.transform([review_text])
pred = loaded_lr_model.predict(review_vector)
prob = loaded_lr_model.predict_proba(review_vector)[0, 1]
print('Негативный обзор') if pred[0] == 0 else print('Позитивный обзор')
rating = round(prob * 9 + 1, 2)
print('Примерная оценка -', rating, 'из 10')

Негативный обзор
Примерная оценка - 1.0 из 10


# Random Forest Classifier

In [28]:
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [29]:
param_grid_rf = {
    'n_estimators': [300, 400],
    'min_samples_split': [4, 6],
    'criterion': ['gini', 'entropy', 'log_loss']
}

In [30]:
rf = RandomForestClassifier(random_state=0)
rf_gs = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, verbose=2, n_jobs=-1)
rf_gs.fit(x_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .................................C=10.0, penalty=l2; total time=   1.4s
[CV] END criterion=gini, min_samples_split=4, n_estimators=400; total time= 2.6min




[CV] END ..................................C=1.0, penalty=l2; total time=   1.5s
[CV] END criterion=gini, min_samples_split=4, n_estimators=400; total time= 2.6min
[CV] END ..................................C=1.0, penalty=l2; total time=   1.0s
[CV] END criterion=gini, min_samples_split=4, n_estimators=300; total time= 2.0min
[CV] END criterion=gini, min_samples_split=6, n_estimators=400; total time= 2.4min
[CV] END ................................C=100.0, penalty=l2; total time=   1.0s
[CV] END criterion=gini, min_samples_split=4, n_estimators=300; total time= 1.9min
[CV] END criterion=gini, min_samples_split=6, n_estimators=400; total time= 2.5min
[CV] END ..................................C=1.0, penalty=l2; total time=   1.2s
[CV] END criterion=gini, min_samples_split=4, n_estimators=300; total time= 1.9min
[CV] END criterion=gini, min_samples_split=6, n_estimators=400; total time= 2.5min
[CV] END .................................C=10.0, penalty=l2; total time=   1.3s
[CV] END crite

  _data = np.array(data, dtype=dtype, copy=copy,


In [34]:
print(f'Лучший набор параметров: {rf_gs.best_params_}')
print(f'Правильность при перекрестной проверке: {rf_gs.best_score_}')
rf_model = rf_gs.best_estimator_
print(f'Правильность на тесте: {rf_model.score(x_test, y_test)}')

Лучший набор параметров: {'criterion': 'entropy', 'min_samples_split': 4, 'n_estimators': 400}
Правильность при перекрестной проверке: 0.858
Правильность при испытании: 0.861


In [35]:
rf_model.fit(x_train, y_train)

In [37]:
cv_scores_rf = cross_val_score(rf_model, x_train, y_train)
print("Средняя точность кросс-валидации:", np.mean(cv_scores_rf))

Средняя точность кросс-валидации: 0.8581199999999999


In [38]:
predictions_rf = rf_model.predict(x_test)
accuracy_rf = accuracy_score(y_test, predictions_rf)
print("Точность на тестовом множестве:", accuracy_rf)

Точность на тестовом множестве: 0.86116


In [39]:
print(classification_report(y_test, predictions_rf))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86     12500
           1       0.86      0.87      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



In [40]:
filename_rf = 'models/rf_model.pkl'
pickle.dump(rf_model, open(filename_rf, 'wb')) 

# Linear Support Vector Classification

In [41]:
LinearSVC().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'])

In [42]:
param_grid_lsvc = {
    'C': [1.0, 10.0, 100.0],
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
}

In [43]:
lsvc = LinearSVC()
lsvc_gs = GridSearchCV(estimator=lsvc, param_grid=param_grid_lsvc, cv=5, verbose=2, n_jobs=-1)
lsvc_gs.fit(x_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/flayven/anaconda3/envs/mmcs/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/flayven/anaconda3/envs/mmcs/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/flayven/anaconda3/envs/mmcs/lib/python3.12/site-packages/sklearn/svm/_classes.py", line 317, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
            

In [44]:
print(f'Лучший набор параметров: {lsvc_gs.best_params_}')
print(f'Правильность при перекрестной проверке: {lsvc_gs.best_score_}')
lsvc_model = lsvc_gs.best_estimator_
print(f'Правильность на тесте: {lsvc_model.score(x_test, y_test)}')

Лучший набор параметров: {'C': 1.0, 'loss': 'hinge', 'penalty': 'l2'}
Правильность при перекрестной проверке: 0.888
Правильность при испытании: 0.874


In [45]:
lsvc_model.fit(x_train, y_train)

In [46]:
cv_scores_lsvc = cross_val_score(lsvc_model, x_train, y_train)
print("Средняя точность кросс-валидации:", np.mean(cv_scores_lsvc))

Средняя точность кросс-валидации: 0.8883199999999999


In [47]:
predictions_lsvc = lsvc_model.predict(x_test)
accuracy_lsvc = accuracy_score(y_test, predictions_lsvc)
print("Точность на тестовом множестве:", accuracy_lsvc)

Точность на тестовом множестве: 0.8736


In [48]:
print(classification_report(y_test, predictions_lsvc))

              precision    recall  f1-score   support

           0       0.87      0.88      0.87     12500
           1       0.88      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [49]:
filename_lsvc = 'models/lsvc_model.pkl'
pickle.dump(lsvc_model, open(filename_lsvc, 'wb')) 

# Multinomial Naive Bayes

In [50]:
MultinomialNB().get_params().keys()

dict_keys(['alpha', 'class_prior', 'fit_prior', 'force_alpha'])

In [51]:
mnb_model = MultinomialNB()
mnb_model.fit(x_train, y_train)

In [52]:
cv_scores_mnb = cross_val_score(mnb_model, x_train, y_train)
print("Средняя точность кросс-валидации:", np.mean(cv_scores_mnb))

Средняя точность кросс-валидации: 0.86208


In [53]:
predictions_mnb = mnb_model.predict(x_test)
accuracy_mnb = accuracy_score(y_test, predictions_mnb)
print("Точность на тестовом множестве:", accuracy_mnb)

Точность на тестовом множестве: 0.82128


In [54]:
print(classification_report(y_test, predictions_mnb))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83     12500
           1       0.86      0.77      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [55]:
filename_lr = 'models/mnb_model.pkl'
pickle.dump(lr_model, open(filename_lr, 'wb')) 

# Stochastic Gradient Descent Classifier

In [56]:
SGDClassifier().get_params().keys()

dict_keys(['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [57]:
param_grid_sgd = {
    'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.00001, 0.0001, 0.001, 0.01],
    'max_iter': [10000]
}

In [58]:
sgd = SGDClassifier()
sgd_gs = GridSearchCV(estimator=sgd, param_grid=param_grid_sgd, cv=5, verbose=2, n_jobs=-1)
sgd_gs.fit(x_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ......................C=1.0, loss=hinge, penalty=l2; total time=   0.7s
[CV] END .....................C=10.0, loss=hinge, penalty=l1; total time=   0.0s
[CV] END .....................C=10.0, loss=hinge, penalty=l2; total time=   2.4s
[CV] END .............C=10.0, loss=squared_hinge, penalty=l2; total time=   1.7s
[CV] END ....................C=100.0, loss=hinge, penalty=l2; total time=   2.9s
[CV] END ............C=100.0, loss=squared_hinge, penalty=l2; total time=   4.8s
[CV] END ............C=100.0, loss=squared_hinge, penalty=l2; total time=   4.9s
[CV] END alpha=1e-05, loss=hinge, max_iter=10000, penalty=l2; total time=   0.4s
[CV] END alpha=1e-05, loss=hinge, max_iter=10000, penalty=elasticnet; total time=   0.9s
[CV] END alpha=1e-05, loss=modified_huber, max_iter=10000, penalty=l2; total time=   0.3s
[CV] END alpha=1e-05, loss=modified_huber, max_iter=10000, penalty=elasticnet; total time=   0.5s
[CV] END alph



In [59]:
print(f'Лучший набор параметров: {sgd_gs.best_params_}')
print(f'Правильность при перекрестной проверке: {sgd_gs.best_score_}')
sgd_model = sgd_gs.best_estimator_
print(f'Правильность на тесте: {sgd_model.score(x_test, y_test)}')

Лучший набор параметров: {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 10000, 'penalty': 'l2'}
Правильность при перекрестной проверке: 0.889
Правильность при испытании: 0.880


In [60]:
sgd_model.fit(x_train, y_train)

In [61]:
cv_scores_sgd = cross_val_score(sgd_model, x_train, y_train)
print("Средняя точность кросс-валидации:", np.mean(cv_scores_sgd))

Средняя точность кросс-валидации: 0.8894400000000001


In [62]:
predictions_sgd = sgd_model.predict(x_test)
accuracy_sgd = accuracy_score(y_test, predictions_sgd)
print("Точность на тестовом множестве:", accuracy_sgd)

Точность на тестовом множестве: 0.88024


In [63]:
print(classification_report(y_test, predictions_sgd))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [65]:
filename_sgd = 'models/sgd_model.pkl'
pickle.dump(sgd_model, open(filename_sgd, 'wb')) 

[CV] END ......................C=1.0, loss=hinge, penalty=l2; total time=   0.6s
[CV] END ..............C=1.0, loss=squared_hinge, penalty=l2; total time=   0.5s
[CV] END .....................C=10.0, loss=hinge, penalty=l2; total time=   2.1s
[CV] END .............C=10.0, loss=squared_hinge, penalty=l2; total time=   1.8s
[CV] END ....................C=100.0, loss=hinge, penalty=l2; total time=   2.7s
[CV] END ............C=100.0, loss=squared_hinge, penalty=l1; total time=  14.7s
[CV] END alpha=1e-05, loss=hinge, max_iter=10000, penalty=elasticnet; total time=   0.9s
[CV] END alpha=1e-05, loss=log_loss, max_iter=10000, penalty=elasticnet; total time=   0.4s
[CV] END alpha=1e-05, loss=modified_huber, max_iter=10000, penalty=l2; total time=   0.2s
[CV] END alpha=1e-05, loss=modified_huber, max_iter=10000, penalty=elasticnet; total time=   0.5s
[CV] END alpha=1e-05, loss=squared_hinge, max_iter=10000, penalty=l1; total time= 3.6min
[CV] END ......................C=1.0, loss=hinge, penalt