In [1]:
import pandas as pd
import numpy as np
import json
import codecs
from nltk.corpus import stopwords
import preprocessing_tools as pr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score,KFold

## Данные

In [2]:
# Read the training and test sets from local JSON files.
train = pd.read_json('train.json', encoding='UTF-8')
test_data = pd.read_json('test.json', encoding='UTF-8')

# Encode the sentiment labels as integers: positive -> 1, neutral -> 0, negative -> -1.
train['sentiment'] = train['sentiment'].replace(
    {'positive': 1, 'neutral': 0, 'negative': -1}
)

# Target variable used by every model below.
target = train['sentiment']

In [3]:
# Total number of rows, then the row count per class in the order 1, -1, 0.
print(len(train))
for label in (1, -1, 0):
    print(len(train[train.sentiment == label]))

8263
2795
1434
4034


In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,sentiment,text
0,1945,-1,Досудебное расследование по факту покупки ЕНПФ...
1,1957,-1,Медики рассказали о состоянии пострадавшего му...
2,1969,-1,"Прошел почти год, как железнодорожным оператор..."
3,1973,-1,По итогам 12 месяцев 2016 года на территории р...
4,1975,-1,Астана. 21 ноября. Kazakhstan Today - Агентств...


# Предобработка (очищение и лемматизация)

In [4]:
%reload_ext autoreload
train['text'] = train['text'].apply(pr.clean_text)
test_data['text'] = test_data['text'].apply(pr.clean_text)
train['lemmas'] = train['text'].apply(pr.lemmatization)
test_data['lemmas'] = test_data['text'].apply(pr.lemmatization)

In [6]:
# Preview the frame again to inspect the cleaned text and the new 'lemmas' column.
train.head()

Unnamed: 0,id,sentiment,text,lemmas
0,1945,-1,досудебное расследование по факту покупки енпф...,"[досудебный, расследование, факт, покупка, енп..."
1,1957,-1,медики рассказали о состоянии пострадавшего му...,"[медик, состояние, пострадавший, мужчина, сове..."
2,1969,-1,прошел почти год как железнодорожным оператора...,"[железнодорожный, оператор, запретить, эксплуа..."
3,1973,-1,по итогам месяцев года на территории республ...,"[итог, месяц, территория, республика, выпустит..."
4,1975,-1,астана ноября kazakhstan today агентство рк ...,"[астан, kazakhstan, today, агентство, рк, госу..."


In [5]:
# Turn each row's lemmas into a single string so the text vectorizers
# used below receive string documents.
train['lemmas'] = train['lemmas'].apply(str)
# Fix: convert the test set's own 'lemmas' column. The original read from
# test_data['text'], which overwrote the lemmatized test data with the
# un-lemmatized cleaned text and made test features inconsistent with train.
test_data['lemmas'] = test_data['lemmas'].apply(str)

# Различные модели

## SVM grid search

In [102]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

# Two-step pipeline: TF-IDF features followed by a linear SVM.
text_clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Grid-search space; keys follow the "<step name>__<parameter>" convention.
tuned_parameters = dict(
    vectorizer__ngram_range=[(1, 2), (2, 2)],
    vectorizer__norm=('l1', 'l2'),
    vectorizer__sublinear_tf=(True, False),
    clf__loss=('hinge', 'squared_hinge'),
)

In [66]:
# List every parameter name the pipeline exposes (usable as grid-search keys).
text_clf.get_params().keys()

dict_keys(['memory', 'steps', 'vectorizer', 'clf', 'vectorizer__analyzer', 'vectorizer__binary', 'vectorizer__decode_error', 'vectorizer__dtype', 'vectorizer__encoding', 'vectorizer__input', 'vectorizer__lowercase', 'vectorizer__max_df', 'vectorizer__max_features', 'vectorizer__min_df', 'vectorizer__ngram_range', 'vectorizer__norm', 'vectorizer__preprocessor', 'vectorizer__smooth_idf', 'vectorizer__stop_words', 'vectorizer__strip_accents', 'vectorizer__sublinear_tf', 'vectorizer__token_pattern', 'vectorizer__tokenizer', 'vectorizer__use_idf', 'vectorizer__vocabulary', 'clf__C', 'clf__class_weight', 'clf__dual', 'clf__fit_intercept', 'clf__intercept_scaling', 'clf__loss', 'clf__max_iter', 'clf__multi_class', 'clf__penalty', 'clf__random_state', 'clf__tol', 'clf__verbose'])

In [103]:
# Hold out 33% of the lemmatized documents for validation; the seed is fixed
# so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train['lemmas'],
    target,
    test_size=0.33,
    random_state=42,
)

In [104]:
%%time

from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
score = 'f1_macro'
print("# Tuning hyper-parameters for %s" % score)
print()
np.errstate(divide='ignore')
cv = StratifiedShuffleSplit(n_splits = 10, test_size = 0.33, random_state = 0)
cv_kfold = StratifiedKFold(n_splits=10, random_state=0)
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring=score)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
for mean, std, params in zip(clf.cv_results_['mean_test_score'], 
                             clf.cv_results_['std_test_score'], 
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_val, clf.predict(X_val), digits=4))
print()

# Tuning hyper-parameters for f1_macro



  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'clf__loss': 'hinge', 'vectorizer__ngram_range': (1, 2), 'vectorizer__norm': 'l2', 'vectorizer__sublinear_tf': True}

Grid scores on development set:

0.472 (+/-0.074) for {'clf__loss': 'hinge', 'vectorizer__ngram_range': (1, 2), 'vectorizer__norm': 'l1', 'vectorizer__sublinear_tf': True}
0.468 (+/-0.060) for {'clf__loss': 'hinge', 'vectorizer__ngram_range': (1, 2), 'vectorizer__norm': 'l1', 'vectorizer__sublinear_tf': False}
0.698 (+/-0.055) for {'clf__loss': 'hinge', 'vectorizer__ngram_range': (1, 2), 'vectorizer__norm': 'l2', 'vectorizer__sublinear_tf': True}
0.693 (+/-0.048) for {'clf__loss': 'hinge', 'vectorizer__ngram_range': (1, 2), 'vectorizer__norm': 'l2', 'vectorizer__sublinear_tf': False}
0.436 (+/-0.078) for {'clf__loss': 'hinge', 'vectorizer__ngram_range': (2, 2), 'vectorizer__norm': 'l1', 'vectorizer__sublinear_tf': True}
0.441 (+/-0.070) for {'clf__loss': 'hinge', 'vectorizer__ngram_range': (2, 2), 'vectorizer__norm': 'l1',

In [6]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=6, norm='l2', ngram_range=(1, 2),encoding='cp1251', 
                                   stop_words=stopwords.words('russian'))

tfidf_vectorizer.fit(train['lemmas'])
X = tfidf_vectorizer.transform(train['lemmas'])

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.66
)

for c in [ 0.45, 0.5, 0.55, 1, 1.05, 1.1,1.15]:
    
    svm = LinearSVC(C=c, multi_class = 'ovr', loss = 'hinge')
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
    
final_svm_tfidf = LinearSVC(C=1, multi_class = 'ovr', loss = 'hinge')
final_svm_tfidf.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_val, final_svm_tfidf.predict(X_val)))
print (classification_report(y_val,final_svm_tfidf.predict(X_val), digits = 4))



Accuracy for C=0.45: 0.7288256227758008
Accuracy for C=0.5: 0.7295373665480427
Accuracy for C=0.55: 0.7327402135231317
Accuracy for C=1: 0.7355871886120996
Accuracy for C=1.05: 0.7362989323843416
Accuracy for C=1.1: 0.7366548042704626
Accuracy for C=1.15: 0.7380782918149467
Final Accuracy: 0.7355871886120996
              precision    recall  f1-score   support

          -1     0.7531    0.6305    0.6864       479
           0     0.7267    0.7801    0.7525      1360
           1     0.7418    0.7250    0.7333       971

   micro avg     0.7356    0.7356    0.7356      2810
   macro avg     0.7406    0.7119    0.7241      2810
weighted avg     0.7364    0.7356    0.7346      2810

Wall time: 1min 5s


## NB GridSearch

In [94]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

# Two-step pipeline: TF-IDF features followed by multinomial Naive Bayes.
text_clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Grid-search space; keys follow the "<step name>__<parameter>" convention.
# vectorizer__sublinear_tf and vectorizer__norm are not searched here and
# therefore stay at the vectorizer's defaults.
tuned_parameters = dict(
    vectorizer__ngram_range=[(1, 2), (2, 2)],
    vectorizer__min_df=(4, 5, 6),
    clf__alpha=[1, 1e-1, 1e-2],
)

In [95]:
# Hold out 33% of the lemmatized documents for validation; the seed is fixed
# so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train['lemmas'],
    target,
    test_size=0.33,
    random_state=42,
)

In [97]:
%%time
from sklearn.metrics import classification_report

score = 'f1_macro'
print("# Tuning hyper-parameters for %s" % score)
print()
np.errstate(divide='ignore')
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring=score)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
for mean, std, params in zip(clf.cv_results_['mean_test_score'], 
                             clf.cv_results_['std_test_score'], 
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_val, clf.predict(X_val), digits=4))
print()

# Tuning hyper-parameters for f1_macro

Best parameters set found on development set:

{'clf__alpha': 0.01, 'vectorizer__min_df': 5, 'vectorizer__ngram_range': (1, 2)}

Grid scores on development set:

0.518 (+/-0.034) for {'clf__alpha': 1, 'vectorizer__min_df': 4, 'vectorizer__ngram_range': (1, 2)}
0.610 (+/-0.043) for {'clf__alpha': 1, 'vectorizer__min_df': 4, 'vectorizer__ngram_range': (2, 2)}
0.546 (+/-0.036) for {'clf__alpha': 1, 'vectorizer__min_df': 5, 'vectorizer__ngram_range': (1, 2)}
0.616 (+/-0.040) for {'clf__alpha': 1, 'vectorizer__min_df': 5, 'vectorizer__ngram_range': (2, 2)}
0.561 (+/-0.039) for {'clf__alpha': 1, 'vectorizer__min_df': 6, 'vectorizer__ngram_range': (1, 2)}
0.621 (+/-0.040) for {'clf__alpha': 1, 'vectorizer__min_df': 6, 'vectorizer__ngram_range': (2, 2)}
0.677 (+/-0.046) for {'clf__alpha': 0.1, 'vectorizer__min_df': 4, 'vectorizer__ngram_range': (1, 2)}
0.675 (+/-0.046) for {'clf__alpha': 0.1, 'vectorizer__min_df': 4, 'vectorizer__ngram_range': (2, 2)}
0.

In [110]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=6, norm='l2', ngram_range=(1, 2),encoding='cp1251', 
                                   stop_words=stopwords.words('russian'))

tfidf_vectorizer.fit(train['lemmas'])
X = tfidf_vectorizer.transform(train['lemmas'])

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.66
)

for alpha in [0.0001, 0.001, 0.01, 0.05, 0.25, 0.45, 0.5, 0.55, 1, 1.05, 1.1]:
    
    NB = OneVsRestClassifier(MultinomialNB(alpha = alpha, fit_prior=True, class_prior=None))
    NB.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (alpha, accuracy_score(y_val, NB.predict(X_val))))
    
    
final_NB_tfidf = OneVsRestClassifier(MultinomialNB(alpha=0.001, fit_prior=True, class_prior=None))
final_NB_tfidf.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_val, final_NB_tfidf.predict(X_val)))
print (classification_report(y_val,final_NB_tfidf.predict(X_val), digits = 5))



Accuracy for C=0.0001: 0.7128113879003559
Accuracy for C=0.001: 0.7160142348754448
Accuracy for C=0.01: 0.7120996441281139
Accuracy for C=0.05: 0.706049822064057
Accuracy for C=0.25: 0.7067615658362989
Accuracy for C=0.45: 0.695373665480427
Accuracy for C=0.5: 0.695017793594306
Accuracy for C=0.55: 0.69288256227758
Accuracy for C=1: 0.6633451957295373
Accuracy for C=1.05: 0.6587188612099644
Accuracy for C=1.1: 0.6551601423487544
Final Accuracy: 0.7160142348754448
              precision    recall  f1-score   support

          -1    0.73171   0.60852   0.66445       493
           0    0.71725   0.72530   0.72125      1336
           1    0.70829   0.75739   0.73202       981

   micro avg    0.71601   0.71601   0.71601      2810
   macro avg    0.71908   0.69707   0.70591      2810
weighted avg    0.71666   0.71601   0.71505      2810

Wall time: 17.7 s
