In [2]:
import pandas as pd
import numpy as np
import json
import codecs
from nltk.corpus import stopwords
import preprocessing_tools as pr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score,KFold

## Два класса: positive, negative

## Три класса: positive, neutral, negative

In [3]:
train = pd.read_json('train.json',encoding = 'UTF-8')
# Заменяем позитивный маркер на 1, негативный на -1, нейтральный на 0
train['sentiment'] = train['sentiment'].replace(['positive' ,'neutral','negative'],[1,0,-1])
# задаем целевую переменную
target = train['sentiment']
test_data = pd.read_json('test.json',encoding = 'UTF-8')

In [3]:
print(len(train))
print(len(train[train.sentiment == 1]))
print(len(train[train.sentiment == -1]))
print(len(train[train.sentiment == 0]))

8263
2795
1434
4034


In [4]:
train.head()

Unnamed: 0,id,sentiment,text
0,1945,-1,Досудебное расследование по факту покупки ЕНПФ...
1,1957,-1,Медики рассказали о состоянии пострадавшего му...
2,1969,-1,"Прошел почти год, как железнодорожным оператор..."
3,1973,-1,По итогам 12 месяцев 2016 года на территории р...
4,1975,-1,Астана. 21 ноября. Kazakhstan Today - Агентств...


# Очищаем текст и лемматизируем


In [4]:
%reload_ext autoreload
train['text'] = train['text'].apply(pr.clean_text)
test_data['text'] = test_data['text'].apply(pr.clean_text)
train['lemmas'] = train['text'].apply(pr.lemmatization)
test_data['lemmas'] = test_data['text'].apply(pr.lemmatization)

In [6]:
train.head()

Unnamed: 0,id,sentiment,text,lemmas
0,1945,-1,досудебное расследование по факту покупки енпф...,"[досудебный, расследование, факт, покупка, енп..."
1,1957,-1,медики рассказали о состоянии пострадавшего му...,"[медик, состояние, пострадавший, мужчина, сове..."
2,1969,-1,прошел почти год как железнодорожным оператора...,"[железнодорожный, оператор, запретить, эксплуа..."
3,1973,-1,по итогам месяцев года на территории республ...,"[итог, месяц, территория, республика, выпустит..."
4,1975,-1,астана ноября kazakhstan today агентство рк ...,"[астан, kazakhstan, today, агентство, рк, госу..."


# Модель

In [7]:
train['lemmas'] = train['lemmas'].apply(str)
test_data['lemmas'] = test_data['text'].apply(str)

In [8]:
#tfidf_vectorizer = TfidfVectorizer(encoding='cp1251')
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2),encoding='cp1251')

# Logistic Regression

## С использованием лемм

## Без использования лемм

# SVM

## Без использования лемм 

### countvectorizer

In [9]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(train['text'])
X = ngram_vectorizer.transform(train['text'])

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
    
final_svm_ngram = LinearSVC(C=0.01)
final_svm_ngram.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_val, final_svm_ngram.predict(X_val)))
#print (metrics.classification_report(y_val,final_svm_ngram.predict(X_val)))
print (classification_report(y_val,final_svm_ngram.predict(X_val)))



Accuracy for C=0.01: 0.7032913843175218
Accuracy for C=0.05: 0.6882865440464666
Accuracy for C=0.25: 0.6694094869312681
Accuracy for C=0.5: 0.6684414327202324
Accuracy for C=1: 0.6631171345595354
Final Accuracy: 0.7032913843175218
              precision    recall  f1-score   support

          -1       0.71      0.56      0.63       358
           0       0.70      0.76      0.73       999
           1       0.71      0.69      0.70       709

   micro avg       0.70      0.70      0.70      2066
   macro avg       0.71      0.67      0.69      2066
weighted avg       0.70      0.70      0.70      2066

Wall time: 8min 8s


### TF-IDF

In [10]:
raw_train_and_test_text = train['text'].append(test_data['text'])


In [20]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2),encoding='cp1251',
                                  stop_words=stopwords.words('russian'))
tfidf_vectorizer.fit(train['text'])

X = tfidf_vectorizer.transform(train['text'])

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.66
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
    
final_svm_tfidf = LinearSVC(C=0.5)
final_svm_tfidf.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_val, final_svm_tfidf.predict(X_val)))
#print (metrics.classification_report(y_val,final_svm_tfidf.predict(X_val)))
print (classification_report(y_val,final_svm_tfidf.predict(X_val), digits = 4))

Accuracy for C=0.01: 0.5192170818505338
Accuracy for C=0.05: 0.6644128113879003
Accuracy for C=0.25: 0.7199288256227758
Accuracy for C=0.5: 0.7306049822064057
Accuracy for C=1: 0.7306049822064057
Final Accuracy: 0.7306049822064057
              precision    recall  f1-score   support

          -1     0.7470    0.5927    0.6609       518
           0     0.7123    0.7867    0.7476      1350
           1     0.7533    0.7261    0.7395       942

   micro avg     0.7306    0.7306    0.7306      2810
   macro avg     0.7375    0.7018    0.7160      2810
weighted avg     0.7324    0.7306    0.7289      2810

Wall time: 24.5 s


## с использованием лемм

### countvectorizer

In [12]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(train['lemmas'])
X = ngram_vectorizer.transform(train['lemmas'])

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.66
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
    
final_svm_ngram = LinearSVC(C=0.01)
final_svm_ngram.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_val, final_svm_ngram.predict(X_val)))
#print (metrics.classification_report(y_val,final_svm_ngram.predict(X_val)))
print (classification_report(y_val,final_svm_ngram.predict(X_val)))

Accuracy for C=0.01: 0.6921708185053381
Accuracy for C=0.05: 0.6846975088967971
Accuracy for C=0.25: 0.6701067615658363
Accuracy for C=0.5: 0.6669039145907474
Accuracy for C=1: 0.6647686832740214
Final Accuracy: 0.6921708185053381
              precision    recall  f1-score   support

          -1       0.71      0.57      0.63       484
           0       0.70      0.73      0.71      1376
           1       0.68      0.69      0.69       950

   micro avg       0.69      0.69      0.69      2810
   macro avg       0.69      0.67      0.68      2810
weighted avg       0.69      0.69      0.69      2810

Wall time: 4min 24s


### TF-IDF

In [13]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2),encoding='cp1251', 
                                   stop_words=stopwords.words('russian'))
#ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
tfidf_vectorizer.fit(train['lemmas'])
#ngram_vectorizer.fit(train['text'])
X = tfidf_vectorizer.transform(train['lemmas'])

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.66
)

for c in [0.01, 0.05, 0.25, 0.45, 0.5, 0.55, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
    
final_svm_tfidf = LinearSVC(C=0.5, multi_class = 'ovr')
final_svm_tfidf.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_val, final_svm_tfidf.predict(X_val)))
#print (metrics.classification_report(y_val,final_svm_tfidf.predict(X_val)))
print (classification_report(y_val,final_svm_tfidf.predict(X_val), digits = 4))

Accuracy for C=0.01: 0.5679715302491103
Accuracy for C=0.05: 0.6829181494661922
Accuracy for C=0.25: 0.7185053380782919
Accuracy for C=0.45: 0.7224199288256228
Accuracy for C=0.5: 0.7227758007117437
Accuracy for C=0.55: 0.7227758007117437
Accuracy for C=1: 0.7170818505338078
Final Accuracy: 0.7227758007117437
              precision    recall  f1-score   support

          -1     0.7835    0.6008    0.6801       506
           0     0.7288    0.7627    0.7454      1399
           1     0.6889    0.7293    0.7085       905

   micro avg     0.7228    0.7228    0.7228      2810
   macro avg     0.7338    0.6976    0.7113      2810
weighted avg     0.7258    0.7228    0.7218      2810

Wall time: 20.4 s


### Эксперименты с ядрами SVM

### TF-IDF

### CountVectorizer

## Смешанные подходы

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stopwords.words('russian'))
ngram_vectorizer.fit(train['text'])
X = ngram_vectorizer.transform(train['text'])


X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
    
final = LinearSVC(C=0.01)
final.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_val, final.predict(X_val)))
print (classification_report(y_val,final_svm_tfidf.predict(X_val)))

Accuracy for C=0.001: 0.7071636011616651
Accuracy for C=0.005: 0.7047434656340755
Accuracy for C=0.01: 0.702323330106486
Accuracy for C=0.05: 0.6960309777347532
Accuracy for C=0.1: 0.691190706679574
Final Accuracy: 0.702323330106486


NameError: name 'final_svm_tfidf' is not defined