In [11]:
import pandas as pd
import numpy as np
import pymorphy2
import nltk
import stop_words
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold, ParameterGrid
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from datetime import date
import fastnumbers
import re
from fastnumbers import isfloat, isint

In [12]:
data = pd.read_csv('../ucu_sentiment-master/data/train_set.csv', usecols=range(1,11),  parse_dates=['timestamp', 'thread_timestamp'])

In [84]:
data.head

<bound method NDFrame.head of              msg_id    user_id    channel           timestamp  \
15251    7002610800  U040HKJE7  _meetings 2015-03-22 11:41:48   
15252    7002629100  U040HKJE7  _meetings 2015-03-22 11:44:51   
15253    7002632300  U040HKJE7  _meetings 2015-03-22 11:45:23   
15254    7002650700  U040HKJE7  _meetings 2015-03-22 11:48:27   
15255    7002654400  U040HKJE7  _meetings 2015-03-22 11:49:04   
15256    7002718700  U041T0UHM  _meetings 2015-03-22 11:59:47   
15257    7002939400  U041LH06L  _meetings 2015-03-22 12:36:34   
15258    7003686500  U040M0W0S  _meetings 2015-03-22 14:41:05   
15259    7003768000  U041LH06L  _meetings 2015-03-22 14:54:40   
15260    7003772600  U041LH06L  _meetings 2015-03-22 14:55:26   
15261    7003800800  U040HKJE7  _meetings 2015-03-22 15:00:08   
15262    7004398700  U041P485A  _meetings 2015-03-22 16:39:47   
15263    7004402500  U040M0W0S  _meetings 2015-03-22 16:40:25   
15264    7004414300  U040HKJE7  _meetings 2015-03-22 16:42:2

In [13]:
#select top 12 channels
data =data[data.channel.isin(['career', 'big_data', 'deep_learning', 'kaggle_crackers', 
           'lang_python',  'lang_r', 'nlp', 'theory_and_practice', 'welcome', 'bayesian', '_meetings', 'datasets']) 
           & data.main_msg]

In [14]:
# make labels for channels
mappings = {
    'career': 0,
    'theory_and_practice': 1,
    'deep_learning': 2,
    'lang_python': 3,
    '_meetings': 4,
    'kaggle_crackers': 5,
    'big_data': 6,
    'lang_r': 7,
    'nlp': 8,
    'welcome': 9,
    'datasets': 10,
    'bayesian': 11
}

In [15]:
# split on data and data val
date_before = date(2017, 4, 1)
train = data[data['timestamp'] <= date_before]
val = data[data['timestamp'] > date_before]

In [29]:
train_data = train[['channel', 'text']].reset_index()[['channel', 'text']]
train_data['channel'] = train_data.channel.map(mappings)
train_data = train_data.sort_values('channel').reset_index()[['channel', 'text']]

val_data = val[['channel', 'text']].reset_index()[['channel', 'text']]
val_data['channel'] = val_data.channel.map(mappings)
val_data = val_data.sort_values('channel').reset_index()[['channel', 'text']]

train_data.text = train_data.text.astype(str)\
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x)) \
    .apply(lambda x: re.sub('\s+', ' ', x))
train_data = train_data[~train_data.text.apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]

val_data.text = val_data.text.astype(str)\
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x)) \
    .apply(lambda x: re.sub('\s+', ' ', x))
val_data = val_data[~val_data.text.apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]

train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
train_labels =  np.asarray(train_data['channel'], dtype='int8')

val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
val_labels = np.asarray(val_data['channel'], dtype='int8')

In [32]:
# library for lemmatization and stop_words
morph = pymorphy2.MorphAnalyzer()

In [33]:
# exclude spec symbols
train_text = train_text\
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x))\
    .apply(lambda x: re.sub('\s+', ' ', x))
    
val_text = val_text\
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x))\
    .apply(lambda x: re.sub('\s+', ' ', x))

In [34]:
# without tuning accuracy_score = 50.81%
# accuracy_score = 55.33%
classifier = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer = 'char', max_features = 1000000, 
                                                       ngram_range = (1, 4))),
    ('clf', OneVsRestClassifier(LogisticRegression(),n_jobs=-1))])

In [35]:
classifier.fit(train_text, train_labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf...None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=-1))])

In [37]:
predicted = classifier.predict(val_text)

In [38]:
accuracy_score(predicted, val_labels)

0.55334281650071127

In [78]:
categories = ['career',  'theory_and_practice','deep_learning', 'lang_python', '_meetings', 'kaggle_crackers', 'big_data','lang_r',
               'nlp', 'welcome','datasets','bayesian']

In [86]:
te = TextExplainer(random_state=42)
te.fit('запустил спарк выбило такую ошибку', classifier.predict_proba)
te.show_prediction(target_names=list(categories))

Contribution?,Feature
-0.61,<BIAS>
-3.857,Highlighted in text (sum)

Contribution?,Feature
-0.661,<BIAS>
-1.266,Highlighted in text (sum)

Contribution?,Feature
-0.674,<BIAS>
-1.551,Highlighted in text (sum)

Contribution?,Feature
-0.688,<BIAS>
-2.136,Highlighted in text (sum)

Contribution?,Feature
-0.656,<BIAS>
-4.707,Highlighted in text (sum)

Contribution?,Feature
-0.737,<BIAS>
-1.893,Highlighted in text (sum)

Contribution?,Feature
0.41,Highlighted in text (sum)
-0.766,<BIAS>

Contribution?,Feature
-0.791,<BIAS>
-2.937,Highlighted in text (sum)

Contribution?,Feature
-0.682,<BIAS>
-5.72,Highlighted in text (sum)

Contribution?,Feature
-0.653,<BIAS>
-7.837,Highlighted in text (sum)

Contribution?,Feature
-0.698,<BIAS>
-6.355,Highlighted in text (sum)

Contribution?,Feature
-0.661,<BIAS>
-6.746,Highlighted in text (sum)


In [64]:
classifier.predict(['почему поламался хадуп'])

array([6], dtype=int8)

In [39]:
classifier.predict(val_text)

array([0, 0, 0, ..., 3, 0, 1], dtype=int8)

In [42]:
predicted_val = classifier.decision_function(val_text)

In [53]:
predicted_val.shape

(7733, 12)

In [43]:
softmax = np.exp(predicted_val) / np.sum(np.exp(predicted_val), axis=1)[:, np.newaxis]

In [55]:
np.save('linear_class.npy', softmax)

In [54]:
softmax

array([[ 0.47408063,  0.08195074,  0.02679255, ...,  0.00900392,
         0.00691329,  0.00689136],
       [ 0.90294685,  0.02291404,  0.01832445, ...,  0.00795017,
         0.00506998,  0.00122345],
       [ 0.73641678,  0.06140345,  0.04213868, ...,  0.00574367,
         0.00385626,  0.00282096],
       ..., 
       [ 0.00112408,  0.03524314,  0.04878854, ...,  0.00334064,
         0.00206787,  0.01994189],
       [ 0.4080199 ,  0.03013902,  0.2927247 , ...,  0.0352853 ,
         0.00703018,  0.00693173],
       [ 0.00203772,  0.86526257,  0.02555515, ...,  0.0059855 ,
         0.00435519,  0.02160922]])

In [44]:
len(softmax)

7733

In [46]:
# sun in row = 1 ( prediction)
softmax

array([[ 0.47408063,  0.08195074,  0.02679255, ...,  0.00900392,
         0.00691329,  0.00689136],
       [ 0.90294685,  0.02291404,  0.01832445, ...,  0.00795017,
         0.00506998,  0.00122345],
       [ 0.73641678,  0.06140345,  0.04213868, ...,  0.00574367,
         0.00385626,  0.00282096],
       ..., 
       [ 0.00112408,  0.03524314,  0.04878854, ...,  0.00334064,
         0.00206787,  0.01994189],
       [ 0.4080199 ,  0.03013902,  0.2927247 , ...,  0.0352853 ,
         0.00703018,  0.00693173],
       [ 0.00203772,  0.86526257,  0.02555515, ...,  0.0059855 ,
         0.00435519,  0.02160922]])

In [71]:
classifier.predict(['как у дата санqтиста зарплата в германии'])

array([0], dtype=int8)

In [48]:
classifier.predict(['какие лучшие подходы для сентимент анализа'])

array([1], dtype=int8)

In [62]:
classifier.predict(['почему поламался хадуп'])

array([6], dtype=int8)

In [50]:
classifier.predict(['кто-то принимает участет в соревоаниях'])

array([5], dtype=int8)

In [51]:
# matrix error
pd.crosstab(predicted, val_labels, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,10,11,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1661,113,190,63,26,596,37,17,53,12,22,11,2801
1,89,515,167,149,6,269,45,28,85,2,29,28,1412
2,67,80,620,75,6,245,23,13,58,4,19,13,1223
3,28,105,100,524,3,73,35,31,24,1,6,3,933
4,61,12,24,9,95,71,5,6,3,3,6,2,297
5,19,18,16,9,0,379,1,4,8,0,5,1,460
6,8,4,0,5,0,6,113,4,0,0,0,1,141
7,1,3,4,6,0,3,1,54,1,0,0,0,73
8,3,4,5,5,0,2,1,1,98,0,13,0,132
9,10,5,5,0,3,1,0,1,4,190,0,0,219
