In [3]:
import pandas as pd
import numpy as np
import pymorphy2
import nltk
import stop_words
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold, ParameterGrid
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from datetime import date
import fastnumbers
import re
from fastnumbers import isfloat, isint
import eli5
from eli5.lime import TextExplainer

In [4]:
data = pd.read_csv('../data/train_set.csv', usecols=range(1,11),  parse_dates=['timestamp', 'thread_timestamp'])

In [5]:
data.head()

Unnamed: 0,msg_id,user_id,channel,timestamp,text,main_msg,thread_id,thread_timestamp,replies_count,reactions
0,10648620600,U0KPCJWAC,_call_4_collaboration,2016-05-17 11:30:06,Всем привет! У нас с <@U04BFDYPV> есть идея сд...,1,10648620600,2016-05-17 11:30:06,0,"{'count': 6, 'name': '+1'}"
1,10648631200,U04CH4QBD,_call_4_collaboration,2016-05-17 11:31:52,а почему не писать статьи в <http://ods.ai|ods...,1,10648631200,2016-05-17 11:31:52,0,{}
2,10648645600,U065VP6F7,_call_4_collaboration,2016-05-17 11:34:16,n+1 слишком популярно и без формул?,1,10648645600,2016-05-17 11:34:16,0,{}
3,10648664700,U0KPCJWAC,_call_4_collaboration,2016-05-17 11:37:27,<http://opendatascience.ru|opendatascience.ru>...,1,10648664700,2016-05-17 11:37:27,0,{}
4,10648667500,U04BFDYPV,_call_4_collaboration,2016-05-17 11:37:55,<@U065VP6F7>: Принцип примерно такой: у тебя е...,1,10648667500,2016-05-17 11:37:55,0,{}


In [6]:
#select top 12 channels
data =data[data.channel.isin(['career', 'big_data', 'deep_learning', 'kaggle_crackers', 
           'lang_python',  'lang_r', 'nlp', 'theory_and_practice', 'welcome', 'bayesian', '_meetings', 'datasets']) 
           & data.main_msg]

In [7]:
# make labels for channels
mappings = {
    'career': 0,
    'theory_and_practice': 1,
    'deep_learning': 2,
    'lang_python': 3,
    '_meetings': 4,
    'kaggle_crackers': 5,
    'big_data': 6,
    'lang_r': 7,
    'nlp': 8,
    'welcome': 9,
    'datasets': 10,
    'bayesian': 11
}

In [8]:
# split on data and data val
date_before = date(2017, 4, 1)
train = data[data['timestamp'] <= date_before]
val = data[data['timestamp'] > date_before]

In [9]:
train_data = train[['channel', 'text']].reset_index()[['channel', 'text']]
train_data['channel'] = train_data.channel.map(mappings)
train_data = train_data.sort_values('channel').reset_index()[['channel', 'text']]

val_data = val[['channel', 'text']].reset_index()[['channel', 'text']]
val_data['channel'] = val_data.channel.map(mappings)
val_data = val_data.sort_values('channel').reset_index()[['channel', 'text']]

train_data.text = train_data.text.astype(str)\
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x)) \
    .apply(lambda x: re.sub('\s+', ' ', x))
train_data = train_data[~train_data.text.apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]

val_data.text = val_data.text.astype(str)\
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x)) \
    .apply(lambda x: re.sub('\s+', ' ', x))
val_data = val_data[~val_data.text.apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]

train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
train_labels =  np.asarray(train_data['channel'], dtype='int8')

val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
val_labels = np.asarray(val_data['channel'], dtype='int8')

In [14]:
# without tuning accuracy_score = 50.81%
# accuracy_score = 55.33%
classifier = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer = 'char', max_features = 1000000, 
                                                       ngram_range = (1, 7))),
    ('clf', OneVsRestClassifier(LogisticRegression(C=4),n_jobs=-1))])

In [15]:
classifier.fit(train_text, train_labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 7), norm='l2', preprocessor=None, smooth_idf...None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=-1))])

In [16]:
predicted = classifier.predict(val_text)

In [17]:
accuracy_score(predicted, val_labels)

0.5709297814560973

In [18]:
categories = ['career',  'theory_and_practice','deep_learning', 'lang_python', '_meetings', 'kaggle_crackers', 'big_data','lang_r',
               'nlp', 'welcome','datasets','bayesian']

In [19]:
te = TextExplainer(random_state=42)
te.fit('запустил спарк выбило такую ошибку', classifier.predict_proba)
te.show_prediction(target_names=list(categories))

Contribution?,Feature
-0.63,<BIAS>
-4.752,Highlighted in text (sum)

Contribution?,Feature
-0.617,<BIAS>
-0.986,Highlighted in text (sum)

Contribution?,Feature
-0.603,<BIAS>
-2.441,Highlighted in text (sum)

Contribution?,Feature
-0.651,<BIAS>
-3.155,Highlighted in text (sum)

Contribution?,Feature
-0.649,<BIAS>
-6.409,Highlighted in text (sum)

Contribution?,Feature
-0.68,<BIAS>
-2.122,Highlighted in text (sum)

Contribution?,Feature
0.876,Highlighted in text (sum)
-0.729,<BIAS>

Contribution?,Feature
-0.735,<BIAS>
-4.27,Highlighted in text (sum)

Contribution?,Feature
-0.696,<BIAS>
-5.947,Highlighted in text (sum)

Contribution?,Feature
-0.626,<BIAS>
-8.342,Highlighted in text (sum)

Contribution?,Feature
-0.649,<BIAS>
-8.002,Highlighted in text (sum)

Contribution?,Feature
-0.661,<BIAS>
-6.738,Highlighted in text (sum)


In [20]:
classifier.predict(['почему поламался хадуп'])

array([6], dtype=int8)

In [21]:
classifier.predict(val_text)

array([0, 0, 0, ..., 3, 0, 1], dtype=int8)

In [22]:
predicted_val = classifier.decision_function(val_text)

In [23]:
predicted_val.shape

(7733, 12)

In [24]:
softmax = np.exp(predicted_val) / np.sum(np.exp(predicted_val), axis=1)[:, np.newaxis]

In [25]:
np.save('linear_class.npy', softmax)

In [26]:
softmax

array([[  4.61351712e-01,   1.35447457e-01,   1.87563352e-02, ...,
          5.22498893e-03,   5.27965131e-03,   6.61297162e-03],
       [  9.91840480e-01,   1.16525312e-03,   2.09661825e-03, ...,
          1.11670515e-03,   8.22806136e-04,   1.31478626e-04],
       [  9.16945147e-01,   1.98950177e-02,   1.02881853e-02, ...,
          2.98547369e-03,   1.08831280e-03,   6.97736702e-04],
       ..., 
       [  1.03485587e-04,   1.41740453e-02,   4.42353693e-02, ...,
          1.54284763e-03,   8.04817340e-04,   4.57951055e-02],
       [  5.49057154e-01,   2.55937728e-02,   2.63092208e-01, ...,
          2.11408772e-02,   6.23410614e-03,   1.09447158e-02],
       [  4.61410701e-04,   9.06923746e-01,   9.49628126e-03, ...,
          2.46506913e-03,   3.63640817e-03,   2.80671894e-02]])

In [27]:
len(softmax)

7733

In [28]:
# sun in row = 1 ( prediction)
softmax

array([[  4.61351712e-01,   1.35447457e-01,   1.87563352e-02, ...,
          5.22498893e-03,   5.27965131e-03,   6.61297162e-03],
       [  9.91840480e-01,   1.16525312e-03,   2.09661825e-03, ...,
          1.11670515e-03,   8.22806136e-04,   1.31478626e-04],
       [  9.16945147e-01,   1.98950177e-02,   1.02881853e-02, ...,
          2.98547369e-03,   1.08831280e-03,   6.97736702e-04],
       ..., 
       [  1.03485587e-04,   1.41740453e-02,   4.42353693e-02, ...,
          1.54284763e-03,   8.04817340e-04,   4.57951055e-02],
       [  5.49057154e-01,   2.55937728e-02,   2.63092208e-01, ...,
          2.11408772e-02,   6.23410614e-03,   1.09447158e-02],
       [  4.61410701e-04,   9.06923746e-01,   9.49628126e-03, ...,
          2.46506913e-03,   3.63640817e-03,   2.80671894e-02]])

In [29]:
classifier.predict(['как у дата санqтиста зарплата в германии'])

array([0], dtype=int8)

In [30]:
classifier.predict(['какие лучшие подходы для сентимент анализа'])

array([8], dtype=int8)

In [31]:
classifier.predict(['почему поламался хадуп'])

array([6], dtype=int8)

In [32]:
classifier.predict(['кто-то принимает участет в соревоаниях'])

array([5], dtype=int8)

In [33]:
# matrix error
pd.crosstab(predicted, val_labels, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,10,11,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1606,98,155,52,17,496,29,13,36,10,8,7,2527
1,101,521,162,136,6,272,31,22,75,2,21,29,1378
2,90,86,641,80,3,231,20,12,60,2,16,13,1254
3,31,96,96,520,4,79,38,26,20,2,5,6,923
4,70,10,25,9,104,86,6,6,7,3,7,2,335
5,26,22,26,11,1,465,7,4,14,0,8,0,584
6,10,9,5,16,0,3,128,4,0,0,3,0,178
7,1,5,5,13,0,3,1,72,2,0,0,0,102
8,2,5,10,6,0,9,0,0,118,0,16,0,166
9,10,5,5,2,4,0,1,0,2,193,0,0,222
