In [124]:
import pandas as pd
import numpy as np
import pymorphy2
import nltk
import stop_words
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold, ParameterGrid
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [81]:
data = pd.read_csv('../ucu_sentiment-master/data/train_set.csv', usecols=range(1,11),  parse_dates=['timestamp', 'thread_timestamp'])

In [82]:
data.head().T

Unnamed: 0,0,1,2,3,4
msg_id,10648620600,10648631200,10648645600,10648664700,10648667500
user_id,U0KPCJWAC,U04CH4QBD,U065VP6F7,U0KPCJWAC,U04BFDYPV
channel,_call_4_collaboration,_call_4_collaboration,_call_4_collaboration,_call_4_collaboration,_call_4_collaboration
timestamp,2016-05-17 11:30:06,2016-05-17 11:31:52,2016-05-17 11:34:16,2016-05-17 11:37:27,2016-05-17 11:37:55
text,Всем привет! У нас с <@U04BFDYPV> есть идея сд...,а почему не писать статьи в <http://ods.ai|ods...,n+1 слишком популярно и без формул?,<http://opendatascience.ru|opendatascience.ru>...,<@U065VP6F7>: Принцип примерно такой: у тебя е...
main_msg,1,1,1,1,1
thread_id,10648620600,10648631200,10648645600,10648664700,10648667500
thread_timestamp,2016-05-17 11:30:06,2016-05-17 11:31:52,2016-05-17 11:34:16,2016-05-17 11:37:27,2016-05-17 11:37:55
replies_count,0,0,0,0,0
reactions,"{'count': 6, 'name': '+1'}",{},{},{},{}


In [83]:
data =data[data.channel.isin(['career', 'big_data', 'deep_learning', 'kaggle_crackers', 
           'lang_python',  'lang_r', 'nlp', 'theory_and_practice', 'welcome', 'bayesian', '_meetings', 'datasets']) 
           & data.main_msg]

In [84]:
data.head()

Unnamed: 0,msg_id,user_id,channel,timestamp,text,main_msg,thread_id,thread_timestamp,replies_count,reactions
15251,7002610800,U040HKJE7,_meetings,2015-03-22 11:41:48,я немного заработался на этой неделе и забыл д...,1,7002610800,2015-03-22 11:41:48,0,{}
15252,7002629100,U040HKJE7,_meetings,2015-03-22 11:44:51,"теперь тизер, который я бы хотел обсудить. и п...",1,7002629100,2015-03-22 11:44:51,0,{}
15253,7002632300,U040HKJE7,_meetings,2015-03-22 11:45:23,"оно кому-нибудь надо, интересно? есть деи, мыс...",1,7002632300,2015-03-22 11:45:23,0,{}
15254,7002650700,U040HKJE7,_meetings,2015-03-22 11:48:27,за мной кстати должок перед когнитивистами. я ...,1,7002650700,2015-03-22 11:48:27,0,{}
15255,7002654400,U040HKJE7,_meetings,2015-03-22 11:49:04,...я просто могу выложить пару статей и 3 экра...,1,7002654400,2015-03-22 11:49:04,0,{}


In [85]:
mappings = {
    'career': 0,
    'theory_and_practice': 1,
    'deep_learning': 2,
    'lang_python': 3,
    '_meetings': 4,
    'kaggle_crackers': 5,
    'big_data': 6,
    'lang_r': 7,
    'nlp': 8,
    'welcome': 9,
    'datasets': 10,
    'bayesian': 11
}

In [86]:
data['channel'] = data.channel.map(mappings)
data = data.sort_values('channel').reset_index()


In [87]:
data

Unnamed: 0,index,msg_id,user_id,channel,timestamp,text,main_msg,thread_id,thread_timestamp,replies_count,reactions
0,212514,12950696802,U3HM4KY14,0,2017-02-07 22:09:28,"а если ещё сделать TDD для саентологов, то нас...",1,12950696802,2017-02-07 22:09:28,0,{}
1,199780,12591043100,U14BPHDK6,0,2016-12-28 07:07:11,срочно в <#C04DA5FUF|_jobs>! :joy:,1,12591043100,2016-12-28 07:07:11,0,{}
2,199781,12591050400,U09JEC7V0,0,2016-12-28 07:08:24,под требования канала подходит - надо постить,1,12591050400,2016-12-28 07:08:24,0,{}
3,199782,12591050700,U14BPHDK6,0,2016-12-28 07:08:27,"а, так это парт-тайм и научный сотрудник :good...",1,12591050700,2016-12-28 07:08:27,0,{}
4,199783,12591051100,U1CF22N7J,0,2016-12-28 07:08:31,"Мне кажется это лишь формально называется ""кон...",1,12591051100,2016-12-28 07:08:31,0,"{'count': 2, 'name': 'coincidence'}"
5,199784,12591056000,U0J1U64FK,0,2016-12-28 07:09:20,откуда получается число 5470?,1,12591056000,2016-12-28 07:09:20,0,"{'count': 1, 'name': 'bitcoin'}"
6,199785,12591056800,U1CF22N7J,0,2016-12-28 07:09:28,Выбор из пустого множества претендентов,1,12591056800,2016-12-28 07:09:28,0,{}
7,199786,12591059200,U1CF22N7J,0,2016-12-28 07:09:52,"Откуда? Просто зп такая, обязательно откуда-то...",1,12591059200,2016-12-28 07:09:52,0,{}
8,199787,12591059700,U14BPHDK6,0,2016-12-28 07:09:57,а что такое свидетельства программ эвм и как и...,1,12591059700,2016-12-28 07:09:57,0,{}
9,199779,12591039100,U19HATM25,0,2016-12-28 07:06:31,<http://icm.krasn.ru/page.php?page=news_2016_1...,1,12591039100,2016-12-28 07:06:31,0,"{'count': 1, 'name': 'norma'}"


In [88]:
data.text.isnull().sum()

56

In [89]:
data[data.isnull().any(axis=1)]

Unnamed: 0,index,msg_id,user_id,channel,timestamp,text,main_msg,thread_id,thread_timestamp,replies_count,reactions
3807,203379,12786900400,U14BPHDK6,0,2017-01-19 23:10:04,,1,12786900400,2017-01-19 23:10:04,0,{}
8915,197051,12448604000,U14BPHDK6,0,2016-12-11 19:27:20,,1,12448604000,2016-12-11 19:27:20,0,{}
12874,224165,13249950110,U041SH27M,0,2017-03-14 13:25:01,,1,13249950110,2017-03-14 13:25:01,0,{}
13421,215586,13078688503,U1NLYUY5N,0,2017-02-22 17:41:25,,1,13078688503,2017-02-22 17:41:25,1,{}
13423,215588,13078691903,U1NLYUY5N,0,2017-02-22 17:41:59,,1,13078691903,2017-02-22 17:41:59,0,{}
15830,239438,14034833584,U040HKJE7,0,2017-06-13 09:38:55,,1,14034833584,2017-06-13 09:38:55,26,"{'count': 1, 'name': '+1'}"
16787,231026,13534290048,U1CF22N7J,0,2017-04-16 11:15:00,,1,13534290048,2017-04-16 11:15:00,1,{}
17365,229374,13450639038,U3Z5KPTRQ,0,2017-04-06 18:53:10,,1,13450639038,2017-04-06 18:53:10,0,{}
18208,208415,12845433001,U14BPHDK6,0,2017-01-26 17:45:30,,1,12845433001,2017-01-26 17:45:30,0,{}
19898,206258,12829060401,U1NLYUY5N,0,2017-01-24 20:16:44,,1,12829060401,2017-01-24 20:16:44,0,{}


In [90]:
# parameters initialization
VALIDATION_SPLIT = 0.1
RANDOM_SEED = 42

In [91]:
data = data.dropna()

In [92]:
data

Unnamed: 0,index,msg_id,user_id,channel,timestamp,text,main_msg,thread_id,thread_timestamp,replies_count,reactions
0,212514,12950696802,U3HM4KY14,0,2017-02-07 22:09:28,"а если ещё сделать TDD для саентологов, то нас...",1,12950696802,2017-02-07 22:09:28,0,{}
1,199780,12591043100,U14BPHDK6,0,2016-12-28 07:07:11,срочно в <#C04DA5FUF|_jobs>! :joy:,1,12591043100,2016-12-28 07:07:11,0,{}
2,199781,12591050400,U09JEC7V0,0,2016-12-28 07:08:24,под требования канала подходит - надо постить,1,12591050400,2016-12-28 07:08:24,0,{}
3,199782,12591050700,U14BPHDK6,0,2016-12-28 07:08:27,"а, так это парт-тайм и научный сотрудник :good...",1,12591050700,2016-12-28 07:08:27,0,{}
4,199783,12591051100,U1CF22N7J,0,2016-12-28 07:08:31,"Мне кажется это лишь формально называется ""кон...",1,12591051100,2016-12-28 07:08:31,0,"{'count': 2, 'name': 'coincidence'}"
5,199784,12591056000,U0J1U64FK,0,2016-12-28 07:09:20,откуда получается число 5470?,1,12591056000,2016-12-28 07:09:20,0,"{'count': 1, 'name': 'bitcoin'}"
6,199785,12591056800,U1CF22N7J,0,2016-12-28 07:09:28,Выбор из пустого множества претендентов,1,12591056800,2016-12-28 07:09:28,0,{}
7,199786,12591059200,U1CF22N7J,0,2016-12-28 07:09:52,"Откуда? Просто зп такая, обязательно откуда-то...",1,12591059200,2016-12-28 07:09:52,0,{}
8,199787,12591059700,U14BPHDK6,0,2016-12-28 07:09:57,а что такое свидетельства программ эвм и как и...,1,12591059700,2016-12-28 07:09:57,0,{}
9,199779,12591039100,U19HATM25,0,2016-12-28 07:06:31,<http://icm.krasn.ru/page.php?page=news_2016_1...,1,12591039100,2016-12-28 07:06:31,0,"{'count': 1, 'name': 'norma'}"


In [93]:
from datetime import date
import fastnumbers

In [94]:
data.head(10)

Unnamed: 0,index,msg_id,user_id,channel,timestamp,text,main_msg,thread_id,thread_timestamp,replies_count,reactions
0,212514,12950696802,U3HM4KY14,0,2017-02-07 22:09:28,"а если ещё сделать TDD для саентологов, то нас...",1,12950696802,2017-02-07 22:09:28,0,{}
1,199780,12591043100,U14BPHDK6,0,2016-12-28 07:07:11,срочно в <#C04DA5FUF|_jobs>! :joy:,1,12591043100,2016-12-28 07:07:11,0,{}
2,199781,12591050400,U09JEC7V0,0,2016-12-28 07:08:24,под требования канала подходит - надо постить,1,12591050400,2016-12-28 07:08:24,0,{}
3,199782,12591050700,U14BPHDK6,0,2016-12-28 07:08:27,"а, так это парт-тайм и научный сотрудник :good...",1,12591050700,2016-12-28 07:08:27,0,{}
4,199783,12591051100,U1CF22N7J,0,2016-12-28 07:08:31,"Мне кажется это лишь формально называется ""кон...",1,12591051100,2016-12-28 07:08:31,0,"{'count': 2, 'name': 'coincidence'}"
5,199784,12591056000,U0J1U64FK,0,2016-12-28 07:09:20,откуда получается число 5470?,1,12591056000,2016-12-28 07:09:20,0,"{'count': 1, 'name': 'bitcoin'}"
6,199785,12591056800,U1CF22N7J,0,2016-12-28 07:09:28,Выбор из пустого множества претендентов,1,12591056800,2016-12-28 07:09:28,0,{}
7,199786,12591059200,U1CF22N7J,0,2016-12-28 07:09:52,"Откуда? Просто зп такая, обязательно откуда-то...",1,12591059200,2016-12-28 07:09:52,0,{}
8,199787,12591059700,U14BPHDK6,0,2016-12-28 07:09:57,а что такое свидетельства программ эвм и как и...,1,12591059700,2016-12-28 07:09:57,0,{}
9,199779,12591039100,U19HATM25,0,2016-12-28 07:06:31,<http://icm.krasn.ru/page.php?page=news_2016_1...,1,12591039100,2016-12-28 07:06:31,0,"{'count': 1, 'name': 'norma'}"


In [95]:
# data_train = data.
date_before = date(2017, 4, 1)
train = data[data['timestamp'] < date_before]
val = data[data['timestamp'] > date_before]

In [96]:
train_data = train[['channel', 'text']].reset_index()[['channel', 'text']]
train_data = train_data.sort_values('channel').reset_index()[['channel', 'text']]

val_data = val[['channel', 'text']].reset_index()[['channel', 'text']]
val_data = val_data.sort_values('channel').reset_index()[['channel', 'text']]

from fastnumbers import isfloat, isint
train_data = train_data[~train_data.text.apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]
val_data = val_data[~val_data.text.apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]

train_text = train_data['text'].astype(str)
train_labels =  np.asarray(train_data['channel'], dtype='int8')

val_text = val_data['text'].astype(str)
val_labels = np.asarray(val_data['channel'], dtype='int8')

In [128]:
morph = pymorphy2.MorphAnalyzer()

In [140]:
train_text_lem = []
for sentence in train_text:
    train_text_lem.append([])
    for word in nltk.word_tokenize(sentence):
        if word not in stopwords.words('russian'):
            train_text_lem[-1].append(morph.parse(word)[0].normal_form)

In [157]:
train_text_lem_1 = [" ".join(i) for i in train_text_lem]

In [142]:
val_text_lem = []
for sentence in val_text:
    val_text_lem.append([])
    for word in nltk.word_tokenize(sentence):
        if word not in stopwords.words('russian'):
            val_text_lem[-1].append(morph.parse(word)[0].normal_form)
               

In [145]:
# without tuning accuracy_score = 50.81%
# accuracy_score = 58.26%
classifier = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer = 'char', max_features = 1000000, 
                                                       ngram_range = (1, 4))),
    ('clf', OneVsRestClassifier(LinearSVC(),n_jobs=-1))])

In [99]:
classifier.fit(train_text, train_labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf...ti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=-1))])

In [160]:
classifier.fit([" ".join(i) for i in train_text_lem], train_labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf...ti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=-1))])

In [162]:
predicted = classifier.predict([" ".join(i) for i in val_text_lem])

In [163]:
accuracy_score(predicted, val_labels)

0.58085365853658533

In [164]:
pd.crosstab(predicted, val_labels, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,10,11,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1660,97,147,54,15,473,31,8,35,19,14,4,2557
1,117,501,146,148,7,231,27,18,70,2,21,25,1313
2,79,84,707,75,9,245,13,16,63,2,18,15,1326
3,51,106,104,514,3,69,30,25,26,1,6,6,941
4,75,12,18,13,133,72,6,5,5,3,8,3,353
5,28,35,32,21,3,585,4,5,16,0,6,2,737
6,17,8,10,14,1,19,156,5,1,1,2,1,235
7,13,9,5,14,4,11,4,118,2,0,2,2,184
8,6,14,17,10,0,16,2,3,142,0,15,0,225
9,12,7,4,4,4,6,1,2,4,184,1,0,229


In [146]:
param_grid = {"min_df": [1,3,5],
              "max_features": [100000, 1000000],
              "ngram_range":[(1,2), (1,3), (1,4)]}

In [147]:
grid = ParameterGrid(param_grid)

In [148]:
for params in grid:
    classifier = Pipeline([
                        ('vectorizer', CountVectorizer(min_df = params['min_df'], max_features = params['max_features'], 
                                                       ngram_range = params['ngram_range'])),
                        ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(data_train, labels_train)
    predicted = classifier.predict(data_test)
    accuracy = accuracy_score(predicted, labels_test)
    print('Parameters: {}\n Score:{}'.format(params, accuracy))

Parameters: {'min_df': 1, 'max_features': 100000, 'ngram_range': (1, 2)}
 Score:0.4976190476190476
Parameters: {'min_df': 1, 'max_features': 100000, 'ngram_range': (1, 3)}
 Score:0.495
Parameters: {'min_df': 1, 'max_features': 100000, 'ngram_range': (1, 4)}
 Score:0.4955555555555556
Parameters: {'min_df': 3, 'max_features': 100000, 'ngram_range': (1, 2)}
 Score:0.49563492063492065
Parameters: {'min_df': 3, 'max_features': 100000, 'ngram_range': (1, 3)}
 Score:0.49603174603174605
Parameters: {'min_df': 3, 'max_features': 100000, 'ngram_range': (1, 4)}
 Score:0.4946031746031746
Parameters: {'min_df': 5, 'max_features': 100000, 'ngram_range': (1, 2)}
 Score:0.48507936507936505
Parameters: {'min_df': 5, 'max_features': 100000, 'ngram_range': (1, 3)}
 Score:0.48563492063492064
Parameters: {'min_df': 5, 'max_features': 100000, 'ngram_range': (1, 4)}
 Score:0.4847619047619048
Parameters: {'min_df': 1, 'max_features': 1000000, 'ngram_range': (1, 2)}
 Score:0.5063492063492063
Parameters: {'min_