In [174]:
import pandas as pd
import numpy as np
import pymorphy2
import nltk
import stop_words
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold, ParameterGrid
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from datetime import date
import fastnumbers
import re

In [81]:
data = pd.read_csv('../ucu_sentiment-master/data/train_set.csv', usecols=range(1,11),  parse_dates=['timestamp', 'thread_timestamp'])

In [83]:
#select top 12 channels
data =data[data.channel.isin(['career', 'big_data', 'deep_learning', 'kaggle_crackers', 
           'lang_python',  'lang_r', 'nlp', 'theory_and_practice', 'welcome', 'bayesian', '_meetings', 'datasets']) 
           & data.main_msg]

In [85]:
# make labels for channels
mappings = {
    'career': 0,
    'theory_and_practice': 1,
    'deep_learning': 2,
    'lang_python': 3,
    '_meetings': 4,
    'kaggle_crackers': 5,
    'big_data': 6,
    'lang_r': 7,
    'nlp': 8,
    'welcome': 9,
    'datasets': 10,
    'bayesian': 11
}

In [86]:
# mapping labels to channels
data['channel'] = data.channel.map(mappings)
data = data.sort_values('channel').reset_index()

In [91]:
# exclude messages w/o text
data = data.dropna()

In [95]:
# split on data and data val
date_before = date(2017, 4, 1)
train = data[data['timestamp'] < date_before]
val = data[data['timestamp'] > date_before]

In [96]:
train_data = train[['channel', 'text']].reset_index()[['channel', 'text']]
train_data = train_data.sort_values('channel').reset_index()[['channel', 'text']]

val_data = val[['channel', 'text']].reset_index()[['channel', 'text']]
val_data = val_data.sort_values('channel').reset_index()[['channel', 'text']]

from fastnumbers import isfloat, isint
train_data = train_data[~train_data.text.apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]
val_data = val_data[~val_data.text.apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]

train_text = train_data['text'].astype(str)
train_labels =  np.asarray(train_data['channel'], dtype='int8')

val_text = val_data['text'].astype(str)
val_labels = np.asarray(val_data['channel'], dtype='int8')

In [128]:
# library for lemmatization and stop_words
morph = pymorphy2.MorphAnalyzer()

In [140]:
# exclude stop_words & make lemmatization - train data
train_text_lem = []
for sentence in train_text:
    train_text_lem.append([])
    for word in nltk.word_tokenize(sentence):
        if word not in stopwords.words('russian'):
            train_text_lem[-1].append(morph.parse(word)[0].normal_form)

In [142]:
# exclude stop_words & make lemmatization - val data
val_text_lem = []
for sentence in val_text:
    val_text_lem.append([])
    for word in nltk.word_tokenize(sentence):
        if word not in stopwords.words('russian'):
            val_text_lem[-1].append(morph.parse(word)[0].normal_form)
               

In [179]:
# exclude spec symbols
train_text = train_text\
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x))\
    .apply(lambda x: re.sub('\s+', ' ', x))
    
val_text_lem = val_text_lem\
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x))\
    .apply(lambda x: re.sub('\s+', ' ', x))

In [212]:
# without tuning accuracy_score = 50.81%
# accuracy_score = 53.81%
classifier = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer = 'char', max_features = 1000000, 
                                                       ngram_range = (1, 4))),
    ('clf', OneVsRestClassifier(LinearSVC(),n_jobs=-1))])

In [213]:
classifier.fit(train_text, train_labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf...ti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=-1))])

In [160]:
# classifier for data with lemmatization and w/o stop_word - accuracy score = 51%
#classifier.fit([" ".join(i) for i in train_text_lem], train_labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf...ti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=-1))])

In [215]:
predicted = classifier.predict(val_text)

In [216]:
accuracy_score(predicted, val_labels)

0.53817073170731711

In [219]:
classifier.predict(['как у дата сайнтиста зарплата в германии'])

array([0], dtype=int8)

In [236]:
classifier.predict(['какие лучшие подходы для сентимент анализа'])

array([8], dtype=int8)

In [234]:
classifier.predict(['почему поламался хадуп'])

array([6], dtype=int8)

In [231]:
classifier.predict(['кто-то принимает участет в соревоаниях'])

array([5], dtype=int8)

In [217]:
# matrix error
pd.crosstab(predicted, val_labels, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,10,11,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1544,90,132,57,7,457,28,12,36,12,15,9,2399
1,134,486,150,146,7,264,28,18,72,2,19,22,1348
2,167,92,687,86,41,290,24,55,84,1,21,19,1567
3,42,106,110,510,5,78,33,24,22,2,4,4,940
4,87,17,33,8,108,104,7,8,9,2,6,4,393
5,40,35,44,19,4,493,9,3,13,1,10,1,672
6,17,15,8,13,1,15,141,8,1,0,3,1,223
7,5,14,10,18,0,10,3,74,4,0,0,1,139
8,9,8,15,6,1,14,1,2,123,0,17,1,197
9,12,8,7,3,5,2,1,1,3,192,1,1,236
