In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import cufflinks as cf

from scipy.sparse import hstack
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

from collections import Counter

from sklearn.manifold import TSNE

from sklearn.model_selection import train_test_split,TimeSeriesSplit
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import FeatureUnion

import lightgbm as lgb

import re
import razdel
from pymystem3 import Mystem
import tqdm
tqdm.tqdm_pandas(tqdm.tqdm, leave=True)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, AffinityPropagation,DBSCAN
from sklearn.decomposition import PCA

from sklearn.metrics import silhouette_score,davies_bouldin_score

import numpy as np

from sklearn.linear_model import LogisticRegressionCV,LogisticRegression

from sklearn.metrics import accuracy_score,roc_auc_score, f1_score,roc_curve,auc

from gensim.models import KeyedVectors




## Data Load

In [2]:
df_train = pd.read_csv('train.csv', encoding='utf-8',sep=',')
df_test = pd.read_csv('test.csv', encoding='utf-8',sep=',')

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112467 entries, 0 to 112466
Data columns (total 3 columns):
id       112467 non-null int64
text     112467 non-null object
label    112467 non-null int64
dtypes: int64(2), object(1)
memory usage: 2.6+ MB


In [9]:
df_train['label'].value_counts()

1    50000
2    40851
0    21616
Name: label, dtype: int64

## Text preparation

In [3]:
from nltk.corpus import stopwords
sw_ru = set(stopwords.words('russian'))

In [4]:
df_train['have_number'] = df_train['text'].apply(lambda x: 1 if re.search(r'\d{10}',x) else 0)
df_train['have_eng'] = df_train['text'].apply(lambda x: 1 if re.search(r'[a-zA-Z]+',x) else 0)
df_train['have_symbols'] = df_train['text'].apply(lambda x: 1 if re.search(r'[*\'\"=]+',x) else 0)
df_train['have_upper'] = df_train['text'].apply(lambda x: 1 if re.search(r'[А-Я][А-Я]+',x) else 0)

## is_upper для знакомств
## сокращения для знакомств

In [5]:
df_test['have_number'] = df_test['text'].apply(lambda x: 1 if re.search(r'\d{10}',x) else 0)
df_test['have_eng'] = df_test['text'].apply(lambda x: 1 if re.search(r'[a-zA-Z]+',x) else 0)
df_test['have_symbols'] = df_test['text'].apply(lambda x: 1 if re.search(r'[*\'\"=]+',x) else 0)
df_test['have_upper'] = df_test['text'].apply(lambda x: 1 if re.search(r'[А-Я][А-Я]+',x) else 0)

In [43]:
mystem = Mystem()

In [7]:
upos_map = {'A':'ADJ','ADV':'ADV','ADVPRO':'ADV','ANUM':'ADJ','APRO':'DET','COM':'ADJ','CONJ':'SCON','INTJ':'INTJ','NONLEX':'X','NUM':'NUM','PART':'PART','PR':'ADP','S':'NOUN','SPRO':'PRON','UNKN':'X' ,'V':'VERB'}

In [8]:
def text_prep(text):
    
    text = text.lower()
    result = []
    # Убираем лишние символы
    #text = re.sub(r'[;,]',r' ',text).strip()
    text = re.sub(r'[^\w\s\.]',r'',text).strip()   
    #text = [token.text for token in razdel.tokenize(text)]
    # Делаем лемматизацию       
    result = [lemma for lemma in mystem.lemmatize(text) if not lemma.isspace() #and lemma not in sw_ru
             and lemma.strip() not in ['.','..','...']]
    

    # Лемматизатор неправильно разбивает слова с дефисом, поэтому исправляем это
#     if '-' in text:
#         for l in range(len(text)):
#             if text[l] == '-':
#                 text[l] = f'{text[l-1]}-{text[l+1]}'
#                 text[l-1] = text[l+1] = text[l]
    
    return result

In [9]:
def text_prep_tags(text):
    
    text = text.lower()
    result = []
    # Убираем лишние символы
    #text = re.sub(r'[;,]',r' ',text).strip()
    text = re.sub(r'[^\w\s\.]',r'',text).strip()   
    #text = [token.text for token in razdel.tokenize(text)]
    # Делаем лемматизацию       
#     text = [lemma for lemma in mystem.lemmatize(text) if not lemma.isspace() and lemma not in sw_ru
#             and lemma.strip() not in ['.','..','...']]
    
    
    for item in mystem.analyze(text):
      #  print(item)
        token = None
        if item.get('analysis'):
            lemma = item['analysis'][0]['lex']
            pos = re.split('[=,]', item['analysis'][0]['gr'])[0]
            #and lemma not in sw_ru
            if not lemma.isspace()  and lemma.strip() not in ['.','..','...']: #and lemma not in sw_ru
     
                token = f'{lemma}_{upos_map[pos]}'
        else:
            lem_text = item["text"]
            if not lem_text.isspace() and lem_text.strip() not in ['.','..','...']: #and lem_text not in sw_ru
            
                token = f'{lem_text}_UNKN'
            
        if token:    
            result.append(token)

    
    
    
    # Лемматизатор неправильно разбивает слова с дефисом, поэтому исправляем это
#     if '-' in text:
#         for l in range(len(text)):
#             if text[l] == '-':
#                 text[l] = f'{text[l-1]}-{text[l+1]}'
#                 text[l-1] = text[l+1] = text[l]
    
    return result

In [158]:
# применяем функцию к признаку

#df_train['prep_text'] = df_train['text'].progress_apply(text_prep)

In [43]:
df_train.loc[:10]['text'].apply(text_prep)

0                 [44, позн, мужч, 40, год, 0678935919]
1                             [вспомнить, это, увидеть]
2                             [sep, decimal, изза, это]
3                    [нужный, дева, 19л, 0637048428смс]
4     [м54, искать, д, сп, ласка, некм, 0508665030, ...
5                [говорить, спать, постоянно, хотеться]
6      [познакомиться, адекватный, мужчина, 0668847450]
7     [модер, доченька, звать, арина, папа, дима, ма...
8                                           [это, попс]
9                 [пар, пообщий, дева, смс, 0935695491]
10           [смотреть, паблик, смотреть, прочий, фолд]
Name: text, dtype: object

In [228]:
text_prep_tags('Мне говорили от них спать постоянно хочется мама приятель')

['говорить_VERB',
 'спать_VERB',
 'постоянно_ADV',
 'хотеться_VERB',
 'мама_NOUN',
 'приятель_NOUN']

In [62]:
df_train.head(10)

Unnamed: 0,id,text,label,have_number,have_eng,have_symbols,have_upper
0,0,ж 44 позн с мужч от 40 лет 0678935919,1,1,0,0,0
1,1,уже и не вспомню где это увидел,2,0,0,0,0
2,2,", sep=';', decimal=',' может изза этого",2,0,1,1,0
3,3,НУЖНА ДЕВ ДЛЯ С\\О -19л.0637048428смс,1,1,0,0,1
4,4,.М54.ИЩУ Д.Ж. С-П.ДЛЯ ЛАСК...НЕКМ.0508665030....,1,1,0,0,1
5,5,Мне говорили от них спать постоянно хочется,2,0,0,0,0
6,6,ПОЗНАКОМЛЮСЬ С АДЕКВАТНЫМ МУЖЧИНОЙ С/О 0668847450,1,1,0,0,1
7,7,модер ДОЧЕНЬКУ ЗОВУТ АРИНА ПАПА ДИМА МАМА НАСТ...,1,0,0,0,1
8,8,"Но это не попса, не надо",2,0,0,0,0
9,9,пар. Пообщ. С дев. По смс. 0935695491,1,1,0,0,0


In [55]:
#pd.concat([df_train['text'],df_test['text']])

## Vectorizing

In [44]:
# Преобразуем нашу информацию в вектора методом tf-idf
idf_vectorizer = TfidfVectorizer(min_df = 2,tokenizer=text_prep)

#idf_vectorizer.fit(pd.concat([df_train['text'],df_test['text']]))

In [58]:
#idf_vector

In [11]:
X_train,X_test,y_train,y_test = train_test_split(df_train,df_train['label'],test_size=0.2,random_state=20)

In [10]:
X_train_idf_vector = idf_vectorizer.transform(tqdm.tqdm_notebook(X_train['text']))
X_test_idf_vector = idf_vectorizer.transform(tqdm.tqdm_notebook(X_test['text']))

HBox(children=(IntProgress(value=0, max=89973), HTML(value='')))




HBox(children=(IntProgress(value=0, max=22494), HTML(value='')))




In [54]:
# Final prediction
fin_train_idf_vector = idf_vectorizer.transform(tqdm.tqdm_notebook(df_train['text']))
fin_test_idf_vector = idf_vectorizer.transform(tqdm.tqdm_notebook(df_test['text']))

In [60]:
fin_train_idf_vector

<112467x57519 sparse matrix of type '<class 'numpy.float64'>'
	with 773992 stored elements in Compressed Sparse Row format>

### Word2Vec

In [12]:
word2vec = KeyedVectors.load_word2vec_format('tayga-func_upos_skipgram_300_5_2019.bin', binary=True)

In [115]:
#word2vec.most_similar(positive=['обама::барак_NOUN'])

In [231]:
#word2vec['парень_NOUN']

In [13]:
from sklearn.base import TransformerMixin


class Word2VecVectorizer(TransformerMixin):
    def __init__(self, vectors):
        self.vectors = vectors
        self.zeros = np.zeros(self.vectors.vector_size)
        
    def _get_text_vector(self, text):
        token_vectors = []
        for token in text_prep_tags(text):
            try:
                token_vectors.append(self.vectors[token])
            except KeyError: # не нашли такой токен в словаре
                pass
                
        if not token_vectors:
            return self.zeros

        text_vector = np.sum(token_vectors, axis=0)
        return text_vector / np.linalg.norm(text_vector)
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return np.array([self._get_text_vector(text) for text in X])

In [45]:
word2vec_vectorizer = Word2VecVectorizer(word2vec)

In [178]:
X_train_w2v_vector = word2vec_vectorizer.fit_transform(tqdm.tqdm_notebook(X_train['text']))
X_test_w2v_vector = word2vec_vectorizer.fit_transform(tqdm.tqdm_notebook(X_test['text']))

HBox(children=(IntProgress(value=0, max=89973), HTML(value='')))




HBox(children=(IntProgress(value=0, max=22494), HTML(value='')))




In [136]:
X_train['text'].loc[:15046].apply(text_prep)

64390                         [парень_S, девушка_S, лайф_S]
108881    [задавать_V, early_UNKN, __UNKN, stopping_UNKN...
39023               [равный_A, колво_S, столбец_S, чтоль_S]
15381     [привет_S, дешево_ADV, арендовать_V, сервер_S,...
28059     [прочитывать_V, читать_V, hacking_UNKN, guide_...
74909               [интригантка_S, чертов_A, искатьр_UNKN]
40571     [ребята_S, правильно_ADV, писаться_V, полность...
60915     [почему_ADVPRO, заставлять_V, страдать_V, самы...
91119     [отношение_S, штамп_S, заканчиваться_V, это_PA...
865       [жак_S, фреско_ADV, хороший_A, купить_V, деньг...
15046     [м32_UNKN, познакомиться_V, дж_UNKN, 066_UNKN,...
Name: text, dtype: object

In [234]:
#X_train_w2v_vector[6]

### Feature Union


In [15]:
fun = FeatureUnion(
        [
            ('tf-idf', idf_vectorizer),
            ('word2vec', word2vec_vectorizer),
        ]
    )
fun = fun.fit(tqdm.tqdm_notebook(pd.concat([df_train['text']]))) #,df_test['text']
X_train_fun = fun.transform(tqdm.tqdm_notebook(X_train['text']))
X_test_fun = fun.transform(tqdm.tqdm_notebook(X_test['text']))

HBox(children=(IntProgress(value=0, max=112467), HTML(value='')))




HBox(children=(IntProgress(value=0, max=89973), HTML(value='')))




HBox(children=(IntProgress(value=0, max=22494), HTML(value='')))




In [46]:
# FInal version

fun2 = FeatureUnion(
        [
            ('tf-idf', idf_vectorizer),
            ('word2vec', word2vec_vectorizer),
        ]
    )
fun2 = fun2.fit(tqdm.tqdm_notebook(pd.concat([df_train['text'],df_test['text']]))) #,df_test['text']
fin_train_fun = fun2.transform(tqdm.tqdm_notebook(df_train['text']))
fin_test_fun = fun2.transform(tqdm.tqdm_notebook(df_test['text']))

HBox(children=(IntProgress(value=0, max=224933), HTML(value='')))

HBox(children=(IntProgress(value=0, max=112467), HTML(value='')))

HBox(children=(IntProgress(value=0, max=112466), HTML(value='')))

### Adding features

In [34]:
new_X_train_idf_vector = hstack((X_train_idf_vector,np.array(X_train['have_symbols'])[:,None]))
new_X_test_idf_vector = hstack((X_test_idf_vector,np.array(X_test['have_symbols'])[:,None]))

In [39]:
new_X_train_fun = hstack((X_train_fun,X_train[['have_symbols','have_number','have_eng','have_upper']].values))
new_X_test_fun = hstack((X_test_fun,X_test[['have_symbols','have_number','have_eng','have_upper']].values))

In [45]:
X_train[['have_symbols','have_number','have_eng','have_upper']].values

array([[1, 0, 0, 0],
       [1, 0, 1, 0],
       [0, 0, 0, 0],
       ...,
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 1, 0]])

In [69]:
max(new_X_train_idf_vector.getrow(1).toarray()[0][:52000])

0.5774018004541697

In [55]:
# Final prediction
fin_train_idf_vector = hstack((fin_train_idf_vector,df_train[['have_symbols','have_number','have_eng','have_upper']].values))
fin_test_idf_vector = hstack((fin_test_idf_vector,df_test[['have_symbols','have_number','have_eng','have_upper']].values))

In [47]:
# Final prediction
fin_train_fun = hstack((fin_train_fun,df_train[['have_symbols','have_number','have_eng','have_upper']].values))
fin_test_fun = hstack((fin_test_fun,df_test[['have_symbols','have_number','have_eng','have_upper']].values))

## Modeling

In [18]:
cv = StratifiedKFold(n_splits=5, shuffle=True)

### NN

In [78]:
nn = MLPClassifier((256,100),random_state=20,verbose=True,max_iter=1,learning_rate_init=0.001)

In [38]:
nn.fit(X_train_fun,y_train)
y_pred = nn.predict(X_test_fun)
f1_score(y_test,y_pred,average='weighted')

Iteration 1, loss = 0.31952142
Iteration 2, loss = 0.21086383
Iteration 3, loss = 0.16021787
Iteration 4, loss = 0.12032247
Iteration 5, loss = 0.08934619
Iteration 6, loss = 0.07349143



Stochastic Optimizer: Maximum iterations (6) reached and the optimization hasn't converged yet.



0.8651809400734689

In [77]:
np.mean(cross_val_score(nn,new_X_train_fun,y_train,scoring='f1_weighted',n_jobs=-1,cv=cv))

0.8857310140234388

In [79]:
#Final version
nn.fit(fin_train_fun,df_train['label'])
y_fin_pred = nn.predict(fin_test_fun)

Iteration 1, loss = 0.30896300



Stochastic Optimizer: Maximum iterations (1) reached and the optimization hasn't converged yet.



In [55]:
y_fin_pred

array([2, 2, 2, ..., 2, 2, 0])

### SVM

In [18]:
svc = SVC(verbose=True,cache_size=1024)

In [19]:
svc.fit(X_train_idf_vector,y_train)
y_pred = svc.predict(X_test_idf_vector)
f1_score(y_test,y_pred,average='weighted')





[LibSVM]


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



0.27831785055244673

In [20]:
#cross_val_score(svc,X_train_idf_vector,y_train,scoring='f1_weighted',n_jobs=-1,cv=cv)

### Logit

In [48]:
logit = LogisticRegression(n_jobs=-1, random_state=20,C=1,multi_class='multinomial',solver='lbfgs')

In [20]:
logit.fit(X_train_fun,y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=20, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
y_pred = logit.predict(X_test_fun)
f1_score(y_test,y_pred,average='weighted')

0.875766148495799

In [61]:
y_pred

array([2, 2, 2, ..., 2, 1, 0])

In [50]:
np.mean(cross_val_score(logit,new_X_train_fun,y_train,scoring='f1_weighted',n_jobs=-1,cv=cv))

ValueError: Found input variables with inconsistent numbers of samples: [112467, 89973]

In [49]:
# Final logit
logit.fit(fin_train_fun,df_train['label'])
y_fin_pred = logit.predict(fin_test_fun)

### Lgbm

In [22]:
# Инициализируем lightGBM
lgbcl = lgb.LGBMClassifier(
        #num_leaves=31,
        #max_depth = 7,
        learning_rate=0.5,
        n_estimators=800,
        #subsample=.9,
        #colsample_bytree=.9,
        random_state=20,
        n_jobs=-1,
        objective='multiclass',
        num_class=3
    )

        

In [25]:
# Обучаем классификатор
lgbcl.fit(
         X_train_fun,  y_train,
        eval_set=[( X_test_fun, y_test)],
        early_stopping_rounds=100,
    #categorical_feature=cat_cols,
        verbose=100)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 0.334903
[200]	valid_0's multi_logloss: 0.335143
Early stopping, best iteration is:
[171]	valid_0's multi_logloss: 0.333562


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.5, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=800, n_jobs=-1, num_class=3, num_leaves=31,
        objective='multiclass', random_state=20, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [26]:
y_pred = lgbcl.predict(X_test_fun)
f1_score(y_test,y_pred,average='weighted')

0.8527844966923913

In [23]:
np.mean(cross_val_score(lgbcl,X_train_fun,y_train,scoring='f1_weighted',n_jobs=-1,cv=cv))

0.8546487426045941

In [103]:
# Обучаем классификатор
lgbcl.fit(
        fin_train_idf_vector,  df_train['label'],
        #eval_set=[( X_test_idf_vector, y_test)],
        #early_stopping_rounds=100,
    #categorical_feature=cat_cols,
        verbose=100)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.5, max_depth=7,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=800, n_jobs=-1, num_class=3, num_leaves=31,
        objective='multiclass', random_state=20, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [104]:
y_fin_pred = lgbcl.predict(fin_test_idf_vector)
#f1_score(y_test,y_pred,average='weighted')

## To CSV

In [80]:
# Final save to csv
submit = pd.read_csv('sample.csv', encoding='utf-8',sep=',',index_col='id')
submit['label'] = y_fin_pred
submit.to_csv('predict/union_nn_256_100_1epoch_0.001_4features.csv', encoding='utf-8',sep=',')