# Part 3. Classification

## 1. Data preparation

In [1]:
import bz2, json
import tqdm
import pandas as pd
import numpy as np

In [2]:
path = 'banki_responses.json.bz2'

with bz2.open(path, 'rt') as bzinput:
    lines = []
    for i, cont in tqdm.tqdm(enumerate(bzinput)):
        tweets = json.loads(cont)
        lines.append(tweets)

201030it [02:11, 1533.16it/s]


In [3]:
df = pd.DataFrame(lines)
df.head(3)

Unnamed: 0,city,rating_not_checked,title,num_comments,bank_license,author,bank_name,datetime,text,rating_grade
0,г. Воронеж,True,Ипотека на 5+,0,лицензия № 1623,e.novikova,ВТБ 24,2015-06-08 11:06:56,Здравствуйте! Хотелось бы выразить благодарно...,5.0
1,г. Казань,True,ЗВОНКИ СОСЕДЯМ,0,лицензия № 2289,KZN\Vorontsova_NA,Русский Стандарт,2015-06-06 18:17:52,Уважаемые представители департамента по работе...,2.0
2,г. Санкт-Петербург,True,Ложная информация!!!!,0,лицензия № 2307,Evgenia15,Союз,2015-06-07 19:08:33,Здравствуйте. Столкнулась с такой ситуацией. в...,2.0


In [4]:
import pymorphy3, re, nltk

In [5]:
morph = pymorphy3.MorphAnalyzer()
ru_words = re.compile(r"[А-Яа-я]+")
stops = nltk.corpus.stopwords.words('russian')

def preprocess_text(text):
    
    def check_stop_words(text):
        return filter(lambda x: x not in stops, text)
    
    def lemmatize(text):
        return map(lambda x: morph.parse(x)[0].normal_form, text)
    
    def words_only(text):
        return ru_words.findall(text.lower())

    return ' '.join(lemmatize(check_stop_words(words_only(text))))

### Make a train-test dataset

- Lets make a DS little less but with an original ratio

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
shorten = lambda x, y=10: x[:len(x)//y]

rating_5 = shorten(df[df['rating_grade'] == 5], 50)
rating_1 = shorten(df[df['rating_grade'] == 1], 50)

In [8]:
train_test_ds = pd.concat([rating_1, rating_5])

X = train_test_ds.drop('rating_grade', axis=1)['text'].apply(preprocess_text)
y = train_test_ds['rating_grade']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=15)

In [10]:
X_train

2662    год пользоваться платиновый дебетовый карта сб...
800     создать система пара шаблон оплата коммуналка ...
241     иметь банк кредит причём кредит наличные креди...
3284    г открыть вклад год победа войковский довольно...
712     хотеть сказать большой спасибо сотрудник банк ...
                              ...                        
4816    хороший банк широкий спектр услуга немой неско...
1323    добрый день уважаемый читатель отзыв начать де...
4924    пользоваться банк москва год который разубежда...
3431    уехать неделя родитель санкт петербург помогат...
345     добрый день весь взаимодействовать несколькими...
Name: text, Length: 1584, dtype: object

## 2. Baseline model

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [12]:
from sklearn.metrics import f1_score, accuracy_score

In [13]:
def eval_model(setup_pipe, name=None, verbose=True):
    setup_pipe.fit(X_train, y_train)
    yhat = setup_pipe.predict(X_test)
    acc, f1 = map(lambda x: round(x, 2), (accuracy_score(y_test, yhat), f1_score(y_test, yhat)))
    if verbose:
        print('accuracy:', acc)
        print('f1-score:', f1)
    else:
        return [name, acc, f1]
    # print(classification_report(y_test, yhat))

#### Unigram

In [14]:
baseline = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,1))),
    ('clf', LogisticRegression())
])

In [15]:
eval_model(baseline)

accuracy: 0.93
f1-score: 0.95


#### 2-gram

In [16]:
baseline_2gram = Pipeline([
    ('cv', CountVectorizer(ngram_range=(2,2))),
    ('clf', LogisticRegression())
])

In [17]:
eval_model(baseline_2gram)

accuracy: 0.87
f1-score: 0.92


#### 3-grams

In [18]:
baseline_3gram = Pipeline([
    ('cv', CountVectorizer(ngram_range=(3,3))),
    ('clf', LogisticRegression())
])

In [19]:
eval_model(baseline_3gram)

accuracy: 0.77
f1-score: 0.86


#### mean N of symbols/word -grams

In [20]:
word_count = lambda x: int(len(x)/len(x.split()))

In [21]:
np.mean(X.apply(word_count))

7.444444444444445

- So words have a mean range of 7-8 symbols

In [22]:
baseline_symgram = Pipeline([
    ('cv', CountVectorizer(ngram_range=(7,8), analyzer='char')),
    ('clf', LogisticRegression())
])

In [23]:
eval_model(baseline_symgram)

accuracy: 0.95
f1-score: 0.97


- Wow!

## 3. Topic modelling

### 3.1 TfIdf + LSA

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD

In [25]:
LSA_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lsa', TruncatedSVD(n_components=20, random_state=5)),
    ('clf', LogisticRegression())
])

In [26]:
eval_model(LSA_pipe)

accuracy: 0.9
f1-score: 0.94


### 3.2 LDA

In [27]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [28]:
LDA_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('lda', LDA(n_components=10, random_state=0)),
    ('clf', LogisticRegression())
])

In [29]:
eval_model(LDA_pipe)

accuracy: 0.9
f1-score: 0.94


# Summary

In [30]:
models = ('baseline_unigram', 'bigram', '3gram', 'chargram', 'LSA', 'LDA')
pipes = (baseline, baseline_2gram, baseline_3gram, baseline_symgram, LSA_pipe, LDA_pipe)

In [31]:
res = [eval_model(model, name, False) for name, model in zip(models, pipes)]

In [32]:
res_df = pd.DataFrame(res).set_index(0)
res_df.rename({0: 'model', 1:'accuracy', 2:'f1-score'}, axis=1)

Unnamed: 0_level_0,accuracy,f1-score
0,Unnamed: 1_level_1,Unnamed: 2_level_1
baseline_unigram,0.93,0.95
bigram,0.87,0.92
3gram,0.77,0.86
chargram,0.95,0.97
LSA,0.9,0.94
LDA,0.9,0.94
