# Disclaimer
### This dataset was published on Kaggle by <a href="https://www.kaggle.com/blackmoon">АнатолийБельчиков</a>. It was used by <a href="https://www.kaggle.com/andrewkalita">me</a> just as an example of text classification in Russian language. If you speak Russian and this corpus unacceptable for you to read, please visit <a href="https://www.google.com/search?q=котики&tbm=isch">this site</a>.

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics

import nltk
from nltk.stem.snowball import SnowballStemmer

import matplotlib.pyplot as plt
import seaborn as sns
import re

In [3]:
nltk.download('averaged_perceptron_tagger_ru')

[nltk_data] Error loading averaged_perceptron_tagger_ru: <urlopen
[nltk_data]     error [SSL: CERTIFICATE_VERIFY_FAILED] certificate
[nltk_data]     verify failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1129)>


False

In [4]:
dfComments = pd.read_csv('data/labeled.csv')
dfComments.head(10)

Unnamed: 0,comment,toxic
0,"Верблюдов-то за что? Дебилы, бл...\n",1.0
1,"Хохлы, это отдушина затюканого россиянина, мол...",1.0
2,Собаке - собачья смерть\n,1.0
3,"Страницу обнови, дебил. Это тоже не оскорблени...",1.0
4,"тебя не убедил 6-страничный пдф в том, что Скр...",1.0
5,Для каких стан является эталоном современная с...,1.0
6,В шапке были ссылки на инфу по текущему фильму...,0.0
7,УПАД Т! ТАМ НЕЛЬЗЯ СТРОИТЬ! ТЕХНОЛОГИЙ НЕТ! РА...,1.0
8,"Ебать тебя разносит, шизик.\n",1.0
9,"Обосрался, сиди обтекай\n",1.0


In [5]:
dfComments.tail(10)

Unnamed: 0,comment,toxic
14402,"ЭЙ МИХАЛЫЧ, ТАМ ПОЖАРКА ПИЩИТ ДА ХУЙ С НЕЙ ПУЩ...",1.0
14403,Пусть лучше само евровидение стримит. Долбоеб ...,1.0
14404,"ЖЕНЩИНА это ВЕНЕЦ ТВОРЕНИЯ, помните это ваньки...",1.0
14405,Сейчас все авиакомпании мира вместе со специал...,1.0
14406,Потому что запад прошел эту хуйню еще пару сот...,0.0
14407,Вонючий совковый скот прибежал и ноет. А вот и...,1.0
14408,А кого любить? Гоблина тупорылого что-ли? Или ...,1.0
14409,"Посмотрел Утомленных солнцем 2. И оказалось, ч...",0.0
14410,КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...,1.0
14411,До сих пор пересматриваю его видео. Орамбо кст...,0.0


In [None]:
desc = dfComments.groupby('toxic').describe()

plt.bar('0', desc['comment']['count'][0], label="Non toxical comment", color='green')
plt.bar('1', desc['comment']['count'][1], label="Toxical comment", color='red')
plt.legend()
plt.ylabel('Number of comments')
plt.title('Comment groups')
plt.show()

print('Comment description\n')
print(desc)
print()
print(dfComments.describe())

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(10,5))

ax1.hist(dfComments[dfComments['toxic']==0]['comment'].str.len() ,color='green')
ax1.set_title('non toxic')

ax2.hist(dfComments[dfComments['toxic']==1]['comment'].str.len() ,color='red')
ax2.set_title('toxic')

fig.suptitle('Characters in comments')
plt.show()

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(10,5))

ax1.hist(dfComments[dfComments['toxic']==0]['comment'].str.split().map(lambda x: len(x)) ,color='green')
ax1.set_title('non toxic')

ax2.hist(dfComments[dfComments['toxic']==1]['comment'].str.split().map(lambda x: len(x)) ,color='red')
ax2.set_title('toxic')

fig.suptitle('Words in comments')
plt.show()

### Preporation

In [None]:
text = np.array(dfComments.comment.values)
target = dfComments.toxic.astype(int).values

In [None]:
def upperCaseRate(string):
    "Returns percentage of uppercase letters in the string"
    return np.array(list(map(str.isupper, string))).mean()

In [None]:
upcaseRate = list(map(upperCaseRate, dfComments.comment.values))

In [None]:
def cleanText(string):
    """This function deletes all symbols except Cyrilic and Base Latin alphabet,
    stopwords, functional parts of speech. Returns string of words stem."""
    # Common cleaning
    string = string.lower()
    string = re.sub(r"http\S+", "", string)
    string = str.replace(string,'Ё','е')
    string = str.replace(string,'ё','е')
    prog = re.compile('[А-Яа-яA-Za-z]+')
    words = prog.findall(string.lower())
    
    # Word Cleaning
    ## Stop Words
    stopwords = nltk.corpus.stopwords.words('russian')
    words = [w for w in words if w not in stopwords]
    ## Cleaning functional POS (Parts of Speech)
    functionalPos = {'CONJ', 'PRCL'}
    words = [w for w, pos in nltk.pos_tag(words, lang='rus') if pos not in functionalPos]
    ## Stemming
    stemmer = SnowballStemmer('russian')
    return ' '.join(list(map(stemmer.stem, words)))

In [None]:
%%time
text = list(map(cleanText, text))

## Proccessing

In [None]:
X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=.3, stratify=target, shuffle = True, random_state=0)
print('Dim of train:', len(X_train), '\tTarget rate: {:.2f}%'.format(y_train.mean()))
print("Dim of test:", len(X_test), '\tTarget rate: {:.2f}%'.format(y_test.mean()))

## Baseline

In [None]:
clf_pipeline = Pipeline(
            [("vectorizer", TfidfVectorizer()), # Prod feature: tokenizer=cleanText
            ("classifier", LinearSVC())]
        )

clf_pipeline.fit(X_train, y_train)

In [None]:
cm = metrics.confusion_matrix(y_test, clf_pipeline.predict(X_test))

def plotConfusionMatrix(cm):
    fig = plt.figure(figsize=(7,7))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return None

plotConfusionMatrix(cm)

In [None]:
print(metrics.classification_report(y_test, clf_pipeline.predict(X_test)))
f1_base = metrics.f1_score(y_test, clf_pipeline.predict(X_test))

## Adjusts

In [None]:
print('\n'.join(clf_pipeline.get_params().keys()))

In [None]:
parameters = {'vectorizer__max_features': (10**3, 10**4),
              'vectorizer__ngram_range': ((1, 2),(2, 3)),
              'classifier__penalty': ('l1','l2'),
              'classifier__C': (range(1,10,2))
             }

In [None]:
%%time
gs_clf = GridSearchCV(clf_pipeline, parameters, scoring='f1', cv = 4, n_jobs=-1)
gs_clf.fit(X_train, y_train)

In [None]:
print(metrics.classification_report(y_test, gs_clf.predict(X_test)))
f1_gsLSVC = metrics.f1_score(y_test, gs_clf.predict(X_test))

### Randomized Search CV on TfidfVectorizer and LogisticRegression

In [None]:
parameters = { #'vectorizer__max_features': (10**2, 10**3),
              'vectorizer__ngram_range': [(1, 2),(1, 3)],
              'vectorizer__min_df': [0.,.2,.4,.6,.8,1],
              'classifier__penalty': ('l1','l2'),
              'classifier__C': (range(1,10,2)),
             }

In [None]:
clf_pipeline_LogitReg = Pipeline(
            [("vectorizer", TfidfVectorizer()),
            ("classifier", LogisticRegression())]
        )

In [None]:
def plotROC(y_test, probs, titl=''):
    if titl!='':
        titl = ' ('+titl+')' 
    fpr, tpr, threshold = metrics.roc_curve(y_test, probs)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic'+titl)
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.5f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    return None

In [None]:
%%time
rndgs_clf_LogitReg = RandomizedSearchCV(clf_pipeline_LogitReg, parameters, scoring='f1', cv = 4, n_jobs=-1)
rndgs_clf_LogitReg.fit(X_train, y_train)

probs = rndgs_clf_LogitReg.predict_proba(X_train)[:,1]
plotROC(y_train, probs, 'Train')

probs = rndgs_clf_LogitReg.predict_proba(X_test)[:,1]
plotROC(y_test, probs, 'Test')

Looks like overfited model. There's too high AUC on train.

In [None]:
plotConfusionMatrix(metrics.confusion_matrix(y_test, rndgs_clf_LogitReg.predict(X_test)))

In [None]:
print(metrics.classification_report(y_test, rndgs_clf_LogitReg.predict(X_test)))
f1_rndLogR = metrics.f1_score(y_test, rndgs_clf_LogitReg.predict(X_test))

### Add more parameters

In [None]:
%%time
parameters = {'vectorizer__max_features': (10**2, 10**3),
              'vectorizer__ngram_range': [(1, 2),(1, 3)],
              'vectorizer__min_df': [0.,.2,.4,.6,.8,1],
              'classifier__penalty': ('l1','l2'),
              'classifier__C': (range(1,10,2)),
             }

clf_pipeline_LogitReg = Pipeline(
            [("vectorizer", TfidfVectorizer()),
            ("classifier", LogisticRegression())]
        )

rndgs_clf_LogitReg = RandomizedSearchCV(clf_pipeline_LogitReg, parameters, scoring='f1', cv = 4, n_jobs=-1)
rndgs_clf_LogitReg.fit(X_train, y_train)

probs = rndgs_clf_LogitReg.predict_proba(X_train)[:,1]
plotROC(y_train, probs, 'Train')

probs = rndgs_clf_LogitReg.predict_proba(X_test)[:,1]
plotROC(y_test, probs, 'Test')

In [None]:
print(metrics.classification_report(y_test, rndgs_clf_LogitReg.predict(X_test)))
f1_rndLogR_2 = metrics.f1_score(y_test, rndgs_clf_LogitReg.predict(X_test))

In [None]:
pd.DataFrame([f1_base, f1_gsLSVC, f1_rndLogR, f1_rndLogR_2], index=['BaseLine', 'GS_LSVC', 'rndGS_LogR', 'rndGS_LogR_Adj'], columns=['f1 score'])

## Baseline algorithm looks simple and good enough. The Confusion Matrix isn't bad too. Howewer, the Logistic Regression forecasts probubilities of classification. Hence, we are able to plot Reciever Operator Curve (ROC) and get AUC (Area under Curve) value. It may be a useful for the next improvements of this notebook.