# 1. Подготовка

In [1]:
import pandas as pd
import re
import sys
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import transformers
import torch

Выгрузим данные из csv и взглянем на них

In [2]:
df_tweets = pd.read_csv('/datasets/toxic_comments.csv')

In [3]:
df_tweets

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


Видим, что данным необходима предобработка.  
В первую очередь необходимо избавится от спецсимволов, оставив только слова, привести текст к нижнему регистру, а также лемматизировать слова.  
Выделим метод get_wordnet_pos для получение правильного pos_tag, а в методе clear_text очистим текст от спецсимволов и лемматизируем каждое слово.

In [4]:
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('stopwords', quiet=True)
stopwords = set(nltk_stopwords.words('english'))

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clear_text(text):
    return re.sub(r"[^a-zA-Z\' ]", ' ', text.lower())

def stem_text(text):
    return " ".join([porter.stem(w) for w in text.split()])

def lemm_text(text):
    text_pos = nltk.pos_tag(text.split())
    #meanfull = [x for x in text_pos if x[1].startswith('J') | x[1].startswith('V') | x[1].startswith('N') | x[1].startswith('R')]
    return " ".join([lemmatizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in text_pos])

porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()

Применим clear_text к массиву тексту.

In [5]:
%%time
df_tweets["clear"] = df_tweets["text"].apply(lambda x: clear_text(x))

CPU times: user 1.67 s, sys: 52 ms, total: 1.72 s
Wall time: 1.72 s


In [6]:
%%time
#необходимо для корректной работы PorterStemmer
sys.setrecursionlimit(10000)
df_tweets["stem"] = df_tweets["clear"].apply(lambda x: stem_text(x))

CPU times: user 4min 9s, sys: 872 ms, total: 4min 10s
Wall time: 4min 14s


In [7]:
%%time
df_tweets["lemm"] = df_tweets["clear"].apply(lambda x: lemm_text(x))

CPU times: user 11min 3s, sys: 4.78 s, total: 11min 8s
Wall time: 11min 12s


Посмотрим, что получилось

In [8]:
df_tweets

Unnamed: 0,text,toxic,clear,stem,lemm
0,Explanation\nWhy the edits made under my usern...,0,explanation why the edits made under my userna...,explan whi the edit made under my usernam hard...,explanation why the edits make under my userna...
1,D'aww! He matches this background colour I'm s...,0,d'aww he matches this background colour i'm s...,d'aww he match thi background colour i'm seemi...,d'aww he match this background colour i'm seem...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man i'm really not trying to edit war it...,hey man i'm realli not tri to edit war it' jus...,hey man i'm really not try to edit war it's ju...
3,"""\nMore\nI can't make any real suggestions on ...",0,more i can't make any real suggestions on im...,more i can't make ani real suggest on improv i...,more i can't make any real suggestion on impro...
4,"You, sir, are my hero. Any chance you remember...",0,you sir are my hero any chance you remember...,you sir are my hero ani chanc you rememb what ...,you sir be my hero any chance you remember wha...
...,...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,and for the second time of asking when ...,and for the second time of ask when your view ...,and for the second time of ask when your view ...
159567,You should be ashamed of yourself \n\nThat is ...,0,you should be ashamed of yourself that is a ...,you should be asham of yourself that is a horr...,you should be ashamed of yourself that be a ho...
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,spitzer umm theres no actual article for pr...,spitzer umm there no actual articl for prostit...,spitzer umm theres no actual article for prost...
159569,And it looks like it was actually you who put ...,0,and it looks like it was actually you who put ...,and it look like it wa actual you who put on t...,and it look like it be actually you who put on...


Заметим, что после преобразований стемматизированная версия мало отличается от лемматизированной. Вероятно и скор полученный на основе этих колонок будет мало отличаться.  
Все ок, можем переходить к обучению.

# 2. Обучение

Выделим feature и target, а также разобьем выборку на тренировочный, тестовый и финальный сет

In [9]:
features = df_tweets.drop(['toxic'], axis=1)
target = df_tweets['toxic']

features_train, features_test, target_train, target_test = train_test_split(features, 
                                                                              target, 
                                                                              test_size=0.2, 
                                                                              random_state = 12345)


features_final, features_test, target_final, target_test = train_test_split(features_test, 
                                                                              target_test, 
                                                                              test_size=0.5, 
                                                                              random_state = 12345)

Проведем векторизацию текста на обучающей выборке

In [10]:
word_vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(features_train["lemm"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [11]:
train_word_features = word_vectorizer.transform(features_train["lemm"].values.astype('U'))
test_word_features = word_vectorizer.transform(features_test["lemm"].values.astype('U'))

Получим наилучшие параметры для LogisticRegression с помощью GridSearchCV

In [12]:
pipe = Pipeline([('classifier' , LogisticRegression())])

param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 10),
    'classifier__solver' : ['liblinear']}
]

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
best_clf = clf.fit(train_word_features, target_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 33.2min finished


In [13]:
best_clf.best_params_

{'classifier': LogisticRegression(C=2.782559402207126, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 'classifier__C': 2.782559402207126,
 'classifier__penalty': 'l1',
 'classifier__solver': 'liblinear'}

Построим модель на основе полученных параметров.

In [14]:
model = LogisticRegression(C=2.782559402207126, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False)

In [15]:
model.fit(train_word_features, target_train)

LogisticRegression(C=2.782559402207126, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
pred = model.predict(test_word_features)
scores = f1_score(pred, target_test)
scores

0.7847151142954624

Полученный скор нас вполне устроит, можем переходить к проверке.

# 3. Выводы

Попробуем применить полученную модель к выборке features_final в разных ситуациях  

In [18]:
def calculate_f1_score(text, field):
    features = word_vectorizer.transform(features_final[field].values.astype('U'))
    pred = model.predict(features)
    print(text, f1_score(pred, target_final))


In [19]:
calculate_f1_score("RegExp","clear")

RegExp 0.7671794871794873


In [20]:
calculate_f1_score("Stemmatization", "stem")

Stemmatization 0.7671601615074023


In [21]:
calculate_f1_score("Lemmmatization", "lemm")

Lemmmatization 0.7810810810810811


Простое отсечение спецсимволов достоточно для получение хорошего скора, однако лемматизация помогает строить более точные модели.  
Как оказалось, стемматизация при помощи PorterStemmer дает результат хуже, чем совсем без него, вероятно стоит в дальнейшем оставаться при лемматизации.