## Подготовка

загружаем библиотеки

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pymystem3 import Mystem
import re
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn import linear_model
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 
from sklearn.metrics import f1_score
import pymorphy2
import nltk
from nltk.corpus import stopwords as nltk_stopwords
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


загружаем файл и смотрим его

In [2]:
toxic = pd.read_csv('c:\\123\\toxic_comments.csv')
toxic.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


смотрим информацию по файлу

In [3]:
toxic.shape

(159571, 2)

In [4]:
toxic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [5]:
toxic['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

видим разбивку, по нет понимания, надо ли уравновешивать классы? 

In [6]:
corpus = toxic['text'] # создадим корпус

In [7]:
morph = pymorphy2.MorphAnalyzer()
# разбиваем текст на слова и избавляемся от пробелов
def lemmatize(text):
    words = text.split() 
    res = list()
    for word in words:
        p = morph.parse(word)[0]
        res.append(p.normal_form)

    return " ".join(res)

def clear_text(text):
    cyrillic_text = re.sub(r'[^a-zA-Z]', " ", text)
    
    return " ".join(cyrillic_text.split())
   

In [8]:
lemmatized_text = corpus.apply(lambda x: lemmatize(clear_text(x)) )
lemmatized_text[:3]

0    explanation why the edits made under my userna...
1    d aww he matches this background colour i m se...
2    hey man i m really not trying to edit war it s...
Name: text, dtype: object

создаем выборки

In [9]:
X_train, X_test, y_train, y_test = train_test_split(lemmatized_text, toxic['toxic'])

создадим два мешка слов

In [10]:
count_vect = CountVectorizer(stop_words=stopwords)
n_gramm_train = count_vect.fit_transform(X_train)
n_gramm_test = count_vect.transform(X_test)

print("Размер train'a:", n_gramm_train.shape)
print("Размер test'a:", n_gramm_test.shape)

Размер train'a: (119678, 142497)
Размер test'a: (39893, 142497)


In [11]:
Tf_Idf_count = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 2))
n_gramm_train_tf = Tf_Idf_count.fit_transform(X_train)
n_gramm_test_tf = Tf_Idf_count.transform(X_test)

print("Размер train'a:", n_gramm_train_tf.shape)
print("Размер test'a:", n_gramm_test_tf.shape)

Размер train'a: (119678, 2271968)
Размер test'a: (39893, 2271968)


## Обучение

обучаем разнные модели

In [12]:
lr = LogisticRegression(random_state=12345, class_weight='balanced')
lr.fit(n_gramm_train, y_train)
print("train:", lr.score(n_gramm_train, y_train))
print("test:", lr.score(n_gramm_test, y_test))
f1_lr = f1_score(y_test, lr.predict(n_gramm_test))
print("\nF1:", f1_lr)

train: 0.9751750530590417
test: 0.9473341187677036

F1: 0.7611685801977947


In [13]:
lr = LogisticRegression(random_state=12345, class_weight='balanced')
lr.fit(n_gramm_train_tf, y_train)
print("train:", lr.score(n_gramm_train_tf, y_train))
print("test:", lr.score(n_gramm_test_tf, y_test))
f1_lr_tf = f1_score(y_test, lr.predict(n_gramm_test_tf))
print("\nF1:", f1_lr_tf)

train: 0.9797874296027674
test: 0.9432983230140626

F1: 0.747375474648202


In [14]:
rfc = RandomForestClassifier(random_state=12345, class_weight='balanced')
rfc.fit(n_gramm_train, y_train)
print("train:", rfc.score(n_gramm_train, y_train))
print("test:", rfc.score(n_gramm_test, y_test))
f1_rfc = f1_score(y_test, rfc.predict(n_gramm_test))
print("\nF1:", f1_rfc)

train: 0.9997409716071458
test: 0.9459052966685885

F1: 0.6479608482871126


In [15]:
rfc = RandomForestClassifier(random_state=12345, class_weight='balanced')
rfc.fit(n_gramm_train_tf, y_train)
print("train:", rfc.score(n_gramm_train_tf, y_train))
print("test:", rfc.score(n_gramm_test_tf, y_test))
f1_rfc_tf = f1_score(y_test, rfc.predict(n_gramm_test_tf))
print("\nF1:", f1_rfc_tf)

train: 0.999749327361754
test: 0.9376582357807134

F1: 0.5586512866015971


## Выводы

In [25]:
model = []
mod_lr = [f1_lr, f1_lr_tf]
mod_rfc = [f1_rfc, f1_rfc_tf]
model.append(mod_lr)
model.append(mod_rfc)

In [27]:
final_models = pd.DataFrame(data=model, index=['LogisticRegression', 'RandomForestClassifier'], columns=['CountVectorizer', 'TfidfVectorizer'])

In [28]:
final_models

Unnamed: 0,CountVectorizer,TfidfVectorizer
LogisticRegression,0.761169,0.747375
RandomForestClassifier,0.647961,0.558651


наивысшая метрика f1 достигнута при применении класса CountVectorizer, на модели LogisticRegression