# Проект для Интернет-магазина

# Описание проекта

Интернет-магазин  запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.

Столбец text содержит текст комментария, а toxic — целевой признак.

In [12]:
import pandas as pd
import nltk
nltk.download('wordnet')
import re
import nltk
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Подготовка данных

In [4]:
df = pd.read_csv('/datasets/toxic_comments.csv')

In [5]:
df.head(30)

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0
5,5,"""\n\nCongratulations from me as well, use the ...",0
6,6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,7,Your vandalism to the Matt Shirvington article...,0
8,8,Sorry if the word 'nonsense' was offensive to ...,0
9,9,alignment on this subject and which are contra...,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159292 entries, 0 to 159291
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  159292 non-null  int64 
 1   text        159292 non-null  object
 2   toxic       159292 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ MB


In [8]:
#очистка текста
def clear_text(text):
    text = re.sub(r'(?:\n|\r)',' ', text)
    text = re.sub(r'[^a-zA-Z ]+',' ', text).strip()
    text = text.lower()
    
    return text

In [9]:
df['text'] = df['text'].apply(clear_text)

In [10]:
corpus = df['text'].values

In [15]:
#лемматизация
WNL = WordNetLemmatizer()
def lemmatize(text):
    text_new = []
    for sentence in corpus:
        word_list = nltk.word_tokenize(sentence)
        text_new.append(' '.join([WNL.lemmatize(w) for w in word_list]))
    return text_new

In [16]:
corpus[0] = lemmatize(corpus[0])

In [19]:
df['text'] = corpus[0]

In [21]:
df = df.drop_duplicates()

In [22]:
df.head(30)

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,explanation why the edits made under my userna...,0
1,1,d aww he match this background colour i m seem...,0
2,2,hey man i m really not trying to edit war it s...,0
3,3,more i can t make any real suggestion on impro...,0
4,4,you sir are my hero any chance you remember wh...,0
5,5,congratulation from me a well use the tool wel...,0
6,6,cocksucker before you piss around on my work,1
7,7,your vandalism to the matt shirvington article...,0
8,8,sorry if the word nonsense wa offensive to you...,0
9,9,alignment on this subject and which are contra...,0


In [23]:
#разделение на выборки
train, test = train_test_split(df, train_size = 0.9, shuffle=False, random_state = 42)
test, valid = train_test_split(test, test_size = 0.5, shuffle=False, random_state = 42)

In [24]:
print(train.shape)
print(test.shape)
print(valid.shape)

(143362, 3)
(7965, 3)
(7965, 3)


In [None]:
target_train = train['toxic']
target_test = test['toxic']
target_valid = valid['toxic']

feature_train = train['text']
feature_test = test['text']
feature_valid = valid['text']

In [None]:
#стоп слова
stopwords = set(nltk_stopwords.words('english'))

In [None]:
#TFIDF векторизация
count_tf_idf = TfidfVectorizer(stop_words=stopwords)
tfidf_train = count_tf_idf.fit_transform(feature_train)
tfidf_test = count_tf_idf.transform(feature_test)
tfidf_valid = count_tf_idf.transform(feature_valid)

# Обучение

In [None]:
%%time
#LogisticRegression
pipe = Pipeline([
    (
    ('model', LogisticRegression(random_state=1, solver='liblinear', max_iter=200))
    )
    ])


param_LR = [
        {

            'model': [LogisticRegression(random_state=42, solver='liblinear')],
            'model__penalty' : ['l1', 'l2'],
            'model__C': list(range(1,15,3))
        }
]
grid = GridSearchCV(pipe, param_grid=param_LR, scoring='f1', cv=3, verbose=True, n_jobs=-1)
best_grid = grid.fit(tfidf_train, target_train)
print('Best parameters is:', grid.best_params_)
print('Best score is:', grid.best_score_)

In [None]:
%%time
#RandomForestClassifier
RFC = RandomForestClassifier(random_state=12345)
params_RF = {
    'n_estimators': list(range(50,300,50)),
    'max_depth':[5,15],
    'max_features' : list(range(1,20, 2))
}

                                 
grid = GridSearchCV(RFC, param_grid=params_RF, scoring='f1', cv=3, verbose=True, n_jobs=-1)
best_grid = grid.fit(tfidf_train, target_train)
print('Best parameters is:', grid.best_params_)
print('Best score is:', grid.best_score_)

## Тестирование

In [None]:
model = LogisticRegression(C=4, penalty='l1', random_state=42, solver='liblinear')
model.fit(tfidf_valid, target_valid)
test_pred = model.predict(tfidf_test)
f1_score(target_test, test_pred)

LogisticRegression с параметрами C=4, penalty='l1', random_state=42, solver='liblinear' на тестовой выборке показывает значение f1 = 0.752

# Вывод

В данном проекте был создан инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.

В ходе проекта была проведена очистка текста и его лемматизация.

Обучены 2 модели RandomForestClassifier и LogisticRegression. На обучающей выборке наилучшие показатели были достигнуты с помощью модели LogisticRegression.

На тестовой выборке с помощью LogisticRegression с параметрами C=4, penalty='l1', random_state=42, solver='liblinear' удалось достигнуть значения F1 = 0.752