In [1]:
from time import time
import numpy as np 
import pandas as pd 

import re
import nltk
from nltk.corpus import stopwords
from gensim.models import word2vec

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("labeled_rutoxic.csv", delimiter=',', header=0, names=['sentence', 'label'])

print('В наборе предложений: \n',df.shape[0])

В наборе предложений: 
 14412


In [3]:
print('toxic:', df[df['label'] > 0]['label'].count())
print('not toxic:', df[df['label'] < 1]['label'].count())

toxic: 4826
not toxic: 9586


In [4]:
X = df.iloc[:,0]# 
y = df.iloc[:,1]# 

train , test , y_train, y_test = train_test_split(X, y, test_size=0.3) # и отдаем 30% на тест, остальное на обучен

In [5]:
# Метод преобразования текста в слова
def text_to_words(raw_text, remove_stopwords=False):
   
    letters_only = re.sub("[^0-9а-яА-Я]", " ", raw_text)
    words = letters_only.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("russian"))
        meaningful_words = [w for w in words if not w in stops]
        words = meaningful_words
    return words 

sentences_train = train.apply(text_to_words, remove_stopwords=False)
sentences_test = test.apply(text_to_words, remove_stopwords=False)
print(sentences_train[:1])

3767    [50, человек, 33, машины, это, как, и, вообще,...
Name: sentence, dtype: object


In [6]:

num_features = 300                
min_word_count = 40                     
num_workers = 4      
context = 20                                                                                        
downsampling = 1e-3  
model = word2vec.Word2Vec(sentences_train, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, sample = downsampling)

In [7]:
# получение векторного представления
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    index2word_set = set(model.wv.index_to_key)
   
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1
            featureVec = np.add(featureVec, model.wv[word])

    if nwords == 0:
        nwords = 1
    featureVec = np.divide(featureVec, nwords)
    return featureVec

# получение среднего векторного простнраства для предложения
def getAvgFeatureVecs(reviews, model, num_features):
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    counter = 0
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1
    return reviewFeatureVecs

f_matrix_train = getAvgFeatureVecs(sentences_train, model, num_features)
f_matrix_test = getAvgFeatureVecs(sentences_test, model, num_features)
print(f_matrix_train[0,:])

[ 4.60491218e-02  1.77925035e-01  2.84816958e-02  2.69680172e-02
  7.21297786e-02 -1.90532833e-01  1.30635783e-01  3.62971634e-01
  6.36379272e-02 -7.17774481e-02  3.03260088e-02 -1.08946085e-01
  3.43056321e-02 -4.92329989e-03 -1.46393836e-01 -1.26624539e-01
  6.58736750e-02  3.61233391e-02  1.03520649e-02 -1.39252484e-01
 -4.46592793e-02 -3.42850126e-02  1.22742869e-01  3.17343436e-02
  4.40374836e-02 -1.59052256e-02 -1.71727449e-01  3.02167479e-02
 -1.86473384e-01 -9.60635319e-02  5.11607155e-02 -1.74030662e-02
  8.04688483e-02 -6.51075840e-02 -3.55086289e-03  5.26156947e-02
  3.28429453e-02 -2.40130067e-01 -3.82895991e-02 -1.31824417e-02
 -4.80668768e-02  1.90671382e-03  9.16009396e-02 -1.08047739e-01
  1.34357899e-01  1.90455616e-01  1.52480826e-02  4.88234870e-02
 -3.47661925e-03  5.87041564e-02  5.14409132e-02  3.69630940e-02
 -1.25418454e-01  9.55058336e-02 -9.36210603e-02  6.29331917e-02
  6.45324290e-02 -1.60671603e-02  4.48068492e-02 -2.50088088e-02
 -6.34320378e-02 -1.50354

In [8]:
model = []

m = MLPClassifier(solver='adam', hidden_layer_sizes=(300,40,30), random_state=1)
model.append(m)

print(model)

[MLPClassifier(hidden_layer_sizes=(300, 40, 30), random_state=1)]


In [9]:
batch_size = 200
total_rows = f_matrix_train.shape[0]
duration = 0
start_train = time()
pos = 0
classes = [0.0, 1.0]
while duration < 10 and pos < total_rows:
    if pos+batch_size > total_rows:
        batch_size = total_rows-pos
    X_p = f_matrix_train[pos:pos+batch_size]
    y_p = y_train.values[pos:pos+batch_size]
    model[0].partial_fit(X_p, y_p, classes)
    pos = pos + batch_size
    duration = time() - start_train
    if pos == total_rows:
        pos = 0
        batch_size = 10000
print('done')

done


In [10]:
# Сохранение результатов и расчет ошибки

In [11]:
y_test_values=y_test.values
predicted_results = model[0].predict_proba(f_matrix_test)
predicted_results = np.where(predicted_results[:,0]>predicted_results[:,1], 0.0,1.0)

sum_errors =sum(y_test_values - predicted_results)
print('count test values', len(y_test_values))
print('sum_errors', sum_errors)


count test values 4324
sum_errors 498.0


In [12]:
saved_result = pd.DataFrame({'text':test.values,
 'expected':  y_test_values,
 'predicted': predicted_results})

In [13]:
saved_result.to_csv('result.csv', encoding='utf-8', index=False)