In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the news dump in 10,000-row chunks, then concatenate ONCE.
# Calling pd.concat inside the loop re-copies the whole accumulated frame for
# every chunk; gathering the chunks first keeps the load linear in file size.
chunk_reader = pd.read_csv('lenta-ru-news.csv', sep=',', chunksize=10000)
chunks = list(chunk_reader)
# Fall back to an empty frame when the reader yields no chunks, which is what
# the accumulate-in-a-loop version produced in that case.
df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
    

In [3]:
# Keep only fully populated rows: a row is discarded if any of its columns is missing.
df = df.dropna(axis=0, how='any')

In [75]:
# Per-topic count of non-null values in every remaining column.
df.groupby(by='topic').count()

Unnamed: 0_level_0,url,title,text,tags,date
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
69-я параллель,1268,1268,1268,1268,1268
Библиотека,65,65,65,65,65
Бизнес,7375,7375,7375,7375,7375
Бывший СССР,51370,51370,51370,51370,51370
Дом,21734,21734,21734,21734,21734
Из жизни,27513,27513,27513,27513,27513
Интернет и СМИ,44421,44421,44421,44421,44421
Крым,666,666,666,666,666
Культпросвет,340,340,340,340,340
Культура,53530,53530,53530,53530,53530


In [4]:
# Topics excluded from training.
# NOTE: 'Культпросвет ' is spelled with a trailing space on purpose. The labels
# are matched against the 'topic' index as-is, so the spelling must be kept
# exactly as written here.
topics_to_drop = [
    '69-я параллель',
    'Библиотека',
    'Бывший СССР',
    'Дом',
    'Из жизни',
    'Крым',
    'МедНовости',
    'Оружие',
    'Путешествия',
    'Силовые структуры',
    'Сочи',
    'Ценности',
    'ЧМ-2014',
    'Культпросвет ',
    'Легпром',
    'Экономика',
]

# Index by topic so whole topics can be removed by label in one call.
df_topic_index = df.set_index('topic')
df_drop = df_topic_index.drop(topics_to_drop, inplace=False)

In [5]:
# Bring 'topic' back as a regular column.
df_index = df_drop.reset_index()

# Keep only the label and the two text fields, encode the topic as an integer
# category code, and expose that code under the column name 'flag'.
training_data = (
    df_index
    .drop(columns=['url', 'tags', 'date'])
    .assign(topic=lambda frame: frame['topic'].astype('category').cat.codes)
    .rename(columns={'topic': 'flag'})
)

# One frame per input field, with every double-quote character removed from it.
training_data_title = (
    training_data
    .drop(columns='text')
    .assign(title=lambda frame: frame['title'].replace(to_replace='"', value='', regex=True))
)
training_data_text = (
    training_data
    .drop(columns='title')
    .assign(text=lambda frame: frame['text'].replace(to_replace='"', value='', regex=True))
)

# Persist both training sets (the frame index is written as the first column).
training_data_title.to_csv('training_data_title.csv', sep=',', encoding='utf-8')
training_data_text.to_csv('training_data_text.csv', sep=',', encoding='utf-8')

In [6]:
# Bag-of-words counts: one independent vectorizer per input field.
count_vect_title = CountVectorizer()
count_vect_text = CountVectorizer()

# Learn each vocabulary and build the document-term count matrices.
x_title_train_counts = count_vect_title.fit_transform(training_data_title.title)
x_text_train_counts = count_vect_text.fit_transform(training_data_text.text)

# Only the learned vocabulary mapping is pickled, not the vectorizer object.
# The files are opened in `with` blocks so the handles are flushed and closed
# deterministically instead of being left to the garbage collector.
with open("count_vector_title.pkl", "wb") as vocab_file:
    pickle.dump(count_vect_title.vocabulary_, vocab_file)
with open("count_vector_text.pkl", "wb") as vocab_file:
    pickle.dump(count_vect_text.vocabulary_, vocab_file)

In [7]:
# TF-IDF re-weighting of the raw count matrices, one transformer per field.
# NOTE(review): both transformers are fitted on every row, i.e. before the
# train/test split performed in a later cell, so the IDF weights also reflect
# the held-out documents. Confirm whether that is acceptable for evaluation.
tfidf_transformer_title = TfidfTransformer()
tfidf_transformer_text = TfidfTransformer()

x_title_train_tfidf = tfidf_transformer_title.fit_transform(x_title_train_counts)
x_text_train_tfidf = tfidf_transformer_text.fit_transform(x_text_train_counts)

# Persist the fitted transformers. The files are opened in `with` blocks so the
# handles are flushed and closed deterministically rather than leaked.
with open("tfidf_title.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_transformer_title, tfidf_file)
with open("tfidf_text.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_transformer_text, tfidf_file)

In [8]:
# Hyper-parameters shared by both classifiers: L-BFGS solver, one hidden layer
# of 15 units, fixed seed. Only the iteration budget differs between them.
mlp_shared_params = dict(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf_neural_title = MLPClassifier(max_iter=200, **mlp_shared_params)
clf_neural_text = MLPClassifier(max_iter=100, **mlp_shared_params)

# 75/25 train/test split with a fixed seed, applied identically to both feature sets.
split_params = dict(test_size=0.25, random_state=42)
(x_title_train, x_title_test,
 y_title_train, y_title_test) = train_test_split(x_title_train_tfidf, training_data_title.flag, **split_params)
(x_text_train, x_text_test,
 y_text_train, y_text_test) = train_test_split(x_text_train_tfidf, training_data_text.flag, **split_params)

In [9]:
# Train the title classifier. The recorded output below shows L-BFGS stopping
# at its iteration limit rather than converging.
clf_neural_title.fit(X=x_title_train, y=y_title_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,), random_state=1,
              solver='lbfgs')

In [10]:
# Train the article-text classifier. The recorded output below shows L-BFGS
# stopping at its iteration limit rather than converging.
clf_neural_text.fit(X=x_text_train, y=y_text_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,), max_iter=100,
              random_state=1, solver='lbfgs')

In [11]:
# Persist the fitted title classifier. The `with` block guarantees the file is
# flushed and closed instead of leaving the handle open.
with open("softmax_title.pkl", "wb") as model_file:
    pickle.dump(clf_neural_title, model_file)

In [12]:
# Persist the fitted article-text classifier. The `with` block guarantees the
# file is flushed and closed instead of leaving the handle open.
with open("softmax_text.pkl", "wb") as model_file:
    pickle.dump(clf_neural_text, model_file)

In [13]:
# Human-readable names of the topics kept for training.
# NOTE(review): the order presumably mirrors the integer codes stored in the
# 'flag' column (position i <-> code i) — confirm before using it to decode
# predictions.
category_list = [
    'Бизнес',
    'Интернет и СМИ',
    'Культура',
    'Мир',
    'Наука и техника',
    'Россия',
    'Спорт',
]

In [14]:
# Evaluate the title classifier on its held-out split.
predicted_title = clf_neural_title.predict(x_title_test)

# Store true and predicted label codes side by side for later inspection.
result_softmax_title = pd.DataFrame(
    {
        'true_labels': y_title_test,
        'predicted_labels': predicted_title,
    }
)
result_softmax_title.to_csv('res_softmax_title.csv', sep=',')

# Fraction of held-out titles whose predicted code equals the true code.
score_title = accuracy_score(y_true=y_title_test, y_pred=predicted_title)
print(score_title)

0.8322768612937517


In [15]:
# Evaluate the article-text classifier on its held-out split.
predicted_text = clf_neural_text.predict(x_text_test)

# Store true and predicted label codes side by side for later inspection.
result_softmax_text = pd.DataFrame(
    {
        'true_labels': y_text_test,
        'predicted_labels': predicted_text,
    }
)
result_softmax_text.to_csv('res_softmax_text.csv', sep=',')

# Fraction of held-out articles whose predicted code equals the true code.
score_text = accuracy_score(y_true=y_text_test, y_pred=predicted_text)
print(score_text)

0.874695854167487
