In [75]:
import os
import re
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
stop_words = stopwords.words("russian")
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [76]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [77]:
import zipfile

with zipfile.ZipFile('/content/gdrive/MyDrive/Colab Notebooks/corpus_news.zip', 'r') as zip:
    zip.extractall('corpus_news')

In [78]:
def ziptodoc(folder):
    doc = []
    for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
            try:
              text = f.read()
              doc.append(text)
            except UnicodeDecodeError:
              pass
    return doc

culture = ziptodoc('corpus_news/corpus/culture')
tech = ziptodoc('corpus_news/corpus/hi-tech')
politics = ziptodoc('corpus_news/corpus/politics')
science = ziptodoc('corpus_news/corpus/science')

In [79]:
data = pd.concat([pd.DataFrame({'text': culture, 'class': 'culture'}),
                 pd.DataFrame({'text': tech, 'class': 'tech'}),
                 pd.DataFrame({'text': politics, 'class': 'politics'}),
                 pd.DataFrame({'text': science, 'class': 'science'})])

In [80]:
def clean(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    clean_text = []
    for word in tokens:
        if word not in stop_words:
            word = morph.parse(word)[0].normal_form
            clean_text.append(word)
    clean_text = ' '.join(clean_text) 
    
    return clean_text 

In [81]:
data['text'] = data['text'].apply(clean)

In [82]:
data.head()

Unnamed: 0,text,class
0,`` вид жизнь '' : главный веха человеческий пу...,culture
1,учёный признать опасный здоровье запойный прос...,culture
2,четверо российский артист попасть чёрный списо...,culture
3,следок : дмитрий марьянов колоть сильный препа...,culture
4,александр галибин объясниться жена любовь стек...,culture


DECISION TREE

In [85]:
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.1, random_state=42)

text_clf_tree = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', tree.DecisionTreeClassifier()),
                     ])

text_clf_tree.fit(X_train, y_train)


predicted_tree = text_clf_tree.predict(X_test)

print(metrics.classification_report(y_test, predicted_tree))

              precision    recall  f1-score   support

     culture       0.79      0.69      0.73        16
    politics       0.92      0.90      0.91        50
     science       0.72      0.87      0.79        15
        tech       1.00      1.00      1.00        25

    accuracy                           0.89       106
   macro avg       0.86      0.86      0.86       106
weighted avg       0.89      0.89      0.89       106



RANDOM FOREST

In [86]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.1, random_state=42)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100)),
                     ])

text_clf.fit(X_train, y_train)


predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

     culture       0.94      0.94      0.94        16
    politics       0.98      0.98      0.98        50
     science       1.00      1.00      1.00        15
        tech       1.00      1.00      1.00        25

    accuracy                           0.98       106
   macro avg       0.98      0.98      0.98       106
weighted avg       0.98      0.98      0.98       106

