In [1]:
import warnings
from sklearn.datasets import fetch_20newsgroups
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
categories = ['comp.windows.x', 'talk.politics.guns', 'talk.politics.misc']
remove = ('headers', 'footers', 'quotes')

twenty_train_full = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42, remove=remove)
twenty_test_full = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42, remove=remove)

#### Стемминг

In [4]:
import nltk
from nltk import word_tokenize
from nltk.stem import *
from nltk.stem import PorterStemmer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
def stemming(data):
    porter_stemmer = PorterStemmer()
    stem = []
    for text in data:
        nltk_tokens = word_tokenize(text)
        line = ''.join([' ' + porter_stemmer.stem(word) for word in nltk_tokens])
        stem.append(line)
    return stem

In [6]:
stem_train = stemming(twenty_train_full.data)
stem_test = stemming(twenty_test_full.data)

##RF, KNN, LR

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [8]:
stop_words = [None, 'english']
max_features_values = [100, 500, 1000, 5000, 10000]
use_idf = [True, False]

In [9]:
rf_first = range(1, 5, 1)
rf_second = range(5, 100, 20)

rf_tree_max_depth = [*rf_first, *rf_second]

In [10]:


knn_parameters = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__n_neighbors': [3, 5, 7, 9],
    'clf__metric': ['euclidean', 'manhattan'],
}

lr_parameters = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'liblinear'],
    'clf__penalty': ['l2'],
}

rf_parameters = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__n_estimators': [50, 100, 150],
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': rf_tree_max_depth,
}


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [19]:
# Уменьшение объема данных
subset_size = 200
subset_data = twenty_train_full.data[:subset_size]
subset_target = twenty_train_full.target[:subset_size]

In [20]:
# Дерево решений (RF) без стемминга
text_clf_dt = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', DecisionTreeClassifier())])
gscv_dt = GridSearchCV(text_clf_dt, param_grid=dt_parameters, n_jobs=-1)
gscv_dt.fit(subset_data, subset_target)

#### С использованием стемминга

In [21]:
# Дерево решений (RF) со стеммингом
text_clf_dt_stem = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', DecisionTreeClassifier())])
gscv_dt_stem = GridSearchCV(text_clf_dt_stem, param_grid=dt_parameters, n_jobs=-1)
gscv_dt_stem.fit(stem_train[:subset_size], subset_target)

### К-ближайших соседей (KNN):
• количество ближайших соседей,
• метрика (евклидова, городских кварталов)

#### Без использования стэмминга

In [24]:
# К-ближайших соседей (KNN) без стемминга
text_clf_knn = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', KNeighborsClassifier())])
gscv_knn = GridSearchCV(text_clf_knn, param_grid=knn_parameters, n_jobs=-1)
gscv_knn.fit(subset_data, subset_target)

#### С использованием стэмминга

In [25]:
# К-ближайших соседей (KNN) со стеммингом
text_clf_knn_stem = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer()),
                               ('clf', KNeighborsClassifier())])
gscv_knn_stem = GridSearchCV(text_clf_knn_stem, param_grid=knn_parameters, n_jobs=-1)
gscv_knn_stem.fit(stem_train[:subset_size], subset_target)

### Логистическая регрессия (LR):
• метод нахождения экстремума (параметр solver: ‘newton-cg’, ‘lbfgs’,
‘sag’, ‘liblinear’),
• регуляризация (параметр penalty: ‘L1’, ‘L2’)

#### Без использования стемминга

In [26]:
# Логистическая регрессия (LR) без стемминга
text_clf_lr = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LogisticRegression())])
gscv_lr = GridSearchCV(text_clf_lr, param_grid=lr_parameters, n_jobs=-1)
gscv_lr.fit(subset_data, subset_target)

#### С использованием стемминга

In [27]:
# Логистическая регрессия (LR) со стеммингом
text_clf_lr_stem = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', LogisticRegression())])
gscv_lr_stem = GridSearchCV(text_clf_lr_stem, param_grid=lr_parameters, n_jobs=-1)
gscv_lr_stem.fit(stem_train[:subset_size], subset_target)

### Вывод полученных результатов анализа

In [28]:
from sklearn.metrics import classification_report

In [29]:
predicted_dt = gscv_dt.predict(twenty_test_full.data)
predicted_knn = gscv_knn.predict(twenty_test_full.data)
predicted_lr = gscv_lr.predict(twenty_test_full.data)

predicted_dt_stem = gscv_dt_stem.predict(twenty_test_full.data)
predicted_knn_stem = gscv_knn_stem.predict(twenty_test_full.data)
predicted_lr_stem = gscv_lr_stem.predict(twenty_test_full.data)


In [30]:
print('Дерево решений (RF)\n')
print(classification_report(twenty_test_full.target, predicted_dt, target_names=categories))
print(gscv_dt.best_params_)

Дерево решений (RF)

                    precision    recall  f1-score   support

    comp.windows.x       0.72      0.53      0.61       395
talk.politics.guns       0.43      0.69      0.53       364
talk.politics.misc       0.42      0.26      0.32       310

          accuracy                           0.51      1069
         macro avg       0.52      0.49      0.49      1069
      weighted avg       0.53      0.51      0.50      1069

{'clf__criterion': 'entropy', 'clf__max_depth': 25, 'tfidf__use_idf': True, 'vect__max_features': 500, 'vect__stop_words': 'english'}


In [31]:
print('Дерево решений (RF) со стеммингом\n')
print(classification_report(twenty_test_full.target, predicted_dt_stem, target_names=categories))
print(gscv_dt_stem.best_params_)

Дерево решений (RF) со стеммингом

                    precision    recall  f1-score   support

    comp.windows.x       0.78      0.43      0.56       395
talk.politics.guns       0.40      0.73      0.52       364
talk.politics.misc       0.39      0.24      0.30       310

          accuracy                           0.48      1069
         macro avg       0.52      0.47      0.46      1069
      weighted avg       0.54      0.48      0.47      1069

{'clf__criterion': 'entropy', 'clf__max_depth': 85, 'tfidf__use_idf': False, 'vect__max_features': 5000, 'vect__stop_words': 'english'}


In [32]:
print('\nK-ближайших соседей (KNN)\n')
print(classification_report(twenty_test_full.target, predicted_knn, target_names=categories))
print(gscv_knn.best_params_)



K-ближайших соседей (KNN)

                    precision    recall  f1-score   support

    comp.windows.x       0.91      0.89      0.90       395
talk.politics.guns       0.54      0.93      0.68       364
talk.politics.misc       0.81      0.14      0.24       310

          accuracy                           0.69      1069
         macro avg       0.76      0.66      0.61      1069
      weighted avg       0.76      0.69      0.64      1069

{'clf__metric': 'euclidean', 'clf__n_neighbors': 9, 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}


In [33]:
print('\nK-ближайших соседей (KNN) со стеммингом\n')
print(classification_report(twenty_test_full.target, predicted_knn_stem, target_names=categories))
print(gscv_knn_stem.best_params_)


K-ближайших соседей (KNN) со стеммингом

                    precision    recall  f1-score   support

    comp.windows.x       0.84      0.92      0.88       395
talk.politics.guns       0.54      0.87      0.66       364
talk.politics.misc       0.77      0.12      0.20       310

          accuracy                           0.67      1069
         macro avg       0.71      0.64      0.58      1069
      weighted avg       0.71      0.67      0.61      1069

{'clf__metric': 'euclidean', 'clf__n_neighbors': 9, 'tfidf__use_idf': True, 'vect__max_features': 5000, 'vect__stop_words': 'english'}


In [34]:
print('\nЛогистическая регрессия (LR)\n')
print(classification_report(twenty_test_full.target, predicted_lr, target_names=categories))
print(gscv_lr.best_params_)


Логистическая регрессия (LR)

                    precision    recall  f1-score   support

    comp.windows.x       0.78      0.97      0.86       395
talk.politics.guns       0.61      0.77      0.68       364
talk.politics.misc       0.73      0.27      0.40       310

          accuracy                           0.70      1069
         macro avg       0.71      0.67      0.65      1069
      weighted avg       0.71      0.70      0.67      1069

{'clf__penalty': 'l2', 'clf__solver': 'newton-cg', 'tfidf__use_idf': False, 'vect__max_features': 5000, 'vect__stop_words': 'english'}


In [35]:
print('\nЛогистическая регрессия (LR) со стеммингом\n')
print(classification_report(twenty_test_full.target, predicted_lr_stem, target_names=categories))
print(gscv_lr_stem.best_params_)


Логистическая регрессия (LR) со стеммингом

                    precision    recall  f1-score   support

    comp.windows.x       0.68      0.98      0.80       395
talk.politics.guns       0.60      0.71      0.65       364
talk.politics.misc       0.76      0.17      0.28       310

          accuracy                           0.65      1069
         macro avg       0.68      0.62      0.58      1069
      weighted avg       0.68      0.65      0.60      1069

{'clf__penalty': 'l2', 'clf__solver': 'newton-cg', 'tfidf__use_idf': True, 'vect__max_features': 5000, 'vect__stop_words': 'english'}


### Сравнительная таблица

In [36]:
import pandas as pd

In [37]:
# Создаем Excel-файл и записываем в него результаты для новых методов
writer = pd.ExcelWriter('result_updated.xlsx', engine='openpyxl')

# Дерево решений (RF) без стемминга
df_dt = pd.DataFrame(classification_report(predicted_dt, twenty_test_full.target, output_dict=True))
df_dt.to_excel(writer, sheet_name='RF без стемминга')

# Дерево решений (RF) со стеммингом
df_dt_stem = pd.DataFrame(classification_report(predicted_dt_stem, twenty_test_full.target, output_dict=True))
df_dt_stem.to_excel(writer, sheet_name='RF со стеммингом')

# K-ближайших соседей (KNN) без стемминга
df_knn = pd.DataFrame(classification_report(predicted_knn, twenty_test_full.target, output_dict=True))
df_knn.to_excel(writer, sheet_name='KNN без стемминга')

# K-ближайших соседей (KNN) со стеммингом
df_knn_stem = pd.DataFrame(classification_report(predicted_knn_stem, twenty_test_full.target, output_dict=True))
df_knn_stem.to_excel(writer, sheet_name='KNN со стеммингом')

# Логистическая регрессия (LR) без стемминга
df_lr = pd.DataFrame(classification_report(predicted_lr, twenty_test_full.target, output_dict=True))
df_lr.to_excel(writer, sheet_name='LR без стемминга')

# Логистическая регрессия (LR) со стеммингом
df_lr_stem = pd.DataFrame(classification_report(predicted_lr_stem, twenty_test_full.target, output_dict=True))
df_lr_stem.to_excel(writer, sheet_name='LR со стеммингом')

# Закрываем запись в Excel-файл
writer.close()
