In [2]:
import os
# These environment variables are set before the heavier imports below.
# NOTE(review): presumably they quiet TensorFlow's native logging and the
# Hugging Face tokenizers fork warning; neither library is imported in the
# visible part of this notebook — confirm they are still needed.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import time
import tqdm

import matplotlib
# Select the non-interactive Agg backend before pyplot is imported; figures in
# this notebook are written to disk with savefig rather than shown on screen.
matplotlib.use('Agg')

import numpy as np
import pandas as pd
# Registers tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.tqdm.pandas()

import matplotlib.pyplot as plt
import fasttext

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from mlxtend.plotting import plot_confusion_matrix

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

from sklearn.linear_model import LogisticRegression

import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence these warning categories for the whole notebook.
# NOTE(review): filtering ConvergenceWarning also hides a solver that stops
# before converging, so a non-converged LogisticRegression fit would be silent.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)


In [4]:
# Load only the cleaned article body and its class label, drop rows where
# either value is missing, and renumber the index from 0.
# NOTE(review): DATA_PATH is an absolute, machine-specific location; the
# notebook will not run elsewhere without editing it.
DATA_PATH = "/mnt/d/work2/turkish-news-classification/data/cleaned.csv"
df = (
    pd.read_csv(DATA_PATH, usecols=["Haber Gövdesi Cleaned", "Sınıf"])
    .dropna()
    .reset_index(drop=True)
)

In [5]:
# Model input is the cleaned article text; the target is the class column.
X, y = df["Haber Gövdesi Cleaned"], df["Sınıf"]

In [6]:
# Hold out 10% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [7]:
# Learn the label -> integer mapping from the training labels only, then apply
# that mapping to both splits. le.classes_ keeps the original class names.
# NOTE(review): y_train / y_test are overwritten here, so re-running this cell
# on its own would refit the encoder on already-encoded integers and lose the
# class names — re-run the split cell first.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [8]:
# Bag-of-words counts: the vocabulary is built from the training text only and
# the same fitted vectorizer is reused to encode the test text.
cv = CountVectorizer()
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)

In [9]:
# Fit a logistic-regression classifier (library defaults) on the bag-of-words
# counts and report how long fitting took, in seconds.
# NOTE(review): the estimator is a LogisticRegression despite the `rf` name;
# the name is kept because other cells reference it.
# NOTE(review): ConvergenceWarning is filtered in the imports cell, so a fit
# that hits the iteration cap would go unnoticed — compare rf.n_iter_ with
# rf.max_iter after fitting to confirm the solver converged.
rf = LogisticRegression()
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump if the system clock is
# adjusted during a long fit.
rf_cv_train_start = time.perf_counter()
rf.fit(X_train_cv, y_train)
rf_cv_train_time = time.perf_counter() - rf_cv_train_start
print(f"CV + Logistic Regression Train Time = {rf_cv_train_time:.4f}")

CV + Logistic Regression Train Time = 443.5058


In [10]:
# Predict on both splits. Only the test-set prediction is timed; the train-set
# prediction is computed first and is outside the timed region.
rf_cv_pred_train = rf.predict(X_train_cv)
# perf_counter() is monotonic and high-resolution, which matters for a
# sub-second measurement; time.time() is wall-clock and may be adjusted.
rf_cv_test_start = time.perf_counter()
rf_cv_pred_test = rf.predict(X_test_cv)
rf_cv_test_time = time.perf_counter() - rf_cv_test_start

In [11]:
# Accuracy on the train and test splits, plus the test-set inference time.
# Arguments follow the documented (y_true, y_pred) order, matching the other
# metric calls in this notebook. Accuracy is symmetric in its two arguments, so
# the reported numbers are unchanged.
rf_cv_train_score = accuracy_score(y_train, rf_cv_pred_train)
rf_cv_test_score = accuracy_score(y_test, rf_cv_pred_test)
print(f"CV + Logistic Regression Train Score = {rf_cv_train_score * 100:.4f}%")
print(f"CV + Logistic Regression Test Score = {rf_cv_test_score * 100:.4f}%")
print(f"CV + Logistic Regression Test Time = {rf_cv_test_time:.4f}")

CV + Logistic Regression Train Score = 94.8085%
CV + Logistic Regression Test Score = 92.0756%
CV + Logistic Regression Test Time = 0.2033


In [12]:
# Test-set metrics. Precision, recall and F1 use macro averaging, i.e. the
# unweighted mean of the per-class scores.
rf_cv_accuracy_score = accuracy_score(y_test, rf_cv_pred_test)
rf_cv_precision_score = precision_score(y_test, rf_cv_pred_test, average="macro")
rf_cv_recall_score = recall_score(y_test, rf_cv_pred_test, average="macro")
rf_cv_f1_score = f1_score(y_test, rf_cv_pred_test, average="macro")

# One print call, one metric per line, each shown as a percentage.
print(
    f"CV + Logistic Regression Precision Score = {rf_cv_precision_score * 100:.4f}%",
    f"CV + Logistic Regression F1 Score = {rf_cv_f1_score * 100:.4f}%",
    f"CV + Logistic Regression Recall Score = {rf_cv_recall_score * 100:.4f}%",
    f"CV + Logistic Regression Accuracy Score = {rf_cv_accuracy_score * 100:.4f}%",
    sep="\n",
)

CV + Logistic Regression Precision Score = 87.4707%
CV + Logistic Regression F1 Score = 86.2169%
CV + Logistic Regression Recall Score = 85.0855%
CV + Logistic Regression Accuracy Score = 92.0756%


In [13]:
# Per-class precision / recall / F1 on the test set, with rows labelled by the
# original class names held in the fitted label encoder.
rf_cv_report = classification_report(y_test, rf_cv_pred_test, target_names=le.classes_)
print(rf_cv_report)

                 precision    recall  f1-score   support

Bilim-Teknoloji       0.85      0.84      0.85      3051
 Finans-Ekonomi       0.84      0.81      0.82      3668
   Kültür-Sanat       0.82      0.77      0.79      4663
        Magazin       0.95      0.96      0.95     31592
         Sağlık       0.94      0.92      0.93      4786
        Siyaset       0.90      0.94      0.92     14726
           Spor       0.97      0.97      0.97     13144
         Turizm       0.85      0.81      0.83      2966
          Çevre       0.74      0.65      0.69      1561

       accuracy                           0.92     80157
      macro avg       0.87      0.85      0.86     80157
   weighted avg       0.92      0.92      0.92     80157



In [14]:
# Confusion matrix of the test-set predictions, drawn with both absolute counts
# and row-normalised fractions, then written to disk as a PNG.
# NOTE(review): the `mnb_` prefix is a misnomer — these are the logistic
# regression predictions; the name is kept in case other cells reference it.
mnb_cv_cm = confusion_matrix(y_test, rf_cv_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=mnb_cv_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    class_names=le.classes_,
    figsize=(10, 10),
)
# savefig does not create missing directories, so make sure the target exists;
# otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("./output", exist_ok=True)
# Save through the figure handle so the file is tied to this figure and not to
# whatever pyplot currently considers the active figure.
fig.savefig("./output/lr_cv.png")