In [1]:
# Nhập thư viện
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

import warnings
warnings.filterwarnings('ignore')

# Đọc bộ dữ liệu
corpus = pd.read_csv('data/cleaned_mhc.csv')

# Chuẩn bị dữ liệu
X_train, X_test, y_train, y_test = train_test_split(
    corpus['text'],
    corpus['label'],
    test_size=0.2,
    random_state=42
)

tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

n_components = 100

lsa = TruncatedSVD(n_components=n_components, random_state=42)
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

print(f"TF-IDF - Train shape: {X_train_tfidf.shape}, Test Shape: {X_test_tfidf.shape}")
print(f"LSA - Train shape: {X_train_lsa.shape}, Test Shape: {X_test_lsa.shape}")

TF-IDF - Train shape: (18592, 3500), Test Shape: (4648, 3500)
LSA - Train shape: (18592, 100), Test Shape: (4648, 100)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

C = 10
penalty='l2'
max_iter=5000

log_reg_tfidf = LogisticRegression(penalty=penalty, C=C, random_state=42, max_iter=max_iter)
log_reg_lsa = LogisticRegression(penalty=penalty, C=C, random_state=42, max_iter=max_iter)

metrics = {
    'Model': ['TF-IDF', 'LSA'],
    'Train Accuracy': [],
    'Test Accuracy': [],
    'F1 Score': [],
    'Precision': [],
    'Recall': [],
    'Fit Time (s)': [],
    'Test Time (s)': [],
    'Confusion Matrix': []
}

start_fit = time.time()
log_reg_tfidf.fit(X_train_tfidf, y_train)
fit_time = time.time() - start_fit

start_test = time.time()
y_train_pred = log_reg_tfidf.predict(X_train_tfidf)
y_test_pred = log_reg_tfidf.predict(X_test_tfidf)
test_time = time.time() - start_test

metrics['Train Accuracy'].append(accuracy_score(y_train, y_train_pred))
metrics['Test Accuracy'].append(accuracy_score(y_test, y_test_pred))
metrics['F1 Score'].append(f1_score(y_test, y_test_pred))
metrics['Precision'].append(precision_score(y_test, y_test_pred))
metrics['Recall'].append(recall_score(y_test, y_test_pred))
metrics['Fit Time (s)'].append(fit_time)
metrics['Test Time (s)'].append(test_time)
metrics['Confusion Matrix'].append(confusion_matrix(y_test, y_test_pred))

start_fit = time.time()
log_reg_lsa.fit(X_train_lsa, y_train)
fit_time = time.time() - start_fit

start_test = time.time()
y_train_pred = log_reg_lsa.predict(X_train_lsa)
y_test_pred = log_reg_lsa.predict(X_test_lsa)
test_time = time.time() - start_test

metrics['Train Accuracy'].append(accuracy_score(y_train, y_train_pred))
metrics['Test Accuracy'].append(accuracy_score(y_test, y_test_pred))
metrics['F1 Score'].append(f1_score(y_test, y_test_pred))
metrics['Precision'].append(precision_score(y_test, y_test_pred))
metrics['Recall'].append(recall_score(y_test, y_test_pred))
metrics['Fit Time (s)'].append(fit_time)
metrics['Test Time (s)'].append(test_time)
metrics['Confusion Matrix'].append(confusion_matrix(y_test, y_test_pred))

import pandas as pd

results_df = pd.DataFrame(metrics)
print(results_df)

print("\nConfusion Matrices:")
print(f"TF-IDF Confusion Matrix:\n{metrics['Confusion Matrix'][0]}")
print(f"LSA Confusion Matrix:\n{metrics['Confusion Matrix'][1]}")

    Model  Train Accuracy  Test Accuracy  F1 Score  Precision    Recall  \
0  TF-IDF        0.960682       0.920611  0.927377   0.930490  0.924284   
1     LSA        0.912328       0.910499  0.918046   0.922042  0.914084   

   Fit Time (s)  Test Time (s)            Confusion Matrix  
0      0.049736          0.001  [[1923, 176], [193, 2356]]  
1      0.013000          0.002  [[1902, 197], [219, 2330]]  

Confusion Matrices:
TF-IDF Confusion Matrix:
[[1923  176]
 [ 193 2356]]
LSA Confusion Matrix:
[[1902  197]
 [ 219 2330]]


In [5]:
feature_names_tfidf = tfidf.get_feature_names_out()

k = 40

coefficients_tfidf = log_reg_tfidf.coef_[0]
top_k_tfidf_indices = np.argsort(np.abs(coefficients_tfidf))[-k:][::-1]
top_k_tfidf_terms = [(feature_names_tfidf[i], coefficients_tfidf[i]) for i in top_k_tfidf_indices]

print(f"Top {k} coefficients for TF-IDF model:")
for term, coef in top_k_tfidf_terms:
    print(f"{term}: {coef:.4f}")

h = 15

coefficients_lsa = log_reg_lsa.coef_[0]
top_h_lsa_indices = np.argsort(np.abs(coefficients_lsa))[-h:][::-1]
top_h_lsa_terms = [(f"Topic {i}", coefficients_lsa[i]) for i in top_h_lsa_indices]

print(f"\nTop {h} coefficients for LSA model:")
for topic, coef in top_h_lsa_terms:
    print(f"{topic}: {coef:.4f}")

Top 40 coefficients for TF-IDF model:
redflag: 13.5501
kill: 12.7129
film: -12.4743
suicidal: 10.0378
killing: 8.7361
movie: -8.6326
die: 8.1371
cannot: 7.7279
pill: 7.2072
life: 7.1283
yall: -6.8403
alive: 6.8078
suicide: 6.7607
depression: 6.5942
minecraft: -6.5197
tldr: -6.4792
bruh: -6.4180
crush: -6.3409
medication: 6.2080
rope: 6.2011
covid: -6.1510
anymore: 6.0792
miserable: 6.0275
kinda: -6.0032
living: 5.9308
gotta: -5.9059
jump: 5.8981
teenager: -5.7771
myself: 5.5877
debt: 5.5788
ending: 5.5342
falling: 5.5098
depressed: 5.4928
meme: -5.4091
planning: 5.3554
award: -5.3143
scream: 5.1572
worse: 5.1553
que: 5.1341
job: 5.0729

Top 15 coefficients for LSA model:
Topic 4: -22.6940
Topic 1: -21.4265
Topic 0: 18.1120
Topic 2: -17.4899
Topic 6: -12.3280
Topic 12: 9.7850
Topic 46: 8.9137
Topic 8: -8.2117
Topic 25: -6.7750
Topic 22: -6.7008
Topic 43: -6.4024
Topic 20: 5.3073
Topic 64: 5.0967
Topic 41: -5.0431
Topic 71: 4.6989


In [4]:
print("LogReg TFIDF Hyperparameters:")
print(log_reg_tfidf.get_params())

print("\nLogReg LSA Hyperparameters:")
print(log_reg_lsa.get_params())

LogReg TFIDF Hyperparameters:
{'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

LogReg LSA Hyperparameters:
{'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
