In [4]:
# Nhập thư viện
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

import warnings
warnings.filterwarnings('ignore')

# Load the cleaned dataset.
corpus = pd.read_csv('data/cleaned_mhc.csv')

# Prepare the data: hold out 20% of the rows for testing, with a fixed seed so
# the split is reproducible.
texts = corpus['text']
labels = corpus['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, random_state=42, test_size=0.2
)

# TF-IDF features. The vectorizer is fitted on the training texts only and the
# fitted vectorizer is then applied to the test texts.
tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

# LSA features: truncated SVD of the TF-IDF matrix, again fitted on the
# training split only.
n_components = 100
lsa = TruncatedSVD(random_state=42, n_components=n_components)
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

# Report the shape of each representation as a quick sanity check.
tfidf_train_shape, tfidf_test_shape = X_train_tfidf.shape, X_test_tfidf.shape
lsa_train_shape, lsa_test_shape = X_train_lsa.shape, X_test_lsa.shape
print(f"TF-IDF - Train shape: {tfidf_train_shape}, Test Shape: {tfidf_test_shape}")
print(f"LSA - Train shape: {lsa_train_shape}, Test Shape: {lsa_test_shape}")

TF-IDF - Train shape: (18592, 3500), Test Shape: (4648, 3500)
LSA - Train shape: (18592, 100), Test Shape: (4648, 100)


In [20]:
# Train one logistic-regression classifier per text representation (TF-IDF and
# LSA), time fitting and test-set prediction, and collect evaluation metrics
# into `metrics` / `results_df`.
from sklearn.linear_model import LogisticRegression

# NOTE: only `max_iter` is passed to the estimators below. `C` and `penalty`
# are NOT applied, so both models are trained with scikit-learn's default
# regularisation; the names are kept so that any code reading them still works.
C = 0.1
penalty='l2'
max_iter=1000

log_reg_tfidf = LogisticRegression(random_state=42, max_iter=max_iter)
log_reg_lsa = LogisticRegression(random_state=42, max_iter=max_iter)

# One (label, estimator, train features, test features) entry per model, in
# the order the rows appear in the results table.
experiments = [
    ('TF-IDF', log_reg_tfidf, X_train_tfidf, X_test_tfidf),
    ('LSA', log_reg_lsa, X_train_lsa, X_test_lsa),
]

metrics = {
    'Model': [],
    'Train Accuracy': [],
    'Test Accuracy': [],
    'F1 Score': [],
    'Precision': [],
    'Recall': [],
    'Fit Time (s)': [],
    'Test Time (s)': [],
    'Confusion Matrix': []
}

for model_name, estimator, train_features, test_features in experiments:
    # perf_counter is monotonic and high-resolution, unlike the wall clock
    # returned by time.time().
    start_fit = time.perf_counter()
    estimator.fit(train_features, y_train)
    fit_time = time.perf_counter() - start_fit

    # Train-set predictions are needed for train accuracy only; they are kept
    # outside the timed region so 'Test Time (s)' reflects test data alone.
    y_train_pred = estimator.predict(train_features)

    start_test = time.perf_counter()
    y_test_pred = estimator.predict(test_features)
    test_time = time.perf_counter() - start_test

    metrics['Model'].append(model_name)
    metrics['Train Accuracy'].append(accuracy_score(y_train, y_train_pred))
    metrics['Test Accuracy'].append(accuracy_score(y_test, y_test_pred))
    metrics['F1 Score'].append(f1_score(y_test, y_test_pred))
    metrics['Precision'].append(precision_score(y_test, y_test_pred))
    metrics['Recall'].append(recall_score(y_test, y_test_pred))
    metrics['Fit Time (s)'].append(fit_time)
    metrics['Test Time (s)'].append(test_time)
    metrics['Confusion Matrix'].append(confusion_matrix(y_test, y_test_pred))

# One row per model, one column per metric.
results_df = pd.DataFrame(metrics)
print(results_df)

print("\nConfusion Matrices:")
print(f"TF-IDF Confusion Matrix:\n{metrics['Confusion Matrix'][0]}")
print(f"LSA Confusion Matrix:\n{metrics['Confusion Matrix'][1]}")

    Model  Train Accuracy  Test Accuracy  F1 Score  Precision    Recall  \
0  TF-IDF        0.939221       0.922762  0.929539   0.930086  0.928992   
1     LSA        0.910499       0.908133  0.915862   0.920032  0.911730   

   Fit Time (s)  Test Time (s)            Confusion Matrix  
0         0.045          0.002  [[1921, 178], [181, 2368]]  
1         0.019          0.002  [[1897, 202], [225, 2324]]  

Confusion Matrices:
TF-IDF Confusion Matrix:
[[1921  178]
 [ 181 2368]]
LSA Confusion Matrix:
[[1897  202]
 [ 225 2324]]


In [6]:
# Feature names reported by the fitted TF-IDF vectorizer.
feature_names_tfidf = tfidf.get_feature_names_out()

# Number of TF-IDF coefficients to list.
k = 40

# Rank the TF-IDF model's coefficients by absolute value, largest first, and
# pair each selected coefficient with its feature name.
coefficients_tfidf = log_reg_tfidf.coef_[0]
tfidf_ascending = np.argsort(np.abs(coefficients_tfidf))
top_k_tfidf_indices = tfidf_ascending[-k:][::-1]
top_k_tfidf_terms = list(
    zip(
        feature_names_tfidf[top_k_tfidf_indices],
        coefficients_tfidf[top_k_tfidf_indices],
    )
)

print(f"Top {k} coefficients for TF-IDF model:")
for name, weight in top_k_tfidf_terms:
    print(f"{name}: {weight:.4f}")

# Number of LSA coefficients to list.
h = 15

# Same ranking for the LSA model; its features have no names, so each one is
# labelled by its component index.
coefficients_lsa = log_reg_lsa.coef_[0]
lsa_ascending = np.argsort(np.abs(coefficients_lsa))
top_h_lsa_indices = lsa_ascending[-h:][::-1]
top_h_lsa_terms = []
for component in top_h_lsa_indices:
    top_h_lsa_terms.append((f"Topic {component}", coefficients_lsa[component]))

print(f"\nTop {h} coefficients for LSA model:")
for label, weight in top_h_lsa_terms:
    print(f"{label}: {weight:.4f}")

Top 40 coefficients for TF-IDF model:
redflag: 8.2986
kill: 7.5121
film: -6.0625
suicidal: 5.8278
life: 5.7273
die: 5.5285
movie: -5.3161
cannot: 4.5489
anymore: 4.3494
depression: 4.2671
killing: 3.9049
feel: 3.9035
want: 3.8964
pill: 3.5690
end: 3.4158
living: 3.3890
depressed: 3.3631
alive: 3.3393
crush: -3.3278
yall: -3.1375
tried: 3.1137
job: 3.0395
alone: 2.9842
live: 2.9030
worse: 2.8354
family: 2.8202
tired: 2.8084
pain: 2.7970
character: -2.7529
kinda: -2.7526
thought: 2.7406
guy: -2.7318
death: 2.7156
que: 2.6913
point: 2.6599
nothing: 2.6406
cant: 2.5813
hospital: 2.5623
done: 2.5501
everything: 2.5449

Top 15 coefficients for LSA model:
Topic 4: -16.3353
Topic 0: 15.0369
Topic 1: -14.6454
Topic 2: -12.1817
Topic 6: -9.0092
Topic 12: 6.7771
Topic 8: -5.9071
Topic 46: 4.8471
Topic 25: -4.3797
Topic 22: -4.0493
Topic 43: -3.5501
Topic 20: 3.2172
Topic 64: 3.1986
Topic 21: -3.0168
Topic 41: -2.8932


In [7]:
# Print the full parameter dictionary of each fitted classifier, each preceded
# by its heading (the second heading starts with a blank line).
for heading, estimator in (
    ("LogReg TFIDF Hyperparameters:", log_reg_tfidf),
    ("\nLogReg LSA Hyperparameters:", log_reg_lsa),
):
    print(heading)
    print(estimator.get_params())

LogReg TFIDF Hyperparameters:
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 5000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

LogReg LSA Hyperparameters:
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 5000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
