In [1]:
# Nhập thư viện
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

import warnings
warnings.filterwarnings('ignore')

# Load the dataset from a CSV file at a path relative to the notebook's working directory.
# The following cell reads its 'text' and 'label' columns.
corpus = pd.read_csv('data/cleaned_mhc.csv')

In [2]:
# Prepare the data: hold out 20% of the corpus for testing, then build two
# feature spaces (sparse TF-IDF and a dense LSA projection of it).
RANDOM_STATE = 42  # single seed shared by the split and the SVD for reproducibility

X_train, X_test, y_train, y_test = train_test_split(
    corpus['text'],
    corpus['label'],
    test_size=0.2,
    random_state=RANDOM_STATE
)

# Fit the vocabulary / IDF weights on the training split only, then reuse
# them for the test split so no test information leaks into the features.
tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

n_components = 100

# LSA = truncated SVD applied to the TF-IDF matrix; again fitted on train only.
lsa = TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

print(f"TF-IDF - Train shape: {X_train_tfidf.shape}, Test Shape: {X_test_tfidf.shape}")
print(f"LSA - Train shape: {X_train_lsa.shape}, Test Shape: {X_test_lsa.shape}")

# y_train / y_test are slices of corpus['label'], i.e. already pandas Series,
# so value_counts() can be called on them directly.
print("Class distribution in y_train:")
print(y_train.value_counts())

print("\nClass distribution in y_test:")
print(y_test.value_counts())

TF-IDF - Train shape: (18592, 3500), Test Shape: (4648, 3500)
LSA - Train shape: (18592, 100), Test Shape: (4648, 100)
Class distribution in y_train:
label
1    10099
0     8493
Name: count, dtype: int64

Class distribution in y_test:
label
1    2549
0    2099
Name: count, dtype: int64


In [3]:
# Grid of KNN configurations: neighbourhood sizes, distance metrics, and the
# two feature spaces (each stored as a (train_matrix, test_matrix) pair).
k_values = [1, 5, 10, 50, 100, 200, 500, 1000, 2000, 4000]
metrics = ['euclidean', 'cosine']
datasets = {'TF-IDF': (X_train_tfidf, X_test_tfidf), 'LSA': (X_train_lsa, X_test_lsa)}

result_columns = ['Dataset', 'Distance', 'K', 'Train Accuracy', 'Test Accuracy',
                  'F1 Score', 'Precision', 'Recall', 'Test Time (s)']

# Collect one record per configuration and build the DataFrame once at the
# end: appending with results.loc[len(results)] re-allocates on every row and
# starts from an empty, object-typed frame.
records = []

# The loop variables are deliberately not named X_train / X_test, so the raw
# text split created earlier is not overwritten by feature matrices.
for dataset_name, (train_features, test_features) in datasets.items():
    for metric in metrics:
        for k in k_values:
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
            knn.fit(train_features, y_train)

            # Time only the prediction on the test split. perf_counter() is a
            # monotonic clock intended for measuring short intervals.
            start_time = time.perf_counter()
            y_pred = knn.predict(test_features)
            test_time = time.perf_counter() - start_time

            train_accuracy = knn.score(train_features, y_train)
            test_accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            precision = precision_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='weighted')

            records.append([dataset_name, metric, k, train_accuracy, test_accuracy,
                            f1, precision, recall, test_time])

            print(f"Finished KNN for {dataset_name} with {metric} distance at K={k}")

results = pd.DataFrame(records, columns=result_columns)

# Pick, for every (feature space, distance) pair, the row with the highest F1.
# NOTE: F1 here is computed on the test split, so the metrics reported for the
# selected models are optimistically biased; a validation split or
# cross-validation would be needed for an unbiased estimate.
best_models = results.loc[results.groupby(['Dataset', 'Distance'])['F1 Score'].idxmax()]

for _, row in best_models.iterrows():
    # Refit the winning configuration to obtain its predictions for the
    # confusion matrix (per-configuration predictions are not kept above).
    best_train, best_test = datasets[row['Dataset']]
    best_knn = KNeighborsClassifier(n_neighbors=int(row['K']), metric=row['Distance'])
    best_knn.fit(best_train, y_train)
    y_best_pred = best_knn.predict(best_test)
    confusion = confusion_matrix(y_test, y_best_pred)

    print(f"\nBest model for {row['Dataset']} with {row['Distance']} distance and K={row['K']}:")
    print(f"Train Accuracy: {row['Train Accuracy']:.4f}, Test Accuracy: {row['Test Accuracy']:.4f}")
    print(f"F1 Score: {row['F1 Score']:.4f}, Precision: {row['Precision']:.4f}, Recall: {row['Recall']:.4f}")
    print(f"Test Time: {row['Test Time (s)']:.4f} seconds")
    print("Confusion Matrix:\n", confusion)

Finished KNN for TF-IDF with euclidean distance at K=1
Finished KNN for TF-IDF with euclidean distance at K=5
Finished KNN for TF-IDF with euclidean distance at K=10
Finished KNN for TF-IDF with euclidean distance at K=50
Finished KNN for TF-IDF with euclidean distance at K=100
Finished KNN for TF-IDF with euclidean distance at K=200
Finished KNN for TF-IDF with euclidean distance at K=500
Finished KNN for TF-IDF with euclidean distance at K=1000
Finished KNN for TF-IDF with euclidean distance at K=2000
Finished KNN for TF-IDF with euclidean distance at K=4000
Finished KNN for TF-IDF with cosine distance at K=1
Finished KNN for TF-IDF with cosine distance at K=5
Finished KNN for TF-IDF with cosine distance at K=10
Finished KNN for TF-IDF with cosine distance at K=50
Finished KNN for TF-IDF with cosine distance at K=100
Finished KNN for TF-IDF with cosine distance at K=200
Finished KNN for TF-IDF with cosine distance at K=500
Finished KNN for TF-IDF with cosine distance at K=1000
Finish

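Layout of each confusion matrix printed by the previous cell (rows are the actual class, columns are the predicted class):

$$
\begin{bmatrix}
\text{True Negative (TN): actual 0, predicted 0} & \text{False Positive (FP): actual 0, predicted 1} \\
\text{False Negative (FN): actual 1, predicted 0} & \text{True Positive (TP): actual 1, predicted 1}
\end{bmatrix}
$$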

In [4]:
# Reshape the long-format results into one wide table per feature space:
# the index is K and the columns are (metric name, distance) pairs.
value_columns = ["Train Accuracy", "Test Accuracy", "F1 Score", "Precision", "Recall", "Test Time (s)"]

is_tfidf = results["Dataset"] == "TF-IDF"
tfidf_table = results[is_tfidf].pivot(index="K", columns="Distance", values=value_columns)

is_lsa = results["Dataset"] == "LSA"
lsa_table = results[is_lsa].pivot(index="K", columns="Distance", values=value_columns)

def apply_dual_heatmap(styler):
    """Shade euclidean columns in red and cosine columns in blue.

    Columns are selected by the second element of each column label, so the
    styler is expected to have two-level column labels whose second level is
    the distance name.

    Args:
        styler: object exposing ``columns`` and ``background_gradient``
            (used here with a pandas Styler).

    Returns:
        Whatever the last ``background_gradient`` call returns.
    """
    # Euclidean first, then cosine: same call order as a two-step version.
    palette_by_distance = (('euclidean', 'Reds'), ('cosine', 'Blues'))

    for distance_name, palette in palette_by_distance:
        matching = [label for label in styler.columns if label[1] == distance_name]
        styler = styler.background_gradient(subset=matching, cmap=palette)

    return styler

# Colour each pivot table per distance and show values with 3 decimals.
tfidf_table_styled = apply_dual_heatmap(tfidf_table.style).format(precision=3)
lsa_table_styled = apply_dual_heatmap(lsa_table.style).format(precision=3)

# Render both tables, each preceded by its heading.
for heading, styled_table in (
    ("TF-IDF Table with Dual Heatmap:", tfidf_table_styled),
    ("\nLSA Table with Dual Heatmap:", lsa_table_styled),
):
    print(heading)
    display(styled_table)

TF-IDF Table with Dual Heatmap:


Unnamed: 0_level_0,Train Accuracy,Train Accuracy,Test Accuracy,Test Accuracy,F1 Score,F1 Score,Precision,Precision,Recall,Recall,Test Time (s),Test Time (s)
Distance,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean
K,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,1.0,1.0,0.801,0.459,0.801,0.304,0.806,0.611,0.801,0.459,2.193,2.282
5,0.901,0.807,0.869,0.749,0.869,0.743,0.869,0.807,0.869,0.749,2.767,2.854
10,0.892,0.869,0.873,0.843,0.873,0.843,0.873,0.858,0.873,0.843,2.74,2.818
50,0.884,0.889,0.883,0.888,0.881,0.887,0.889,0.892,0.883,0.888,2.723,2.867
100,0.865,0.87,0.864,0.868,0.861,0.866,0.88,0.882,0.864,0.868,2.697,2.828
200,0.83,0.833,0.831,0.836,0.824,0.829,0.863,0.866,0.831,0.836,2.708,2.862
500,0.764,0.765,0.769,0.771,0.75,0.752,0.833,0.834,0.769,0.771,2.805,2.96
1000,0.707,0.708,0.708,0.709,0.67,0.67,0.807,0.808,0.708,0.709,3.054,3.73
2000,0.656,0.656,0.661,0.661,0.599,0.6,0.79,0.79,0.661,0.661,3.441,4.781
4000,0.597,0.597,0.604,0.604,0.502,0.502,0.77,0.77,0.604,0.604,4.233,4.159



LSA Table with Dual Heatmap:


Unnamed: 0_level_0,Train Accuracy,Train Accuracy,Test Accuracy,Test Accuracy,F1 Score,F1 Score,Precision,Precision,Recall,Recall,Test Time (s),Test Time (s)
Distance,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean
K,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,1.0,1.0,0.824,0.764,0.824,0.762,0.824,0.792,0.824,0.764,0.571,0.359
5,0.9,0.846,0.859,0.79,0.859,0.788,0.86,0.826,0.859,0.79,1.106,0.3
10,0.891,0.8,0.866,0.763,0.865,0.758,0.865,0.822,0.866,0.763,1.191,0.295
50,0.865,0.768,0.86,0.756,0.858,0.749,0.864,0.822,0.86,0.756,1.095,0.337
100,0.856,0.749,0.851,0.742,0.848,0.734,0.86,0.817,0.851,0.742,1.143,0.374
200,0.841,0.726,0.839,0.72,0.834,0.708,0.856,0.809,0.839,0.72,1.157,0.473
500,0.814,0.689,0.813,0.686,0.804,0.667,0.848,0.8,0.813,0.686,1.227,0.808
1000,0.778,0.654,0.78,0.65,0.764,0.619,0.837,0.788,0.78,0.65,1.386,1.177
2000,0.735,0.625,0.738,0.617,0.71,0.575,0.818,0.779,0.738,0.617,1.677,1.571
4000,0.688,0.606,0.69,0.603,0.644,0.554,0.799,0.779,0.69,0.603,2.462,2.218
