In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn warnings raised by the
# metric calls in later cells - confirm that silencing them is intended.
warnings.filterwarnings('ignore')

# Load the dataset; later cells read its 'text' and 'label' columns.
corpus = pd.read_csv('data/cleaned_mhc.csv')

In [2]:
# Prepare the data: hold out 20% of the rows for testing, then build two
# feature representations (TF-IDF and its LSA projection). Both transformers
# are fitted on the training split only and then applied to the test split.
n_components = 100

texts, labels = corpus['text'], corpus['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, random_state=42, test_size=0.2
)

# TF-IDF with the vocabulary capped at 3500 features.
tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# LSA: truncated SVD applied to the TF-IDF matrices.
lsa = TruncatedSVD(random_state=42, n_components=n_components)
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

# Report the shape of each representation.
for feature_label, train_matrix, test_matrix in (
    ("TF-IDF", X_train_tfidf, X_test_tfidf),
    ("LSA", X_train_lsa, X_test_lsa),
):
    print(f"{feature_label} - Train shape: {train_matrix.shape}, Test Shape: {test_matrix.shape}")

TF-IDF - Train shape: (18592, 3500), Test Shape: (4648, 3500)
LSA - Train shape: (18592, 100), Test Shape: (4648, 100)


In [3]:
# Grid-search KNN over both feature sets, both distance metrics and every K,
# then report the best K (by weighted F1) for each (dataset, distance) pair.
k_values = [1, 5, 10, 50, 100, 200, 500, 1000, 2000, 4000]
metrics = ['euclidean', 'cosine']
datasets = {'TF-IDF': (X_train_tfidf, X_test_tfidf), 'LSA': (X_train_lsa, X_test_lsa)}

result_columns = ['Dataset', 'Distance', 'K', 'Train Accuracy', 'Test Accuracy',
                  'F1 Score', 'Precision', 'Recall', 'Test Time (s)']

# Rows are collected in a plain list and converted to a DataFrame once at the
# end, instead of enlarging an empty frame one row at a time.
result_rows = []

# Test-set predictions per (dataset, distance, K), kept so the confusion
# matrices below do not need a second pass over the test set.
test_predictions = {}

# The loop targets deliberately avoid the names X_train / X_test so the
# raw-text split created in the data-preparation cell is not overwritten.
for dataset_name, (train_features, test_features) in datasets.items():
    for metric in metrics:
        for k in k_values:
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
            knn.fit(train_features, y_train)

            # perf_counter is monotonic and high-resolution, so the measured
            # interval is not affected by system clock adjustments.
            start_time = time.perf_counter()
            y_pred = knn.predict(test_features)
            test_time = time.perf_counter() - start_time

            train_accuracy = knn.score(train_features, y_train)
            test_accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            precision = precision_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='weighted')

            result_rows.append([dataset_name, metric, k, train_accuracy, test_accuracy,
                                f1, precision, recall, test_time])
            test_predictions[(dataset_name, metric, k)] = y_pred

            print(f"Finished KNN for {dataset_name} with {metric} distance at K={k}")

results = pd.DataFrame(result_rows, columns=result_columns)

# One row per (Dataset, Distance): the configuration with the highest F1 score.
best_models = results.loc[results.groupby(['Dataset', 'Distance'])['F1 Score'].idxmax()]

for _, row in best_models.iterrows():
    best_k = int(row['K'])

    # Refit so `best_knn` stays defined for any later cell that uses it; the
    # predictions themselves are reused from the sweep above.
    best_knn = KNeighborsClassifier(n_neighbors=best_k, metric=row['Distance'])
    best_knn.fit(datasets[row['Dataset']][0], y_train)
    y_best_pred = test_predictions[(row['Dataset'], row['Distance'], best_k)]
    confusion = confusion_matrix(y_test, y_best_pred)

    print(f"\nBest model for {row['Dataset']} with {row['Distance']} distance and K={best_k}:")
    print(f"Train Accuracy: {row['Train Accuracy']:.4f}, Test Accuracy: {row['Test Accuracy']:.4f}")
    print(f"F1 Score: {row['F1 Score']:.4f}, Precision: {row['Precision']:.4f}, Recall: {row['Recall']:.4f}")
    print(f"Test Time: {row['Test Time (s)']:.4f} seconds")
    print("Confusion Matrix:\n", confusion)

Finished KNN for TF-IDF with euclidean distance at K=1
Finished KNN for TF-IDF with euclidean distance at K=5
Finished KNN for TF-IDF with euclidean distance at K=10
Finished KNN for TF-IDF with euclidean distance at K=50
Finished KNN for TF-IDF with euclidean distance at K=100
Finished KNN for TF-IDF with euclidean distance at K=200
Finished KNN for TF-IDF with euclidean distance at K=500
Finished KNN for TF-IDF with euclidean distance at K=1000
Finished KNN for TF-IDF with euclidean distance at K=2000
Finished KNN for TF-IDF with euclidean distance at K=4000
Finished KNN for TF-IDF with cosine distance at K=1
Finished KNN for TF-IDF with cosine distance at K=5
Finished KNN for TF-IDF with cosine distance at K=10
Finished KNN for TF-IDF with cosine distance at K=50
Finished KNN for TF-IDF with cosine distance at K=100
Finished KNN for TF-IDF with cosine distance at K=200
Finished KNN for TF-IDF with cosine distance at K=500
Finished KNN for TF-IDF with cosine distance at K=1000
Finish

$$
\begin{bmatrix}
\text{True Negative (TN): actual 0, predicted 0} & \text{False Positive (FP): actual 0, predicted 1} \\
\text{False Negative (FN): actual 1, predicted 0} & \text{True Positive (TP): actual 1, predicted 1}
\end{bmatrix}
$$

In [4]:
# Reshape the flat results into one table per feature set: rows are K,
# columns are (metric name, distance) pairs.
metric_columns = [
    "Train Accuracy", "Test Accuracy", "F1 Score", "Precision", "Recall", "Test Time (s)"
]

tfidf_table, lsa_table = (
    results[results["Dataset"] == feature_set].pivot(
        index="K", columns="Distance", values=metric_columns
    )
    for feature_set in ("TF-IDF", "LSA")
)

def apply_dual_heatmap(styler):
    """Shade the euclidean columns in red and the cosine columns in blue.

    Columns are selected by comparing the second element of each column key
    with the distance name, so two-level column keys of the form
    (metric name, distance) are expected.

    Parameters
    ----------
    styler
        Object exposing ``columns`` and ``background_gradient(subset=, cmap=)``.

    Returns
    -------
    The value returned by the last ``background_gradient`` call.
    """
    colormap_by_distance = (('euclidean', 'Reds'), ('cosine', 'Blues'))

    for distance, colormap in colormap_by_distance:
        matching_columns = [column for column in styler.columns if column[1] == distance]
        styler = styler.background_gradient(subset=matching_columns, cmap=colormap)

    return styler

# Style both tables with the two-colour heatmap, then render them.
tfidf_table_styled = apply_dual_heatmap(tfidf_table.style)
lsa_table_styled = apply_dual_heatmap(lsa_table.style)

for heading, styled_table in (
    ("TF-IDF Table with Dual Heatmap:", tfidf_table_styled),
    ("\nLSA Table with Dual Heatmap:", lsa_table_styled),
):
    print(heading)
    display(styled_table)

TF-IDF Table with Dual Heatmap:


Unnamed: 0_level_0,Train Accuracy,Train Accuracy,Test Accuracy,Test Accuracy,F1 Score,F1 Score,Precision,Precision,Recall,Recall,Test Time (s),Test Time (s)
Distance,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean
K,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,1.0,1.0,0.800559,0.459122,0.801027,0.304073,0.805585,0.610986,0.800559,0.459122,2.120452,2.219282
5,0.900602,0.807336,0.868546,0.748709,0.868586,0.74298,0.868644,0.806777,0.868546,0.748709,2.657799,2.732161
10,0.892265,0.869083,0.873064,0.842513,0.873174,0.842557,0.873455,0.857553,0.873064,0.842513,2.705641,2.733357
50,0.884359,0.8892,0.88253,0.888339,0.881128,0.887388,0.888725,0.891984,0.88253,0.888339,2.644774,2.743193
100,0.865372,0.870321,0.864028,0.86833,0.860789,0.86557,0.880447,0.88218,0.864028,0.86833,2.674501,2.770694
200,0.830142,0.832993,0.830895,0.835628,0.823788,0.829104,0.863315,0.866114,0.830895,0.835628,2.700744,2.800691
500,0.763662,0.764899,0.768503,0.770654,0.74953,0.752188,0.833144,0.834264,0.768503,0.770654,2.874855,2.88868
1000,0.707025,0.707778,0.708477,0.708692,0.670073,0.670374,0.807429,0.807524,0.708477,0.708692,3.105952,3.045009
2000,0.655927,0.656411,0.660929,0.661145,0.599229,0.599568,0.789646,0.78973,0.660929,0.661145,3.457414,3.355456
4000,0.596708,0.596762,0.604131,0.604131,0.502171,0.502171,0.770091,0.770091,0.604131,0.604131,4.129323,4.020889



LSA Table with Dual Heatmap:


Unnamed: 0_level_0,Train Accuracy,Train Accuracy,Test Accuracy,Test Accuracy,F1 Score,F1 Score,Precision,Precision,Recall,Recall,Test Time (s),Test Time (s)
Distance,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean,cosine,euclidean
K,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,1.0,1.0,0.823795,0.763769,0.823664,0.762289,0.823616,0.79222,0.823795,0.763769,0.600222,0.360929
5,0.899796,0.845633,0.859079,0.789802,0.858507,0.787894,0.85964,0.825641,0.859079,0.789802,1.170573,0.286204
10,0.890544,0.799753,0.865534,0.763339,0.865293,0.758172,0.865484,0.82168,0.865534,0.763339,1.126923,0.307014
50,0.865103,0.767534,0.85994,0.755594,0.858463,0.749101,0.864306,0.821694,0.85994,0.755594,1.137395,0.347096
100,0.856497,0.749408,0.850688,0.74247,0.848109,0.734058,0.86005,0.817499,0.850688,0.74247,1.169974,0.379806
200,0.841168,0.726334,0.83864,0.71988,0.834333,0.707682,0.85582,0.809401,0.83864,0.71988,1.202811,0.484833
500,0.813845,0.689221,0.812608,0.686317,0.803718,0.666536,0.84808,0.799599,0.812608,0.686317,1.299135,0.786733
1000,0.777861,0.653991,0.78012,0.649527,0.764248,0.618993,0.836615,0.787887,0.78012,0.649527,1.452899,1.174801
2000,0.735316,0.624731,0.737737,0.61747,0.710367,0.574663,0.817644,0.778938,0.737737,0.61747,1.787547,1.58897
4000,0.687715,0.606175,0.689974,0.603485,0.643649,0.553581,0.799382,0.779225,0.689974,0.603485,2.472778,2.256045
