In [1]:
# Nhập thư viện
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

import warnings
warnings.filterwarnings('ignore')

# Đọc bộ dữ liệu
corpus = pd.read_csv('data/cleaned_mhc.csv')

# Chuẩn bị dữ liệu
X_train, X_test, y_train, y_test = train_test_split(
    corpus['text'],
    corpus['label'],
    test_size=0.2,
    random_state=42
)

tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

n_components = 100

lsa = TruncatedSVD(n_components=n_components, random_state=42)
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

print(f"TF-IDF - Train shape: {X_train_tfidf.shape}, Test Shape: {X_test_tfidf.shape}")
print(f"LSA - Train shape: {X_train_lsa.shape}, Test Shape: {X_test_lsa.shape}")

TF-IDF - Train shape: (18592, 3500), Test Shape: (4648, 3500)
LSA - Train shape: (18592, 100), Test Shape: (4648, 100)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import time
import pandas as pd

# Customizable parameters for RandomForestClassifier
n_estimators = 50  # Number of trees in the forest
max_depth = None     # Maximum depth of the trees
min_samples_split = 2  # Minimum number of samples required to split an internal node
min_samples_leaf = 1   # Minimum number of samples required to be at a leaf node
max_features = 'log2'  # Number of features to consider for the best split
bootstrap = True       # Whether bootstrap samples are used when building trees
random_state = 42      # For reproducibility

# Initialize the Random Forest Classifiers
rf_tfidf = RandomForestClassifier(n_estimators=n_estimators, 
                                  max_depth=max_depth, 
                                  min_samples_split=min_samples_split, 
                                  min_samples_leaf=min_samples_leaf,
                                  max_features=max_features, 
                                  bootstrap=bootstrap, 
                                  random_state=random_state)

rf_lsa = RandomForestClassifier(n_estimators=n_estimators, 
                                max_depth=max_depth, 
                                min_samples_split=min_samples_split, 
                                min_samples_leaf=min_samples_leaf,
                                max_features=max_features, 
                                bootstrap=bootstrap, 
                                random_state=random_state)

# Metrics dictionary
metrics = {
    'Model': ['TF-IDF', 'LSA'],
    'Train Accuracy': [],
    'Test Accuracy': [],
    'F1 Score': [],
    'Precision': [],
    'Recall': [],
    'Fit Time (s)': [],
    'Test Time (s)': [],
    'Confusion Matrix': []
}

# Train and test the TF-IDF Random Forest Classifier
start_fit = time.time()
rf_tfidf.fit(X_train_tfidf, y_train)
fit_time = time.time() - start_fit

start_test = time.time()
y_train_pred = rf_tfidf.predict(X_train_tfidf)
y_test_pred = rf_tfidf.predict(X_test_tfidf)
test_time = time.time() - start_test

metrics['Train Accuracy'].append(accuracy_score(y_train, y_train_pred))
metrics['Test Accuracy'].append(accuracy_score(y_test, y_test_pred))
metrics['F1 Score'].append(f1_score(y_test, y_test_pred))
metrics['Precision'].append(precision_score(y_test, y_test_pred))
metrics['Recall'].append(recall_score(y_test, y_test_pred))
metrics['Fit Time (s)'].append(fit_time)
metrics['Test Time (s)'].append(test_time)
metrics['Confusion Matrix'].append(confusion_matrix(y_test, y_test_pred))

# Train and test the LSA Random Forest Classifier
start_fit = time.time()
rf_lsa.fit(X_train_lsa, y_train)
fit_time = time.time() - start_fit

start_test = time.time()
y_train_pred = rf_lsa.predict(X_train_lsa)
y_test_pred = rf_lsa.predict(X_test_lsa)
test_time = time.time() - start_test

metrics['Train Accuracy'].append(accuracy_score(y_train, y_train_pred))
metrics['Test Accuracy'].append(accuracy_score(y_test, y_test_pred))
metrics['F1 Score'].append(f1_score(y_test, y_test_pred))
metrics['Precision'].append(precision_score(y_test, y_test_pred))
metrics['Recall'].append(recall_score(y_test, y_test_pred))
metrics['Fit Time (s)'].append(fit_time)
metrics['Test Time (s)'].append(test_time)
metrics['Confusion Matrix'].append(confusion_matrix(y_test, y_test_pred))

# Create a DataFrame to hold the metrics
results_df = pd.DataFrame(metrics)
print(results_df)

# Print Confusion Matrices for each model
print("\nConfusion Matrices:")
print(f"TF-IDF Confusion Matrix:\n{metrics['Confusion Matrix'][0]}")
print(f"LSA Confusion Matrix:\n{metrics['Confusion Matrix'][1]}")

    Model  Train Accuracy  Test Accuracy  F1 Score  Precision    Recall  \
0  TF-IDF        1.000000       0.893072  0.903024   0.898292  0.907807   
1     LSA        0.999946       0.873709  0.888509   0.861193  0.917615   

   Fit Time (s)  Test Time (s)            Confusion Matrix  
0      6.193930       0.311163  [[1837, 262], [235, 2314]]  
1      5.669841       0.143617  [[1722, 377], [210, 2339]]  

Confusion Matrices:
TF-IDF Confusion Matrix:
[[1837  262]
 [ 235 2314]]
LSA Confusion Matrix:
[[1722  377]
 [ 210 2339]]


In [10]:
print("Random Forest TFIDF Hyperparameters:")
print(rf_tfidf.get_params())

print("\nRandom Forest LSA Hyperparameters:")
print(rf_lsa.get_params())

Random Forest TFIDF Hyperparameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

Random Forest LSA Hyperparameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
