In [1]:
# Nhập thư viện
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

import warnings
warnings.filterwarnings('ignore')

# Đọc bộ dữ liệu
corpus = pd.read_csv('data/cleaned_mhc.csv')

# Chuẩn bị dữ liệu
X_train, X_test, y_train, y_test = train_test_split(
    corpus['text'],
    corpus['label'],
    test_size=0.2,
    random_state=42
)

tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

n_components = 100

lsa = TruncatedSVD(n_components=n_components, random_state=42)
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

print(f"TF-IDF - Train shape: {X_train_tfidf.shape}, Test Shape: {X_test_tfidf.shape}")
print(f"LSA - Train shape: {X_train_lsa.shape}, Test Shape: {X_test_lsa.shape}")

TF-IDF - Train shape: (18592, 3500), Test Shape: (4648, 3500)
LSA - Train shape: (18592, 100), Test Shape: (4648, 100)


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import time
import pandas as pd

# Customizable parameters for RandomForestClassifier
n_estimators = 75  # Number of trees in the forest
max_depth = 20     # Maximum depth of the trees
min_samples_split = 5  # Minimum number of samples required to split an internal node
min_samples_leaf = 10   # Minimum number of samples required to be at a leaf node
max_features = 'log2'  # Number of features to consider for the best split
bootstrap = True       # Whether bootstrap samples are used when building trees
random_state = 42      # For reproducibility

# Initialize the Random Forest Classifiers
rf_tfidf = RandomForestClassifier(n_estimators=n_estimators, 
                                  max_depth=max_depth, 
                                  min_samples_split=min_samples_split, 
                                  min_samples_leaf=min_samples_leaf,
                                  max_features=max_features, 
                                  bootstrap=bootstrap, 
                                  random_state=random_state)

rf_lsa = RandomForestClassifier(n_estimators=n_estimators, 
                                max_depth=max_depth, 
                                min_samples_split=min_samples_split, 
                                min_samples_leaf=min_samples_leaf,
                                max_features=max_features, 
                                bootstrap=bootstrap, 
                                random_state=random_state)

# Metrics dictionary
metrics = {
    'Model': ['TF-IDF', 'LSA'],
    'Train Accuracy': [],
    'Test Accuracy': [],
    'F1 Score': [],
    'Precision': [],
    'Recall': [],
    'Fit Time (s)': [],
    'Test Time (s)': [],
    'Confusion Matrix': []
}

# Train and test the TF-IDF Random Forest Classifier
start_fit = time.time()
rf_tfidf.fit(X_train_tfidf, y_train)
fit_time = time.time() - start_fit

start_test = time.time()
y_train_pred = rf_tfidf.predict(X_train_tfidf)
y_test_pred = rf_tfidf.predict(X_test_tfidf)
test_time = time.time() - start_test

metrics['Train Accuracy'].append(accuracy_score(y_train, y_train_pred))
metrics['Test Accuracy'].append(accuracy_score(y_test, y_test_pred))
metrics['F1 Score'].append(f1_score(y_test, y_test_pred))
metrics['Precision'].append(precision_score(y_test, y_test_pred))
metrics['Recall'].append(recall_score(y_test, y_test_pred))
metrics['Fit Time (s)'].append(fit_time)
metrics['Test Time (s)'].append(test_time)
metrics['Confusion Matrix'].append(confusion_matrix(y_test, y_test_pred))

# Train and test the LSA Random Forest Classifier
start_fit = time.time()
rf_lsa.fit(X_train_lsa, y_train)
fit_time = time.time() - start_fit

start_test = time.time()
y_train_pred = rf_lsa.predict(X_train_lsa)
y_test_pred = rf_lsa.predict(X_test_lsa)
test_time = time.time() - start_test

metrics['Train Accuracy'].append(accuracy_score(y_train, y_train_pred))
metrics['Test Accuracy'].append(accuracy_score(y_test, y_test_pred))
metrics['F1 Score'].append(f1_score(y_test, y_test_pred))
metrics['Precision'].append(precision_score(y_test, y_test_pred))
metrics['Recall'].append(recall_score(y_test, y_test_pred))
metrics['Fit Time (s)'].append(fit_time)
metrics['Test Time (s)'].append(test_time)
metrics['Confusion Matrix'].append(confusion_matrix(y_test, y_test_pred))

# Create a DataFrame to hold the metrics
results_df = pd.DataFrame(metrics)
print(results_df)

# Print Confusion Matrices for each model
print("\nConfusion Matrices:")
print(f"TF-IDF Confusion Matrix:\n{metrics['Confusion Matrix'][0]}")
print(f"LSA Confusion Matrix:\n{metrics['Confusion Matrix'][1]}")

    Model  Train Accuracy  Test Accuracy  F1 Score  Precision    Recall  \
0  TF-IDF        0.860747       0.853055  0.865525   0.868775  0.862299   
1     LSA        0.940942       0.874139  0.889518   0.857611  0.923892   

   Fit Time (s)  Test Time (s)            Confusion Matrix  
0      0.499524       0.178442  [[1767, 332], [351, 2198]]  
1      6.936599       0.173207  [[1708, 391], [194, 2355]]  

Confusion Matrices:
TF-IDF Confusion Matrix:
[[1767  332]
 [ 351 2198]]
LSA Confusion Matrix:
[[1708  391]
 [ 194 2355]]


In [3]:
# Get feature importances from the trained Random Forest model
feature_importances = rf_tfidf.feature_importances_

# Get the feature names (terms) from the TF-IDF vectorizer
feature_names = tfidf.get_feature_names_out()

# Create a DataFrame to map feature names to their importances
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort the features by importance in descending order and get the top 15
top_15_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(15)

# Print the top 15 features with their importance
print("\nTop 15 Important Features (Terms) for TF-IDF Random Forest Model:")
print(top_15_features)


Top 15 Important Features (Terms) for TF-IDF Random Forest Model:
       Feature  Importance
1559        im    0.025360
3355      want    0.021436
1802      life    0.021300
2023     movie    0.020733
1154      feel    0.020419
2490   redflag    0.020059
843        die    0.016632
450       cant    0.014736
1325     going    0.012514
1155   feeling    0.011028
2996  suicidal    0.010752
101      alone    0.010522
2175      pain    0.009305
1046      even    0.009222
1454      help    0.009190


In [4]:
print("Random Forest TFIDF Hyperparameters:")
print(rf_tfidf.get_params())

print("\nRandom Forest LSA Hyperparameters:")
print(rf_lsa.get_params())

Random Forest TFIDF Hyperparameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 10, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 75, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

Random Forest LSA Hyperparameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 10, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 75, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [5]:
# import matplotlib.pyplot as plt
# from sklearn.tree import plot_tree

# tfidf_feature_names = tfidf.get_feature_names_out()  # Get feature names from the fitted TF-IDF vectorizer
# lsa_feature_names = [f"Topic {i+1}" for i in range(X_train_lsa.shape[1])]  # Placeholder names for LSA topics

# tree_id = 49

# plt.figure(figsize=(20, 10))
# plot_tree(rf_tfidf.estimators_[tree_id], filled=True, feature_names=tfidf_feature_names, class_names=['0', '1'])
# plt.title(f"TF-IDF Random Forest - Tree {tree_id}")
# plt.show()

# plt.figure(figsize=(20, 10))
# plot_tree(rf_lsa.estimators_[tree_id], filled=True, feature_names=lsa_feature_names, class_names=['0', '1'])
# plt.title(f"LSA Random Forest - Tree {tree_id}")
# plt.show()