In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

file_path = 'G:\projectMachineLearningTest\data\cleaned_mhc.csv'
data = pd.read_csv(file_path)

tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_data = tfidf_vectorizer.fit_transform(data['text'])

y = data['label']
X_train, X_test, y_train, y_test = train_test_split(tfidf_data, y, test_size=0.2, random_state=42)

### 1. Random Forest với các cấu hình khác nhau ###
print("### Random Forest ###")

rf_params_variants = [
    {'n_estimators': 10, 'max_depth': 5, 'max_features': 10, 'min_samples_split': 5, 'min_samples_leaf': 2},
    {'n_estimators': 50, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 1},
    {'n_estimators': 100, 'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 10, 'min_samples_leaf': 4},
    {'n_estimators': 200, 'max_depth': None, 'max_features': None, 'min_samples_split': 2, 'min_samples_leaf': 1}
]

for params in rf_params_variants:
    rf_model = RandomForestClassifier(**params, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    print(f"\nCấu hình: {params}")
    print(classification_report(y_test, y_pred))

### Random Forest ###

Cấu hình: {'n_estimators': 10, 'max_depth': 5, 'max_features': 10, 'min_samples_split': 5, 'min_samples_leaf': 2}
              precision    recall  f1-score   support

           0       0.80      0.72      0.76      2099
           1       0.79      0.85      0.82      2549

    accuracy                           0.79      4648
   macro avg       0.79      0.79      0.79      4648
weighted avg       0.79      0.79      0.79      4648


Cấu hình: {'n_estimators': 50, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 1}
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      2099
           1       0.88      0.91      0.90      2549

    accuracy                           0.89      4648
   macro avg       0.89      0.88      0.88      4648
weighted avg       0.89      0.89      0.89      4648


Cấu hình: {'n_estimators': 100, 'max_depth': 10, 'max_features': 'log2', 'min_samples_

In [4]:
### 2. SVM với các kernel và cấu hình khác nhau ###
print("\n### SVM ###")

svm_params_variants = [
    {'C': 0.1, 'kernel': 'linear'},
    {'C': 1, 'kernel': 'linear'},
    {'C': 1, 'kernel': 'poly', 'degree': 2},
    {'C': 10, 'kernel': 'poly', 'degree': 3},
    {'C': 1, 'kernel': 'rbf', 'gamma': 1},
    {'C': 10, 'kernel': 'rbf', 'gamma': 0.1},
    {'C': 1, 'kernel': 'sigmoid'}
]

for params in svm_params_variants:
    svm_model = SVC(**params, random_state=42)
    svm_model.fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)
    print(f"\nCấu hình: {params}")
    print(classification_report(y_test, y_pred))


### SVM ###

Cấu hình: {'C': 0.1, 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      2099
           1       0.92      0.91      0.92      2549

    accuracy                           0.91      4648
   macro avg       0.91      0.91      0.91      4648
weighted avg       0.91      0.91      0.91      4648


Cấu hình: {'C': 1, 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      2099
           1       0.92      0.92      0.92      2549

    accuracy                           0.92      4648
   macro avg       0.91      0.91      0.91      4648
weighted avg       0.92      0.92      0.92      4648


Cấu hình: {'C': 1, 'kernel': 'poly', 'degree': 2}
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      2099
           1       0.93      0.91      0.92      2549

    accuracy                        