In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [11]:
import pandas as pd

# Read the train and test CSV files (paths are relative to the working directory).
df_train = pd.read_csv(filepath_or_buffer='processed_train.csv')
df_test = pd.read_csv(filepath_or_buffer='processed_test.csv')

In [12]:
# Remove the "Unnamed: 0" column from both frames
# (presumably the index written out when the CSVs were saved — TODO confirm).
# errors="ignore" keeps this cell idempotent: because the frames are rebound to
# themselves, re-running the cell without it raises KeyError once the column
# has already been dropped.
df_train = df_train.drop(columns=["Unnamed: 0"], errors="ignore")

df_test = df_test.drop(columns=["Unnamed: 0"], errors="ignore")

In [13]:
# Lower-case the "label" column of both frames, train first and then test.
for split_frame in (df_train, df_test):
    split_frame["label"] = split_frame["label"].str.lower()

In [15]:
# Bag of Words (BoW) features.
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that same fitted vectorizer.
bow_vectorizer = CountVectorizer()
train_texts, test_texts = df_train['text'], df_test['text']
X_train_bow = bow_vectorizer.fit_transform(train_texts)
X_test_bow = bow_vectorizer.transform(test_texts)

In [17]:
# Train and evaluate the Naive Bayes model on the Bag-of-Words features.
# fit() returns the fitted estimator, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_bow, df_train['label'])
y_pred_nb_bow = nb_model.predict(X_test_bow)

# Compare the test-set predictions against the test labels.
nb_accuracy = accuracy_score(y_true=df_test['label'], y_pred=y_pred_nb_bow)
nb_report = classification_report(y_true=df_test["label"], y_pred=y_pred_nb_bow)

# Bare trailing expression so the notebook displays the accuracy.
nb_accuracy

0.898794989379231

In [18]:
# Print the classification report computed for the Naive Bayes test-set predictions.
print(nb_report)

                  precision    recall  f1-score   support

chinh tri xa hoi       0.83      0.86      0.85      7567
        doi song       0.69      0.77      0.73      2036
        khoa hoc       0.85      0.71      0.78      2096
      kinh doanh       0.90      0.88      0.89      5276
       phap luat       0.87      0.91      0.89      3788
        suc khoe       0.92      0.93      0.93      5417
        the gioi       0.93      0.91      0.92      6716
        the thao       0.99      0.95      0.97      6667
         van hoa       0.92      0.93      0.93      6250
         vi tinh       0.94      0.92      0.93      4560

        accuracy                           0.90     50373
       macro avg       0.88      0.88      0.88     50373
    weighted avg       0.90      0.90      0.90     50373



In [19]:
# Train and evaluate the Logistic Regression model on the Bag-of-Words features.
# fit() returns the fitted estimator, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_bow, df_train['label'])
y_pred_lr_bow = lr_model.predict(X_test_bow)

# Compare the test-set predictions against the test labels.
lr_accuracy = accuracy_score(y_true=df_test['label'], y_pred=y_pred_lr_bow)
lr_report = classification_report(y_true=df_test["label"], y_pred=y_pred_lr_bow)

# Bare trailing expression so the notebook displays the accuracy.
lr_accuracy

0.9126119151132551

In [20]:
# Print the classification report computed for the Logistic Regression test-set predictions.
print(lr_report)

                  precision    recall  f1-score   support

chinh tri xa hoi       0.85      0.89      0.87      7567
        doi song       0.77      0.63      0.69      2036
        khoa hoc       0.81      0.82      0.81      2096
      kinh doanh       0.92      0.89      0.91      5276
       phap luat       0.89      0.92      0.90      3788
        suc khoe       0.93      0.93      0.93      5417
        the gioi       0.95      0.93      0.94      6716
        the thao       0.98      0.98      0.98      6667
         van hoa       0.92      0.95      0.94      6250
         vi tinh       0.94      0.95      0.94      4560

        accuracy                           0.91     50373
       macro avg       0.90      0.89      0.89     50373
    weighted avg       0.91      0.91      0.91     50373



In [22]:
# Create, train and evaluate the Decision Tree model on the Bag-of-Words features.
# random_state is fixed at 42; fit() returns the fitted estimator, so
# construction and training are chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_bow, df_train['label'])
y_pred_dt_bow = dt_model.predict(X_test_bow)

# Compare the test-set predictions against the test labels.
dt_accuracy = accuracy_score(y_true=df_test['label'], y_pred=y_pred_dt_bow)
dt_report = classification_report(y_true=df_test["label"], y_pred=y_pred_dt_bow)

# Bare trailing expression so the notebook displays the accuracy.
dt_accuracy

0.729557501042225

In [23]:
# Print the classification report computed for the Decision Tree test-set predictions.
print(dt_report)

                  precision    recall  f1-score   support

chinh tri xa hoi       0.61      0.63      0.62      7567
        doi song       0.33      0.34      0.33      2036
        khoa hoc       0.49      0.51      0.50      2096
      kinh doanh       0.71      0.68      0.69      5276
       phap luat       0.71      0.79      0.75      3788
        suc khoe       0.76      0.79      0.78      5417
        the gioi       0.79      0.72      0.76      6716
        the thao       0.92      0.93      0.93      6667
         van hoa       0.82      0.78      0.80      6250
         vi tinh       0.75      0.75      0.75      4560

        accuracy                           0.73     50373
       macro avg       0.69      0.69      0.69     50373
    weighted avg       0.73      0.73      0.73     50373

