In [72]:
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [73]:
# Read the training data from file.
# The frame is bound to `data` because every later cell reads from `data`;
# binding it to any other name makes the notebook fail on a fresh kernel.
data = pd.read_csv('/content/text_classification_train_data.csv')

In [74]:
# Text preprocessing
def preprocess_text(text):
    """Lower-case ``text`` and drop every character that is neither
    alphanumeric nor whitespace.

    Parameters
    ----------
    text : str or None or float
        Raw sentence. ``None`` and float NaN (what pandas yields for empty
        CSV cells) are treated as an empty sentence; any other non-string
        value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned sentence. Whitespace is kept as-is, so removing
        punctuation may leave trailing or doubled spaces.
    """
    # Missing values: None, or NaN (the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Compose base letter + combining marks into single code points first.
    # Combining marks are not alphanumeric, so without this step decomposed
    # (NFD) Vietnamese text would silently lose its diacritics in the filter.
    text = unicodedata.normalize('NFC', text).lower()
    return ''.join(ch for ch in text if ch.isalnum() or ch.isspace())

# Clean every sentence element-wise and store the result back in the column
data['sentence'] = data['sentence'].map(preprocess_text)

In [75]:
# Split the data
X = data['sentence']
y_topic = data['topic']
y_sentiment = data['sentiment']

# Split the features and BOTH label columns in one call. train_test_split
# applies the same row partition to every array it receives, so the topic and
# sentiment targets stay aligned with X_train / X_test. This also defines
# y_train_sentiment / y_test_sentiment, which the training and evaluation
# cells need.
(
    X_train, X_test,
    y_train_topic, y_test_topic,
    y_train_sentiment, y_test_sentiment,
) = train_test_split(X, y_topic, y_sentiment, test_size=0.2, random_state=42)

In [76]:
# Turn the text into TF-IDF vectors.
# The vocabulary and IDF weights are learned from the training split only;
# both splits are then projected with the fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000).fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [77]:
# Train the topic classifiers (fit() returns the fitted estimator itself)
nb_topic = MultinomialNB().fit(X_train_tfidf, y_train_topic)
logreg_topic = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train_topic)
svm_topic = SVC(kernel='linear').fit(X_train_tfidf, y_train_topic)

# Train the sentiment classifiers (fit() returns the fitted estimator itself)
nb_sentiment = MultinomialNB().fit(X_train_tfidf, y_train_sentiment)
logreg_sentiment = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train_sentiment)
svm_sentiment = SVC(kernel='linear').fit(X_train_tfidf, y_train_sentiment)

# Fitted topic classifiers, keyed by the display name used in the reports
models_topic = dict([
    ('Naive Bayes (Topic)', nb_topic),
    ('Logistic Regression (Topic)', logreg_topic),
    ('SVM (Topic)', svm_topic),
])

# Fitted sentiment classifiers, keyed by the display name used in the reports
models_sentiment = dict([
    ('Naive Bayes (Sentiment)', nb_sentiment),
    ('Logistic Regression (Sentiment)', logreg_sentiment),
    ('SVM (Sentiment)', svm_sentiment),
])


In [85]:
accuracies_topic = {}
accuracies_sentiment = {}


# Evaluate each model (precision, recall, F1-score) and store its accuracy.
# zero_division=0 makes the fallback explicit for classes a model never
# predicts: the report shows 0.00 for them, exactly as before, but no longer
# emits an UndefinedMetricWarning for every such report.
for name, model in models_topic.items():
    y_pred_topic = model.predict(X_test_tfidf)
    print(f"Classification Report (Topic) - {name}")
    print(classification_report(y_test_topic, y_pred_topic, zero_division=0))
    accuracies_topic[name] = accuracy_score(y_test_topic, y_pred_topic)

for name, model in models_sentiment.items():
    y_pred_sentiment = model.predict(X_test_tfidf)
    print(f"Classification Report (Sentiment) - {name}")
    print(classification_report(y_test_sentiment, y_pred_sentiment, zero_division=0))
    accuracies_sentiment[name] = accuracy_score(y_test_sentiment, y_pred_sentiment)

Classification Report (Topic) - Naive Bayes (Topic)
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      1471
           1       0.68      0.39      0.50       379
           2       0.98      0.57      0.72       109
           3       0.00      0.00      0.00        98

    accuracy                           0.80      2057
   macro avg       0.62      0.48      0.53      2057
weighted avg       0.76      0.80      0.76      2057

Classification Report (Topic) - Logistic Regression (Topic)
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1471
           1       0.72      0.63      0.67       379
           2       0.97      0.79      0.87       109
           3       0.70      0.29      0.41        98

    accuracy                           0.86      2057
   macro avg       0.82      0.67      0.72      2057
weighted avg       0.85      0.86      0.85      2057



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report (Topic) - SVM (Topic)
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1471
           1       0.72      0.64      0.68       379
           2       0.95      0.86      0.90       109
           3       0.74      0.33      0.45        98

    accuracy                           0.86      2057
   macro avg       0.82      0.70      0.74      2057
weighted avg       0.85      0.86      0.85      2057

Classification Report (Sentiment) - Naive Bayes (Sentiment)
              precision    recall  f1-score   support

           0       0.81      0.91      0.86       974
           1       0.00      0.00      0.00        82
           2       0.89      0.85      0.87      1001

    accuracy                           0.85      2057
   macro avg       0.57      0.59      0.58      2057
weighted avg       0.81      0.85      0.83      2057

Classification Report (Sentiment) - Logistic Regression (Sentiment)
            

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report (Sentiment) - SVM (Sentiment)
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       974
           1       0.65      0.13      0.22        82
           2       0.92      0.93      0.92      1001

    accuracy                           0.90      2057
   macro avg       0.82      0.67      0.69      2057
weighted avg       0.89      0.90      0.89      2057



In [84]:
# Pick the topic model with the highest test accuracy
# (on a tie, the first model in insertion order wins)
best_model_topic = max(accuracies_topic.items(), key=lambda entry: entry[1])[0]
print(f"Best model for topic: {best_model_topic}")

# Pick the sentiment model with the highest test accuracy
best_model_sentiment = max(accuracies_sentiment.items(), key=lambda entry: entry[1])[0]
print(f"Best model for sentiment: {best_model_sentiment}")

Best model for topic: SVM (Topic)
Best model for sentiment: Logistic Regression (Sentiment)


In [82]:
def predict_sentence(sentence, vectorizer, best_model_topic, best_model_sentiment, models_topic, models_sentiment):
    """Predict the topic and sentiment labels of a single sentence.

    Parameters
    ----------
    sentence : str
        Raw sentence; it is passed through ``preprocess_text`` first.
    vectorizer
        Fitted vectorizer exposing ``transform``.
    best_model_topic, best_model_sentiment
        Keys selecting the model to use from the respective mapping.
    models_topic, models_sentiment
        Mappings from model name to a fitted model exposing ``predict``.

    Returns
    -------
    tuple
        ``(topic_label, sentiment_label)``, the first element of each
        model's prediction for the one-row input.

    Raises
    ------
    KeyError
        If either key is missing from its mapping.
    """
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([preprocess_text(sentence)])

    topic_clf = models_topic[best_model_topic]
    sentiment_clf = models_sentiment[best_model_sentiment]

    return topic_clf.predict(features)[0], sentiment_clf.predict(features)[0]

# Smoke test on one hand-written sentence
# NOTE(review): the leading 'c' in the sentence looks like a typo - confirm
input_sentence = 'ckhông có gì làm em không hài lòng cả .'
predicted_topic, predicted_sentiment = predict_sentence(
    input_sentence,
    vectorizer,
    best_model_topic,
    best_model_sentiment,
    models_topic,
    models_sentiment,
)

# Print the result
print(
    f"Sentence: {input_sentence}",
    f"Predicted Topic: {predicted_topic}",
    f"Predicted Sentiment: {predicted_sentiment}",
    sep="\n",
)


Sentence: ckhông có gì làm em không hài lòng cả .
Predicted Topic: 3
Predicted Sentiment: 1
