In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
df = pd.read_csv('./all/formattedCsv_duy 1-100_13102024.csv')
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))
print(df.info())    

| id   | Review                                                                       | Sentence Component                                                         | aspect_text                        | aspect              | sentiment_text                                          | sentiment   |
|:-----|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------|:-----------------------------------|:--------------------|:--------------------------------------------------------|:------------|
| 1    | slide giáo trình đầy đủ .                                                    | slide giáo trình đầy đủ                                                    | slide giáo trình                   | Course information  | đầy đủ                                                  | Positive    |
| 2    | nhiệt tình giảng dạy , gần gũi với sinh viên .                               | nhiệt tình giảng dạy , g

In [3]:
# Get all unique values from `aspect`
unique_aspect_values = df['aspect'].unique()

# Check the number of unique values in `aspect`
if len(unique_aspect_values) > 50:
  # If there are too many unique values, sample the top 50
  top_occurring_aspect_values = df['aspect'].value_counts().head(50).index.tolist()
  print(top_occurring_aspect_values)
else:
  # Otherwise print all unique valus in `aspect`
  print(unique_aspect_values)

# Get all unique values from `sentiment`
unique_sentiment_values = df['sentiment'].unique()

# Check the number of unique values in `sentiment`
if len(unique_sentiment_values) > 50:
  # If there are too many unique values, sample the top 50
  top_occurring_sentiment_values = df['sentiment'].value_counts().head(50).index.tolist()
  print(top_occurring_sentiment_values)
else:
  # Otherwise print all unique valus in `sentiment`
  print(unique_sentiment_values)

['Course information' 'Teaching quality' 'Test and evaluation'
 'Support from lecturers' 'General review' 'Organization and management'
 'Learning environment' 'Workload']
['Positive' 'Negative' 'Neutral']


In [4]:
# Group by `aspect` and calculate value counts of `sentiment`
sentiment_counts_by_aspect = df.groupby('aspect')['sentiment'].value_counts()

# Print the sentiment counts for each aspect
print("Sentiment Counts by Aspect:\n")
print(sentiment_counts_by_aspect.to_markdown(numalign="left", stralign="left"))

Sentiment Counts by Aspect:

|                                             | count   |
|:--------------------------------------------|:--------|
| ('Course information', 'Negative')          | 14      |
| ('Course information', 'Positive')          | 10      |
| ('Course information', 'Neutral')           | 5       |
| ('General review', 'Neutral')               | 5       |
| ('General review', 'Negative')              | 3       |
| ('General review', 'Positive')              | 2       |
| ('Learning environment', 'Negative')        | 6       |
| ('Learning environment', 'Positive')        | 4       |
| ('Organization and management', 'Negative') | 2       |
| ('Organization and management', 'Neutral')  | 1       |
| ('Support from lecturers', 'Positive')      | 15      |
| ('Support from lecturers', 'Negative')      | 6       |
| ('Teaching quality', 'Positive')            | 49      |
| ('Teaching quality', 'Negative')            | 18      |
| ('Teaching quality', 'Neutral')          

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Đọc dữ liệu
data = pd.read_csv("combined_cleaned_file.csv")

# Tiền xử lý văn bản (ví dụ đơn giản)
data["processed_text"] = data["Sentence Component"].str.lower().str.replace("[^\w\s]", "")

# Vector hóa bằng TF-IDF
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = tfidf.fit_transform(data["processed_text"])


from sklearn.preprocessing import LabelEncoder

# Mã hóa aspect
aspect_encoder = LabelEncoder()
y_aspect = aspect_encoder.fit_transform(data["aspect"])

# Mã hóa sentiment
sentiment_encoder = LabelEncoder()
y_sentiment = sentiment_encoder.fit_transform(data["sentiment"])

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Chia dữ liệu thành train/test
X_train, X_test, y_train_aspect, y_test_aspect, y_train_sentiment, y_test_sentiment = train_test_split(
    X,
    y_aspect,
    y_sentiment,
    test_size=0.2,
    random_state=42
)

# Kết hợp y_aspect và y_sentiment thành một mảng 2D
y_train = np.column_stack((y_train_aspect, y_train_sentiment))
y_test = np.column_stack((y_test_aspect, y_test_sentiment))

# print(X_train)
print(y_train)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Huấn luyện mô hình
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

# Dự đoán
y_pred = model.predict(X_test)
print(y_pred)

print("\nClassification Report for Sentiment:")
print(classification_report(y_test[:, 1], y_pred[:, 1], target_names=sentiment_encoder.classes_))

# In báo cáo phân loại cho từng nhãn
print("Classification Report for Aspect:")
print(classification_report(y_test[:, 0], y_pred[:, 0], target_names=aspect_encoder.classes_))




[[5 2]
 [4 2]
 [5 0]
 ...
 [0 0]
 [5 2]
 [2 1]]
X_train shape: (6222, 5000)
y_train shape: (6222, 2)
X_test shape: (1556, 5000)
y_test shape: (1556, 2)
[[5 0]
 [4 2]
 [4 2]
 ...
 [4 2]
 [5 2]
 [5 2]]

Classification Report for Sentiment:
              precision    recall  f1-score   support

    Negative       0.84      0.88      0.86       521
     Neutral       0.72      0.61      0.66       232
    Positive       0.91      0.92      0.91       803

    accuracy                           0.86      1556
   macro avg       0.82      0.80      0.81      1556
weighted avg       0.86      0.86      0.86      1556

Classification Report for Aspect:
                             precision    recall  f1-score   support

         Course information       0.66      0.61      0.63       135
             General review       0.53      0.43      0.47       110
       Learning environment       0.82      0.78      0.80        98
Organization and management       0.74      0.50      0.60       106
 

In [8]:
def predict_aspect_sentiment(text):
    # Tiền xử lý
    processed_text = text.lower().replace("[^\w\s]", "")
    # Vector hóa
    text_tfidf = tfidf.transform([processed_text])
    # Dự đoán
    pred = model.predict(text_tfidf)
    aspect = aspect_encoder.inverse_transform([pred[0][0]])[0]
    sentiment = sentiment_encoder.inverse_transform([pred[0][1]])[0]
    return aspect, sentiment

# Test
text = "giáo viên nhiệt tình , tâm huyết , khả năng truyền đạt tốt ."
aspect, sentiment = predict_aspect_sentiment(text)
print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: Teaching quality, Sentiment: Positive
