## Movie review tagging model training - Data Science

### Data preparation

#### Preprocessed dataset reading

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

# Single place to repoint the notebook when the dataset lives elsewhere.
DATA_DIR = Path(r'D:\Project\EC - Socail media\ABSA\Me\dataset')
PROCESSED_DIR = DATA_DIR / 'DataProcessed'

# Preprocessed review comments for the train and test splits.
df_train = pd.read_csv(PROCESSED_DIR / 'train_processed.csv')
df_test = pd.read_csv(PROCESSED_DIR / 'test_processed.csv')

In [None]:
# Summarise the test frame: column names, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477 entries, 0 to 476
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   character      477 non-null    object
 1   content        477 non-null    object
 2   scene          477 non-null    object
 3   sound          477 non-null    object
 4   Cmt            477 non-null    object
 5   processed_cmt  477 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB


In [None]:
# Drop every training row that has a missing value in any column.
# NOTE(review): the label frame is filtered separately in a later cell by
# hard-coded index — confirm both frames stay row-aligned.
df_train = df_train.dropna(how='any')

In [None]:
# Sanity check: list any rows whose processed comment is still missing
# (an empty frame means the cleaning step left no gaps in this column).
missing_cmt_mask = df_train["processed_cmt"].isna()
df_train.loc[missing_cmt_mask]

Unnamed: 0,character,content,scene,sound,Cmt,processed_cmt


In [None]:
import pickle
# Load the aspect label frames for the train and test splits.
# Context managers close the file handles deterministically (the original
# open(...) calls were never closed).
# NOTE: pickle.load can execute arbitrary code embedded in the file — only
# load label files that were produced by a trusted source.
with open(r'D:\Project\EC - Socail media\ABSA\Me\dataset\Label\Label\label_aspect_train.pkl', 'rb') as label_file:
    y_train = pickle.load(label_file)
with open(r'D:\Project\EC - Socail media\ABSA\Me\dataset\Label\Label\label_aspect_test.pkl', 'rb') as label_file:
    y_test = pickle.load(label_file)

In [None]:
# Index labels of the label rows to discard.
# NOTE(review): presumably these are the rows removed from df_train by
# dropna above — TODO confirm against the raw data.
rows_to_discard = [154, 2113, 3699]
y_train = y_train.drop(index=rows_to_discard)

In [None]:
# Persist the filtered labels and the cleaned training frame; both paths are
# relative, so the files land in the current working directory.
y_train.to_pickle('label_aspect_train.pkl')
# utf-8-sig prepends a BOM to the CSV; index=False omits the row index.
df_train.to_csv('train_processed.csv', encoding = 'utf-8-sig', index = False)

#### Data vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, limited to 1000 terms; terms that occur
# in fewer than 2 documents or in more than 80% of documents are ignored.
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), min_df=2, max_df=0.8)

corpus = df_train['processed_cmt']
corpus_test = df_test['processed_cmt']

# Learn the vocabulary and IDF weights from the training corpus only.
X_train = tfidf.fit_transform(corpus).toarray()

# Project the *test* corpus with the already-fitted vectorizer. The previous
# code re-ran fit_transform on the training corpus, so X_test was a copy of
# the training features (3806 rows) and could not be scored against y_test.
X_test = tfidf.transform(corpus_test).toarray()


#### Data split

In [None]:
from sklearn.model_selection import train_test_split


# Feature and label matrices of each split should have the same row count.
# One print per statement: the former `print(a), print(b)` form built a
# tuple of None values that the notebook echoed as `(None, None)`.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(3806, 1000)
(3806, 4)
(3806, 1000)
(477, 4)


(None, None)

In [None]:
# Display the training label frame (one indicator column per aspect).
y_train

Unnamed: 0,is_character,is_content,is_scene,is_sound
0,0,1,0,1
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,1,1,0
...,...,...,...,...
3804,0,0,1,1
3805,0,0,0,0
3806,0,0,0,0
3807,0,0,0,1


### Model Training

In [None]:
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Running record of the best configuration across all methods evaluated
# below; each method's selection cell overwrites these when its mean
# micro-F1 is higher than the stored score.
best_model_method = None
best_model_params = {}
best_model_f1_score = 0

#### Using BinaryRelevance method

The `BinaryRelevance` method is an approach used in multi-label classification problems, where each instance can belong to multiple classes (labels) simultaneously. It transforms a multi-label problem into multiple independent binary classification problems, one for each label.

How It Works:
- For each label, `Binary Relevance` creates a separate binary classification problem.
- Each classifier predicts whether or not a particular label applies to a given instance, treating each label as independent from the others.
- The final prediction is a combination of all individual binary classifiers’ predictions.

Advantages:
- Simple to implement.
- Works well when the labels are independent of each other.

Limitations:
- Label independence assumption: It assumes that labels are independent, which might not be the case in real-world datasets (some labels might be correlated).
- Requires training multiple classifiers, which can increase computational cost.

In [None]:
from skmultilearn.problem_transform import BinaryRelevance

# Seed for the randomised estimator in the grid so that repeated runs
# select the same best configuration.
RANDOM_STATE = 42

# One sub-grid per candidate base classifier: GridSearchCV swaps the
# wrapper's `classifier` parameter and tunes that classifier's settings.
parameters = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.7, 1.0, 1.3],
    },
    {
        'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__n_estimators': [50, 100, 150],
    },
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [2, 3, 5, 8, 9],
        'classifier__weights': ['uniform', 'distance'],
    },
    {
        'classifier': [SVC()],
        'classifier__C': [0.01, 0.1, 1.0],
        'classifier__kernel': ['linear', 'rbf', 'sigmoid'],
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [0.01, 0.1, 1.0],
        'classifier__max_iter': [50, 100, 200],
    }
]

scores = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
# refit=False: no final estimator is refitted here; the best configuration
# is read from cv_results_ afterwards.
clf = GridSearchCV(BinaryRelevance(), parameters, scoring=scores, refit=False)
clf.fit(X_train, y_train)

# Report every metric for every parameter set (the header used to say
# "Accuracy", although accuracy is not one of the computed metrics).
cv_results = clf.cv_results_
print('\nMean CV scores for each parameter set:')
for set_number, param_set in enumerate(cv_results['params'], start=1):
    for scorer in scores:
        mean_score = cv_results[f'mean_test_{scorer}'][set_number - 1]
        print(f'Parameter set {set_number}: {param_set} - Mean {scorer}: {mean_score}')
    print('\n')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


Accuracy for each parameter set:
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean f1_micro: 0.657515746524119
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean f1_macro: 0.644455513292933
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean precision_micro: 0.7263386406644855
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean precision_macro: 0.7589880524086785
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean recall_micro: 0.6007357944567221
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean recall_macro: 0.5910534498420539


Parameter set 2: {'classifier': MultinomialNB(), 'classifier__alpha': 1.0} - Mean f1_micro: 0.6472962015115628
Parameter set 2: {'classifier': MultinomialNB(), 'classifier__alpha': 1.0} - Mean f1_macro: 0.6313713901308508
Parameter set 2: {'classifier': MultinomialNB(), 'classi

In [None]:
# Display the best parameter set and record it if it beats earlier methods.
# nanargmax ignores candidates whose fits failed (GridSearchCV scores those
# as NaN by default); a plain argmax would select such a NaN entry.
mean_f1_micro = clf.cv_results_['mean_test_f1_micro']
best_score_idx = np.nanargmax(mean_f1_micro)
# Strip the `classifier__` prefix so the values can be passed straight to
# the base classifier's set_params later on.
best_params = {k.replace('classifier__', ''): v for k, v in clf.cv_results_['params'][best_score_idx].items()}
print(f'Best parameters: {best_params}')

f1_micro_score = mean_f1_micro[best_score_idx]
print('Mean f1_micro score:', f1_micro_score)

# Keep this method as the overall winner only when it improves on the
# best mean micro-F1 seen so far.
if f1_micro_score > best_model_f1_score:
    best_model_method = 'BinaryRelevance'
    best_model_params = best_params
    best_model_f1_score = f1_micro_score

Best parameters: {'classifier': RandomForestClassifier(), 'criterion': 'gini', 'n_estimators': 150}
Mean f1_micro score: 0.7820677676243765


#### Using LabelPowerset method

The `Label Powerset` method is another popular approach used in multi-label classification. Unlike `Binary Relevance`, which treats each label as independent, `Label Powerset` considers the combinations of labels and treats each unique set of labels as a single label in a multi-class classification problem.

How It Works:
- The idea behind `Label Powerset` is to transform the multi-label classification problem into a single-label multi-class classification problem.
- Each unique combination of labels in the dataset is treated as a separate class.
- A single classifier is trained on these unique label combinations.

Advantages:
- Considers label dependencies: By treating label combinations as classes, it captures the relationships between labels.
- Can perform well in cases where certain label combinations are frequent and meaningful.

Limitations:
- Scalability: If there are many possible combinations of labels, the number of classes grows exponentially, which can make the problem very large and computationally expensive.
- Rare combinations: Some combinations of labels might occur very infrequently, which can make the model struggle to generalize to new instances with rare combinations.

In [None]:
from skmultilearn.problem_transform import LabelPowerset

# Seed for the randomised estimator in the grid so that repeated runs
# select the same best configuration.
RANDOM_STATE = 42

# One sub-grid per candidate base classifier: GridSearchCV swaps the
# wrapper's `classifier` parameter and tunes that classifier's settings.
parameters = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.7, 1.0, 1.3],
    },
    {
        'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__n_estimators': [50, 100, 150],
    },
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [2, 3, 5, 8, 9],
        'classifier__weights': ['uniform', 'distance'],
    },
    {
        'classifier': [SVC()],
        'classifier__C': [0.01, 0.1, 1.0],
        'classifier__kernel': ['linear', 'rbf', 'sigmoid'],
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [0.01, 0.1, 1.0],
        'classifier__max_iter': [50, 100, 200],
    }
]

scores = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
# refit=False: no final estimator is refitted here; the best configuration
# is read from cv_results_ afterwards.
clf = GridSearchCV(LabelPowerset(), parameters, scoring=scores, refit=False)
clf.fit(X_train, y_train)

# Report every metric for every parameter set (the header used to say
# "Accuracy", although accuracy is not one of the computed metrics).
cv_results = clf.cv_results_
print('\nMean CV scores for each parameter set:')
for set_number, param_set in enumerate(cv_results['params'], start=1):
    for scorer in scores:
        mean_score = cv_results[f'mean_test_{scorer}'][set_number - 1]
        print(f'Parameter set {set_number}: {param_set} - Mean {scorer}: {mean_score}')
    print('\n')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


Accuracy for each parameter set:
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean f1_micro: 0.5493733795573894
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean f1_macro: 0.5486651559389991
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean precision_micro: 0.826912837452104
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean precision_macro: 0.8256077698150579
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean recall_micro: 0.4115974238661323
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean recall_macro: 0.4184553061784258


Parameter set 2: {'classifier': MultinomialNB(), 'classifier__alpha': 1.0} - Mean f1_micro: 0.49959971588450813
Parameter set 2: {'classifier': MultinomialNB(), 'classifier__alpha': 1.0} - Mean f1_macro: 0.4964185794040675
Parameter set 2: {'classifier': MultinomialNB(), 'clas

In [None]:
# Display the best parameter set and record it if it beats earlier methods.
# nanargmax ignores candidates whose fits failed (GridSearchCV scores those
# as NaN by default); a plain argmax would select such a NaN entry.
mean_f1_micro = clf.cv_results_['mean_test_f1_micro']
best_score_idx = np.nanargmax(mean_f1_micro)
# Strip the `classifier__` prefix so the values can be passed straight to
# the base classifier's set_params later on.
best_params = {k.replace('classifier__', ''): v for k, v in clf.cv_results_['params'][best_score_idx].items()}
print(f'Best parameters: {best_params}')

f1_micro_score = mean_f1_micro[best_score_idx]
print('Mean f1_micro score:', f1_micro_score)

# Keep this method as the overall winner only when it improves on the
# best mean micro-F1 seen so far.
if f1_micro_score > best_model_f1_score:
    best_model_method = 'LabelPowerset'
    best_model_params = best_params
    best_model_f1_score = f1_micro_score

Best parameters: {'classifier': SVC(), 'C': 1.0, 'kernel': 'linear'}
Mean f1_micro score: 0.6943511150183488


#### Using Classifier Chain method

The `Classifier Chain` is another advanced method used in multi-label classification. It combines the ideas of `Binary Relevance` and `Label Powerset`, capturing both the individual label characteristics and the dependencies between labels.

How It Works:
- `Classifier Chain` transforms the multi-label classification problem into a sequence of binary classification problems.
- Each classifier in the chain predicts whether a specific label applies to an instance, using not only the input features but also the predictions of the previous classifiers in the chain.
- This allows the model to capture dependencies between labels because the prediction for a label can depend on the predictions for other labels made earlier in the chain.

Advantages:
- Captures label dependencies: Unlike Binary Relevance, which treats labels independently, Classifier Chain captures the relationships between labels by conditioning on previously predicted labels.
- Flexible and efficient: It provides a balance between simple methods like Binary Relevance and complex methods like Label Powerset.

Limitations:
- Order of labels matters: The performance of the model can be sensitive to the order in which the labels are processed in the chain. Sometimes, a suboptimal chain order may degrade the performance.
- Computational cost: Since each label prediction depends on previous ones, the process can be slower than Binary Relevance but generally faster than Label Powerset.

In [None]:
from skmultilearn.problem_transform import ClassifierChain

# Seed for the randomised estimator in the grid so that repeated runs
# select the same best configuration.
RANDOM_STATE = 42

# One sub-grid per candidate base classifier: GridSearchCV swaps the
# wrapper's `classifier` parameter and tunes that classifier's settings.
parameters = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.7, 1.0, 1.3],
    },
    {
        'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__n_estimators': [50, 100, 150],
    },
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [2, 3, 5, 8, 9],
        'classifier__weights': ['uniform', 'distance'],
    },
    {
        'classifier': [SVC()],
        'classifier__C': [0.01, 0.1, 1.0],
        'classifier__kernel': ['linear', 'rbf', 'sigmoid'],
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [0.01, 0.1, 1.0],
        'classifier__max_iter': [50, 100, 200],
    }
]

scores = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
# refit=False: no final estimator is refitted here; the best configuration
# is read from cv_results_ afterwards.
clf = GridSearchCV(ClassifierChain(), parameters, scoring=scores, refit=False)
clf.fit(X_train, y_train)

# Report every metric for every parameter set (the header used to say
# "Accuracy", although accuracy is not one of the computed metrics).
cv_results = clf.cv_results_
print('\nMean CV scores for each parameter set:')
for set_number, param_set in enumerate(cv_results['params'], start=1):
    for scorer in scores:
        mean_score = cv_results[f'mean_test_{scorer}'][set_number - 1]
        print(f'Parameter set {set_number}: {param_set} - Mean {scorer}: {mean_score}')
    print('\n')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


Accuracy for each parameter set:
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean f1_micro: 0.659915041865978
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean f1_macro: 0.6506164236040645
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean precision_micro: 0.7167178200891732
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean precision_macro: 0.7395503262273451
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean recall_micro: 0.6116558674242072
Parameter set 1: {'classifier': MultinomialNB(), 'classifier__alpha': 0.7} - Mean recall_macro: 0.6033707144755384


Parameter set 2: {'classifier': MultinomialNB(), 'classifier__alpha': 1.0} - Mean f1_micro: 0.6548772575312118
Parameter set 2: {'classifier': MultinomialNB(), 'classifier__alpha': 1.0} - Mean f1_macro: 0.6444302123477202
Parameter set 2: {'classifier': MultinomialNB(), 'class

In [None]:
# Display the best parameter set and record it if it beats earlier methods.
# nanargmax ignores candidates whose fits failed (GridSearchCV scores those
# as NaN by default); a plain argmax would select such a NaN entry.
mean_f1_micro = clf.cv_results_['mean_test_f1_micro']
best_score_idx = np.nanargmax(mean_f1_micro)
# Strip the `classifier__` prefix so the values can be passed straight to
# the base classifier's set_params later on.
best_params = {k.replace('classifier__', ''): v for k, v in clf.cv_results_['params'][best_score_idx].items()}
print(f'Best parameters: {best_params}')

f1_micro_score = mean_f1_micro[best_score_idx]
print('Mean f1_micro score:', f1_micro_score)

# Keep this method as the overall winner only when it improves on the
# best mean micro-F1 seen so far.
if f1_micro_score > best_model_f1_score:
    best_model_method = 'ClassifierChain'
    best_model_params = best_params
    best_model_f1_score = f1_micro_score

Best parameters: {'classifier': RandomForestClassifier(), 'criterion': 'gini', 'n_estimators': 150}
Mean f1_micro score: 0.781287681264449


#### Getting the best model and performing some predictions

In [None]:
# Fail with the intended message when no method was selected. Previously the
# 'classifier' lookup below raised KeyError on the empty parameter dict
# before the final `else` branch could ever be reached.
if best_model_method is None:
    raise RuntimeError('Unable to find the best model')

# Fail before the expensive fit when test features and labels are
# misaligned, instead of only inside the metric call after training.
if X_test.shape[0] != len(y_test):
    raise ValueError(
        f'X_test has {X_test.shape[0]} rows but y_test has {len(y_test)}; '
        'rebuild X_test from the test corpus before evaluating'
    )

# Separate the base classifier instance from its tuned hyper-parameters.
params = {key: value for key, value in best_model_params.items() if key != 'classifier'}
BaseClassifier = best_model_params['classifier']
BaseClassifier.set_params(**params)

# Wrap the tuned base classifier in the winning problem-transformation method.
if best_model_method == 'BinaryRelevance':
    best_model = BinaryRelevance(BaseClassifier)
elif best_model_method == 'LabelPowerset':
    best_model = LabelPowerset(BaseClassifier)
elif best_model_method == 'ClassifierChain':
    best_model = ClassifierChain(BaseClassifier)
else:
    # An unknown method name is a logic error, not a timeout.
    raise RuntimeError('Unable to find the best model')

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

accuracy_value = accuracy_score(y_test, y_pred)
hamming_loss_value = hamming_loss(y_test, y_pred)

print(best_model)
# best_model_f1_score is the mean cross-validated micro-F1 from the grid
# search, not a score computed on the test set.
print('Micro F1 score:', best_model_f1_score)
print('Accuracy score:', accuracy_value)
print('Hamming loss:', hamming_loss_value)

ValueError: Found input variables with inconsistent numbers of samples: [477, 3806]

In [None]:
from sklearn.metrics import classification_report, accuracy_score, hamming_loss

# Fail with the intended message when no method was selected. Previously the
# 'classifier' lookup below raised KeyError on the empty parameter dict
# before the final `else` branch could ever be reached.
if best_model_method is None:
    raise RuntimeError('Unable to find the best model')

# Fail before the expensive fit when test features and labels are
# misaligned, instead of only inside the metric call after training.
if X_test.shape[0] != len(y_test):
    raise ValueError(
        f'X_test has {X_test.shape[0]} rows but y_test has {len(y_test)}; '
        'rebuild X_test from the test corpus before evaluating'
    )

# Select the best model from the recorded parameters: separate the base
# classifier instance from its tuned hyper-parameters.
params = {key: value for key, value in best_model_params.items() if key != 'classifier'}
BaseClassifier = best_model_params['classifier']
BaseClassifier.set_params(**params)

# Wrap the tuned base classifier in the winning problem-transformation method.
if best_model_method == 'BinaryRelevance':
    best_model = BinaryRelevance(BaseClassifier)
elif best_model_method == 'LabelPowerset':
    best_model = LabelPowerset(BaseClassifier)
elif best_model_method == 'ClassifierChain':
    best_model = ClassifierChain(BaseClassifier)
else:
    # An unknown method name is a logic error, not a timeout.
    raise RuntimeError('Unable to find the best model')

# Train the model
best_model.fit(X_train, y_train)

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the results
accuracy_value = accuracy_score(y_test, y_pred)
hamming_loss_value = hamming_loss(y_test, y_pred)

# Build the full per-label report
classification_report_str = classification_report(y_test, y_pred, zero_division=0)  # zero_division avoids division-by-zero warnings

# Print the results
print("Best Model:", best_model)
# best_model_f1_score is the mean cross-validated micro-F1 from the grid
# search, not a score computed on the test set.
print("Micro F1 score:", best_model_f1_score)
print("Accuracy score:", accuracy_value)
print("Hamming loss:", hamming_loss_value)
print("\nClassification Report:")
print(classification_report_str)


### Model export using Joblib

In [None]:
import joblib

# Serialise the fitted multi-label model with joblib. `best_model.save(...)`
# is a Keras-style call and is replaced with the joblib export this section
# describes (joblib was imported but never used).
# joblib.dump returns the list of files it wrote.
# NOTE(review): the fitted `tfidf` vectorizer is not saved here; it is needed
# to featurise new text at inference time — consider exporting it as well.
joblib.dump(best_model, 'Review_Tagger_Model.pkl')

['Review_Tagger_Model.pkl']
