# 4. TRAINING MODEL

In [1]:
import pickle
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, recall_score, roc_auc_score, accuracy_score, average_precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

In [2]:
# Load the preprocessed Shopee comments; the preview below shows the columns
# `processed_comment` (tokenized text) and `class` (sentiment label).
df = pd.read_csv('./data_files/shopee_final.csv')

In [3]:
# Encode the sentiment label as an integer: 'neg' -> 0, every other label -> 1.
# An already-encoded 0 is accepted as well, which keeps the cell idempotent:
# the previous lambda mapped every row to 1 when the cell was run a second
# time, because the integer 0 is not equal to the string 'neg'.
df['class'] = np.where(df['class'].isin(['neg', 0]), 0, 1).astype('int64')
df.head()

Unnamed: 0,processed_comment,class
0,đặt hôm_qua nay hàng đồng_hồ siêu_đẹp chất_lượ...,1
1,sản_phẩm chính hãng phiếu bảo_hành tem chống g...,1
2,xuất_sắc shop confirm đóng_gói hàng nhanh kiểu...,1
3,săn sale giá hời_áp xu quá rẻ đầy_đủ tem chống...,1
4,đồng_hồ siêu_đẹp giao hàng siêu nhanh shop qua...,1


In [4]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(289033, 2)

In [5]:
# Class balance: number of comments per encoded label (0 = 'neg', 1 = any other label).
df['class'].value_counts()

class
1    266486
0     22547
Name: count, dtype: int64

# Train test split
Ta thấy rằng giữa hai nhóm positive và negative có chênh lệch lớn, nên tập train data của ta sẽ bằng:

2 * min(size(positive), size(negative)) * 0.8 (phần còn lại, 0.2, được dùng làm tập test)

In [6]:
# Number of rows in the smaller of the two classes.
class_counts = df['class'].value_counts()
half_min_size = int(class_counts.min())
half_min_size

22547

In [7]:
from sklearn.utils import shuffle
import random

In [8]:
# Split the comments by encoded label (1 = positive, 0 = negative).
df_pos = df[df['class'] == 1]
df_neg = df[df['class'] == 0]

# Shuffle the positive comments and renumber them 0..n-1.
# A fixed random_state makes the shuffle reproducible after a kernel restart;
# without it every run produced a different balanced data set.
df_pos = shuffle(df_pos, random_state=42).reset_index(drop=True)

In [9]:
# Draw, without replacement, as many row positions from the positive set as
# there are rows in the smaller class (half_min_size).
# A dedicated seeded generator is used so the sample is reproducible and the
# global `random` state is left untouched.
positive_index = random.Random(42).sample(range(df_pos.shape[0]), half_min_size)
positive_index[:10]

[43150, 248460, 7545, 117091, 139478, 10193, 226721, 235207, 256519, 225669]

In [10]:
# Keep only the sampled positive rows (positional selection, all columns).
df_pos2 = df_pos.iloc[positive_index]
df_pos2.head()

Unnamed: 0,processed_comment,class
43150,đẹp nhức nách êm_bền đôi mẹ tuyệt_vời mẹ màu,1
248460,tất rất đẹp giao hàng nhanh đóng_gói chắc_chắn...,1
7545,giá tiền quá ok shop giao hàng nhanh thân_thiện,1
117091,kẹp vừa chắc_chắn màu xinh_dễ dùng giá rẻ hơn ...,1
139478,khuyên bạn nên cửa_hàng giao hàng rất nhanh tấ...,1


In [11]:
# Combine the down-sampled positive rows with all negative rows into one
# data set, then shuffle so the two classes are interleaved.
# random_state is fixed so the row order (and everything derived from it,
# such as the saved CSV and the train/test split) is reproducible.
df1 = pd.concat([df_neg, df_pos2], axis=0)
df1 = shuffle(df1, random_state=42).reset_index(drop=True)
df1.shape

(45094, 2)

In [12]:
# Preview the first rows of the combined, shuffled data set.
df1.head()

Unnamed: 0,processed_comment,class
0,mua lan shop qua uy_tin san_pham tot shop,1
1,nhận hàng giống mô_tả cả nhà mua giờ mua nên k...,1
2,bây bây_đứt mang cf đúng một tý sáng ngày đầu_...,0
3,áo đẹp mẫu_mã ok_dáng rộng mặc ôm pải sửa,1
4,vải mỏng nhẹ lớp lót xột xạt không im lắm mua ...,0


In [13]:
# Write the combined data set to disk, without the index column.
# NOTE(review): despite the file name, df1 is written here before the
# train/test split, so the file holds every row of df1.
df1.to_csv("./data_files/train_comments.csv", index=False)

In [14]:
# Features are the preprocessed comment text; the target is the encoded label.
X = df1['processed_comment']
y = df1['class']

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of
# df1 identical in both splits; a plain random split lets the ratio drift.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

display(X_train, y_train)

114797                         đẹp to giao tới nhìn bẩn lắm
257915    da mềm trắng_túi nhỏ xinh vừa túi_tiền giao hà...
111243    ổn đúng ổn đẹp keo chắc rớt charm_hơi buồn giá...
244021    ví da chất_lượng gí rẻ ví xịn nên mua đóng_gói...
31424     đen đúng ok giày giá rẻ mang ôm chân rất thích...
                                ...                        
119879                                  giao đủ hài_lòng ok
259178                 đúng đen hàng giao tết chất_lượng ổn
131932    săn giá ok bộ_đồ khá ổn vải chắc không thấm hú...
146867    sản_phẩm hình chất_lượng mặc thoải_mái giao hà...
121958                                  ủa m thằng chủ shop
Name: processed_comment, Length: 231226, dtype: object

114797    1
257915    1
111243    0
244021    1
31424     1
         ..
119879    1
259178    1
131932    1
146867    1
121958    0
Name: class, Length: 231226, dtype: int64

# Bag of Words Vectorizer

In [16]:
# Bag of Words
# Always start from the raw training text. X_train is overwritten below with
# the vectorized matrix, so fitting on X_train directly only works the first
# time this cell runs. y_train still carries the row labels of the training
# split, which lets us look the text up again in X.
X_train_text = X.loc[y_train.index]

# Unigrams and bigrams that occur in at least 2% of the training comments.
bow = CountVectorizer(ngram_range=(1, 2), min_df=0.02)

vect = bow.fit(X_train_text)
X_train = bow.transform(X_train_text)

# Dense view of the count matrix, one column per vocabulary entry, for inspection.
df_bow = pd.DataFrame(data=X_train.toarray(), columns=bow.get_feature_names_out())
df_bow.head()

Unnamed: 0,biết,bạn,chuẩn,chân,chưa,chất,chất vải,chất_lượng,chắc_chắn,cả,...,đúng mô_tả,đặt,đẹp,đẹp lắm,đồng_hồ,ưng,ảnh,ổn,ủng_hộ,ủng_hộ shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,2,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [17]:
# Cross-validation scheme: shuffled, stratified folds with a fixed seed.
num_folds = 5
skf = StratifiedKFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=42,
)

In [None]:
# Model comparison: cross-validated accuracy and wall-clock time per classifier.
# `multi_class='auto'` was dropped from LogisticRegression: it is the default
# and the parameter is deprecated in recent scikit-learn releases.
models = [MultinomialNB(), LogisticRegression(), SVC()]
result_columns = ['Model', 'Accuracy Mean', 'Accuracy STD', 'Time']

# Collect one record per model and build the frame once. Growing a frame with
# pd.concat inside the loop is quadratic, warns when concatenating onto an
# empty frame, and leaves the numeric columns with object dtype.
records = []
for model in models:
    start_time = time.perf_counter()
    cv_results = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
    train_time = time.perf_counter() - start_time
    records.append([type(model).__name__, cv_results.mean(), cv_results.std(), train_time])

results = pd.DataFrame(records, columns=result_columns)

In [None]:
# Rank the models from highest to lowest mean accuracy and renumber the rows.
results = results.sort_values(by='Accuracy Mean', ascending=False)
results = results.reset_index(drop=True)
results

In [None]:
# Horizontal bar chart of mean accuracy per model, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Accuracy Mean', y='Model', data=results, palette='Set3', ax=ax)
ax.set_title('Model Comparison')

# TF-IDF Vectorizer

In [77]:
# Recover the raw training text through the label index. By this point X_train
# has been replaced with a vectorized matrix by an earlier cell (and is replaced
# again below), and TfidfVectorizer must be fitted on text, not on a matrix.
X_train_text = X.loc[y_train.index]

# Unigrams and bigrams that occur in at least 2% of the training comments.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=0.02)

vect = tfidf.fit(X_train_text)
X_train = tfidf.transform(X_train_text)

# Dense view of the TF-IDF matrix, one column per vocabulary entry, for inspection.
df_tfidf = pd.DataFrame(data=X_train.toarray(), columns=tfidf.get_feature_names_out())
df_tfidf.head()

Unnamed: 0,biết,bạn,chuẩn,chân,chưa,chất,chất vải,chất_lượng,chắc_chắn,cả,...,đúng mô_tả,đặt,đẹp,đẹp lắm,đồng_hồ,ưng,ảnh,ổn,ủng_hộ,ủng_hộ shop
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.263765,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.194897,0.0,0.0,0.0,0.0,0.755668,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.228782,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.261185,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.255817,0.0,0.0,0.0


In [None]:
# Cross-validation scheme: shuffled, stratified folds with a fixed seed.
num_folds = 5
skf = StratifiedKFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=42,
)

In [None]:
# Model comparison: cross-validated accuracy and wall-clock time per classifier.
# `multi_class='auto'` was dropped from LogisticRegression: it is the default
# and the parameter is deprecated in recent scikit-learn releases.
models = [MultinomialNB(), LogisticRegression(), SVC()]
result_columns = ['Model', 'Accuracy Mean', 'Accuracy STD', 'Time']

# Collect one record per model and build the frame once. Growing a frame with
# pd.concat inside the loop is quadratic, warns when concatenating onto an
# empty frame, and leaves the numeric columns with object dtype.
records = []
for model in models:
    start_time = time.perf_counter()
    cv_results = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
    train_time = time.perf_counter() - start_time
    records.append([type(model).__name__, cv_results.mean(), cv_results.std(), train_time])

results = pd.DataFrame(records, columns=result_columns)

In [None]:
# Rank the models from highest to lowest mean accuracy and renumber the rows.
results = results.sort_values(by='Accuracy Mean', ascending=False)
results = results.reset_index(drop=True)
results

In [None]:
# Horizontal bar chart of mean accuracy per model, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Accuracy Mean', y='Model', data=results, palette='Set3', ax=ax)
ax.set_title('Model Comparison')

# Build Model

In [None]:
# Hyperparameter tuning for logistic regression.
model = LogisticRegression()
c_values = [100, 10, 1.0, 0.1, 0.01, 0.001]

# Only pair each solver with penalties it supports:
#   - newton-cg / lbfgs: 'l2' or no penalty (None; the string 'none' is no
#     longer accepted by current scikit-learn). C is ignored without a
#     penalty, so it is not varied in that sub-grid.
#   - liblinear: 'l1' or 'l2'.
# 'elasticnet' was removed from the search: none of these solvers supports it.
grid = [
    dict(solver=['newton-cg', 'lbfgs'], penalty=['l2'], C=c_values),
    dict(solver=['newton-cg', 'lbfgs'], penalty=[None]),
    dict(solver=['liblinear'], penalty=['l1', 'l2'], C=c_values),
]

# error_score is left at its default (NaN plus a warning) so that a failing
# fit is visible instead of being recorded as an accuracy of 0.
clf_grid = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='accuracy')
search_clf = clf_grid.fit(X_train, y_train)
best_clf = search_clf.best_estimator_

In [None]:
# Show the hyperparameter combination selected by the grid search.
print('Best params:', search_clf.best_params_)

In [None]:
# Build the final model.
# Recover the raw text of both splits through the label indices. X_train and
# X_test are overwritten with TF-IDF matrices (here and, for X_train, in
# earlier cells), and the vectorizer must be fitted on text, not on a matrix.
# Looking the text up in X also makes this cell safe to re-run.
X_train_text = X.loc[y_train.index]
X_test_text = X.loc[y_test.index]

# Fit the vocabulary and IDF weights on the training text only, then apply the
# same transformation to the test text.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=0.02)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# NOTE(review): these hyperparameters are hard-coded; confirm they match
# search_clf.best_params_ from the grid search.
best_model = LogisticRegression(C=0.1, penalty='l2', solver='newton-cg')

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

In [None]:
# Evaluation metrics: accuracy in percent (3 decimals) on the training and test splits.
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

accuracy_train = round(train_score * 100, 3)
accuracy_test = round(test_score * 100, 3)

print('Accuracy score on the training data: ', accuracy_train)
print('Accuracy score on the test data: ', accuracy_test)

In [None]:
# Per-class precision / recall / F1 and the confusion matrix on the test split.
# Labels are passed explicitly so both outputs always list the two classes in
# the same order and show readable names instead of 0 / 1
# (0 = 'neg', 1 = every other label).
class_labels = [0, 1]
class_names = ['negative', 'positive']

print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=class_labels, display_labels=class_names);

In [None]:
# Persist the trained classifier (same file name and format as before).
pkl_filename = "sentiment_bestmodel.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_model, file)

# Also persist the fitted TF-IDF vectorizer. The classifier only accepts
# vectors produced by this exact vocabulary and IDF weighting, so it cannot
# score new text without it.
vectorizer_filename = "sentiment_tfidf.pkl"
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(tfidf, file)