### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from lazypredict.Supervised import LazyClassifier

In [2]:
data = pd.read_csv("shopee_final.csv", encoding='utf-8')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934543 entries, 0 to 934542
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      934543 non-null  int64 
 1   processed_text  900874 non-null  object
 2   class           934543 non-null  object
dtypes: int64(1), object(2)
memory usage: 21.4+ MB


In [3]:
data['class'].value_counts()

positive    636052
negative    186399
neutral     112092
Name: class, dtype: int64

In [4]:
df_sub= data.sample(frac=0.02)

In [5]:
source = df_sub['processed_text']
type(source)

pandas.core.series.Series

In [6]:
source[:5]

868806      quần không_đúng cạp mặc không_đẹp hành
307729                hộp xịn vải dày hình xịn yêu
47537                                   vít ốc đít
598659    chất_lượng rất_tôn dáng mặc tiền tặng áo
595709                sản_phẩm đóng_gói chất_lượng
Name: processed_text, dtype: object

In [7]:
target = df_sub['class']
type(target)

pandas.core.series.Series

### Lazy Predict with 3 class

In [9]:
target = target.replace("negative", 0)

In [10]:
target = target.replace("neutral", 1)

In [11]:
target = target.replace("positive", 2)

In [12]:
target[:5]

868806    0
307729    2
47537     1
598659    2
595709    2
Name: class, dtype: int64

### tf_idf

In [14]:
# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=0.02)

In [15]:
# Fit the TfidfVectorizer object to the data
x = tfidf_vectorizer.fit(df_sub['processed_text'].values.astype('U'))

In [16]:
# Transform the data
X = tfidf_vectorizer.transform(df_sub['processed_text'].values.astype('U'))

In [17]:
### BUG: 
# This is probably because you are using an older scikit-learn version than the one this code was written for.
# get_feature_names_out is a method of the class sklearn.feature_extraction.text.TfidfVectorizer since scikit-learn 1.0. Previously, there was a similar method called get_feature_names.
# So you should update your scikit-learn package, or use the old method (not recommended).
# Create a new dataframe with the transformed data
df_new = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [18]:
X = X.toarray()

In [19]:
y = np.array(target)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20) 

In [21]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████| 29/29 [01:06<00:00,  2.29s/it]


In [22]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BernoulliNB,0.57,0.54,,0.59,0.04
LabelSpreading,0.71,0.54,,0.7,10.02
LabelPropagation,0.71,0.54,,0.7,4.88
RandomForestClassifier,0.73,0.53,,0.71,1.31
DecisionTreeClassifier,0.7,0.53,,0.7,0.11
NearestCentroid,0.47,0.53,,0.5,0.04
CategoricalNB,0.51,0.53,,0.54,0.06
LGBMClassifier,0.73,0.53,,0.71,0.62
ExtraTreesClassifier,0.72,0.53,,0.7,1.2
BaggingClassifier,0.72,0.53,,0.7,0.75


### lazy predict with 2 class

In [24]:
df_sub2 = df_sub

In [25]:
df_sub2['class'] = df_sub2['class'].apply(lambda x: 0 if x == 'negative' or x== 'neutral' else 1 )

In [26]:
df_sub2['class'].value_counts()

1    12731
0     5960
Name: class, dtype: int64

In [27]:
target2 = df_sub2['class']

In [28]:
# Fit the TfidfVectorizer object to the data
x_2 = tfidf_vectorizer.fit(df_sub2['processed_text'].values.astype('U'))

In [29]:
# Transform the data
X2 = tfidf_vectorizer.transform(df_sub2['processed_text'].values.astype('U'))

In [30]:
X2 = X2.toarray()

In [31]:
y2 = np.array(target2)

In [32]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, 
                                                    test_size=0.20) 

In [33]:
models2, predictions2 = clf.fit(X_train2, X_test2, y_train2, y_test2)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|███████████████████████████████████████████| 29/29 [00:55<00:00,  1.92s/it]


In [34]:
models2

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.78,0.75,0.75,0.78,12.14
ExtraTreesClassifier,0.77,0.75,0.75,0.77,1.2
RandomForestClassifier,0.77,0.75,0.75,0.77,1.39
XGBClassifier,0.77,0.75,0.75,0.77,0.77
LGBMClassifier,0.78,0.75,0.75,0.78,0.39
BaggingClassifier,0.77,0.75,0.75,0.77,0.63
DecisionTreeClassifier,0.76,0.74,0.74,0.76,0.12
LabelSpreading,0.76,0.74,0.74,0.76,9.23
LabelPropagation,0.76,0.74,0.74,0.76,5.09
AdaBoostClassifier,0.77,0.74,0.74,0.77,0.52


### Resampling

In [41]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

In [48]:
smt = SMOTE()
X_smt, y_smt = smt.fit_resample(X,y)

In [49]:
X_train_smt, X_test_smt, y_train_smt, y_test_smt = train_test_split(X_smt, y_smt, 
                                                    test_size=0.20) 

### SMOTE for 3 classes

In [52]:
models_smt, predictions_smt = clf.fit(X_train_smt, X_test_smt, y_train_smt, y_test_smt)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|███████████████████████████████████████████| 29/29 [06:54<00:00, 14.29s/it]


In [53]:
models_smt

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.69,0.69,,0.69,2.89
RandomForestClassifier,0.68,0.68,,0.68,3.26
LabelPropagation,0.68,0.68,,0.68,22.22
LabelSpreading,0.68,0.68,,0.68,60.29
BaggingClassifier,0.67,0.68,,0.68,1.51
XGBClassifier,0.66,0.66,,0.66,4.04
ExtraTreeClassifier,0.66,0.66,,0.66,0.09
DecisionTreeClassifier,0.66,0.66,,0.66,0.26
LGBMClassifier,0.66,0.66,,0.66,0.79
NuSVC,0.65,0.65,,0.65,153.41


In [54]:
X_smt2, y_smt2 = smt.fit_resample(X2,y2)

In [55]:
X_train_smt2, X_test_smt2, y_train_smt2, y_test_smt2 = train_test_split(X_smt2, y_smt2, 
                                                    test_size=0.20) 

In [56]:
models_smt2, predictions_smt2 = clf.fit(X_train_smt2, X_test_smt2, y_train_smt2, y_test_smt2)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|███████████████████████████████████████████| 29/29 [01:47<00:00,  3.70s/it]


### SMOTE for 2 CLasses

In [57]:
models_smt2

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.8,0.8,0.8,0.8,1.89
RandomForestClassifier,0.8,0.8,0.8,0.79,2.03
LGBMClassifier,0.79,0.79,0.79,0.79,0.35
XGBClassifier,0.79,0.79,0.79,0.79,0.98
BaggingClassifier,0.79,0.79,0.79,0.79,0.96
LabelPropagation,0.78,0.79,0.79,0.78,9.85
LabelSpreading,0.78,0.79,0.79,0.78,19.87
SVC,0.78,0.79,0.79,0.78,20.12
ExtraTreeClassifier,0.78,0.78,0.78,0.78,0.08
DecisionTreeClassifier,0.78,0.78,0.78,0.78,0.2


## Kết luận

- đối với mô hình sử dụng 3 biến, ta sẽ không sử dụng resampling do kết quả không đuọc cải thiện
- đối với mô hình 2 biến thì chúng ta sẽ resampling do kết quae được cải thiện rất tốt

#### Model preslection
- dựa trên tiêu chi theo thứ tự Accuracy và time, sau đó là ROC và F1

##### đối với 3 class
- RandomForestClassifier
- KNeighborsClassifier
- LogisticRegression

##### đối với 2 class
- ExtraTreesClassifier
- XGBClassifier
- LogisticRegression
- KNeighborsClassifier