In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from collections import Counter
from underthesea import word_tokenize
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('Data_pre.csv', index_col=[0])
data.head()

Unnamed: 0,comment,comment_pre,target
0,chưa ktra nên chưa biết đc đứa nào cũng đc như...,kiểm,2
1,Lời đầu tiên cho phép được xin vì sự sơ suất c...,cho_phép sơ_suất hát rõ_ràng,2
2,1m6 50kg size M khá vừa vặn nhưng hok có cơ nê...,vừa_vặn đỏ đỏ cổ đẹp nhức đẹp ổn,2
3,"Lần đầu mua shop, vải cũg khá ok. hợp vs mùa h...",đầu_shop hợp,2
4,"áo đẹp form đẹp, mọi thứ đều ok, giao hàng nha...",đẹp form đẹp hàng thích,2


In [3]:
# Drop NaN values
data = data.dropna().reset_index(drop=True)

In [4]:
data.shape

(445354, 3)

### TF-IDF

In [5]:
tf_idf = TfidfVectorizer(ngram_range=(1, 4))
tf_idf.fit(data['comment_pre'])

### Creating input X and output y

In [6]:
X = tf_idf.transform(data['comment_pre'])
y = data['target']

### Train test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Build model with Logistic Regression

In [8]:
start = datetime.now()

In [9]:
model = LogisticRegression().fit(X_train, y_train)

In [10]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:00:57.151198


In [11]:
y_hat = model.predict(X_test)

In [12]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.73      0.59      0.65      7641
           1       0.77      0.27      0.40      5802
           2       0.92      0.98      0.95     75628

    accuracy                           0.90     89071
   macro avg       0.81      0.62      0.67     89071
weighted avg       0.89      0.90      0.89     89071



In [13]:
model.score(X_train, y_train)

0.9166280737503614

In [14]:
model.score(X_test, y_test)

0.9036386702742756

In [15]:
confusion_matrix(y_test, y_hat)

array([[ 4533,   128,  2980],
       [  819,  1569,  3414],
       [  900,   342, 74386]], dtype=int64)

- Logistic Regression with original data has 90% accuracy
- Recall and f1 score are low

### Build model with Naive Bayes

In [16]:
start = datetime.now()

In [17]:
grid = {
    "alpha": list(range(1, 11))
}

In [18]:
nb_cv = GridSearchCV(MultinomialNB(), grid, cv=10).fit(X_train, y_train)

In [19]:
print("Best parameters: ", nb_cv.best_params_)
print("Accuracy:", nb_cv.best_score_)

Best parameters:  {'alpha': 1}
Accuracy: 0.8773362662160288


In [20]:
model = MultinomialNB(alpha=1.0).fit(X_train, y_train)

In [21]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:00:20.043353


In [22]:
model.score(X_train, y_train)

0.8839742564197562

In [23]:
model.score(X_test, y_test)

0.8769970023913507

In [24]:
y_hat = model.predict(X_test)

In [25]:
confusion_matrix(y_test, y_hat)

array([[ 2378,     2,  5261],
       [   66,   196,  5540],
       [   80,     7, 75541]], dtype=int64)

In [26]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.94      0.31      0.47      7641
           1       0.96      0.03      0.07      5802
           2       0.87      1.00      0.93     75628

    accuracy                           0.88     89071
   macro avg       0.92      0.45      0.49     89071
weighted avg       0.89      0.88      0.84     89071



- Naive Bayes with original data has 88% accuracy
- Recall and f1-score are low
- Precision is high

### Build model with Decision Tree

In [27]:
start = datetime.now()

In [28]:
model = DecisionTreeClassifier().fit(X_train, y_train)

In [29]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:31:32.811327


In [30]:
model.score(X_train, y_train)

0.9621171933547208

In [31]:
model.score(X_test, y_test)

0.8965207531070719

In [32]:
y_hat = model.predict(X_test)

In [33]:
confusion_matrix(y_test, y_hat)

array([[ 4949,   399,  2293],
       [  984,  1946,  2872],
       [ 1556,  1113, 72959]], dtype=int64)

In [34]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.66      0.65      0.65      7641
           1       0.56      0.34      0.42      5802
           2       0.93      0.96      0.95     75628

    accuracy                           0.90     89071
   macro avg       0.72      0.65      0.67     89071
weighted avg       0.89      0.90      0.89     89071



- Decision Tree with original data has 90% accuracy
- Precision, recall and f1-score are low

### Oversampling

In [35]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)

In [36]:
Counter(y_ros)

Counter({2: 378476, 1: 378476, 0: 378476})

### Train test split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_ros, y_ros, test_size=0.2)

### Build model with Logistic Regression using oversampled data

In [38]:
start = datetime.now()

In [39]:
model = LogisticRegression().fit(X_train, y_train)

In [40]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:01:15.378784


In [41]:
y_hat = model.predict(X_test)

In [42]:
model.score(X_train, y_train)

0.8573026459197086

In [43]:
model.score(X_test, y_test)

0.8472340875263116

In [44]:
confusion_matrix(y_test, y_hat)

array([[65661,  6638,  3480],
       [ 8133, 60482,  6987],
       [ 3224,  6229, 66252]], dtype=int64)

In [45]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86     75779
           1       0.82      0.80      0.81     75602
           2       0.86      0.88      0.87     75705

    accuracy                           0.85    227086
   macro avg       0.85      0.85      0.85    227086
weighted avg       0.85      0.85      0.85    227086



- Logistic regression with oversampled data has 85% accuracy
- Precision, recall and f1-score have improved compared to the original data

### Build model with Naive Bayes using oversampled data

In [46]:
start = datetime.now()

In [47]:
grid = {
    "alpha": list(range(1, 11))
}

In [48]:
nb_cv = GridSearchCV(MultinomialNB(), grid, cv=10).fit(X_train, y_train)

In [49]:
print("Best parameters: ", nb_cv.best_params_)
print("Accuracy:", nb_cv.best_score_)

Best parameters:  {'alpha': 1}
Accuracy: 0.8565639362198606


In [50]:
model = MultinomialNB(alpha=1.0).fit(X_train, y_train)

In [51]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:00:39.930013


In [52]:
model.score(X_train, y_train)

0.8744867021452273

In [53]:
model.score(X_test, y_test)

0.8580493733651569

In [54]:
y_hat = model.predict(X_test)

In [55]:
confusion_matrix(y_test, y_hat)

array([[67692,  5832,  2255],
       [ 7219, 63336,  5047],
       [ 3015,  8867, 63823]], dtype=int64)

In [56]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88     75779
           1       0.81      0.84      0.82     75602
           2       0.90      0.84      0.87     75705

    accuracy                           0.86    227086
   macro avg       0.86      0.86      0.86    227086
weighted avg       0.86      0.86      0.86    227086



- Naive Bayes with oversampled data has 86% accuracy
- Precision, recall and f1-score have improved compared to original data

### Build model with Decision Tree using oversampled data

In [57]:
start = datetime.now()

In [58]:
model = DecisionTreeClassifier().fit(X_train, y_train)

In [59]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:44:48.028861


In [60]:
model.score(X_train, y_train)

0.9073928101970403

In [61]:
model.score(X_test, y_test)

0.8863866552759747

In [62]:
y_hat = model.predict(X_test)

In [63]:
confusion_matrix(y_test, y_hat)

array([[69656,  4693,  1430],
       [ 6575, 65053,  3974],
       [ 3717,  5411, 66577]], dtype=int64)

In [64]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89     75779
           1       0.87      0.86      0.86     75602
           2       0.92      0.88      0.90     75705

    accuracy                           0.89    227086
   macro avg       0.89      0.89      0.89    227086
weighted avg       0.89      0.89      0.89    227086



- Decision Tree with oversampled data has 89% accuracy
- Precision, recall and f1-score have improved compared to original data
- It took almost 50 minutes to fit a model

### To conclude:
- Original data has high accuracy but precision, recall and f1 score are pretty low
- When using oversampled data, the accuracy is not as high but precision, recall and f1 score have improved a lot.
- So we should use oversampled data to build a model and predict, preferably choosing decision tree model with accuracy of 89% 