<a href="https://colab.research.google.com/github/toan01-uet/sentiment/blob/main/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


In [None]:
# Path of the training CSV (a Colab-mounted Google Drive location, judging by the prefix).
DATA_PATH = "/content/drive/MyDrive/DataScience/clean_train_data.csv"

# Only the review text and its label are read; any other columns in the file are skipped.
data = pd.read_csv(DATA_PATH, usecols=["comment", "label"])
data.head()

Unnamed: 0,comment,label
0,dung dc sp tot cam on shop đóng_gói sản_phẩm đ...,0
1,chất_lượng sản_phẩm tuyệt_vời son mịn đánh màu...,0
2,chất_lượng sản_phẩm tuyệt_vời k hộp k dây giày...,0
3,hơi thất_vọng chút kỳ_vọng sách hi_vọng học_tậ...,1
4,mua áo_gió màu hồng ok đợt giao áo_gió chất vả...,1


In [None]:
# Show the first comment in full (the head() preview above truncates long text)
data.loc[0, "comment"]

'dung dc sp tot cam on shop đóng_gói sản_phẩm đẹp chất_lượng sản_phẩm tuyệt_vời'

In [None]:
# Column dtypes and non-null counts, to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16087 entries, 0 to 16086
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  16049 non-null  object
 1   label    16087 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 251.5+ KB


In [None]:
# Discard every row that has a missing value in any column, then re-check the counts
data = data.dropna(axis=0, how="any")
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16049 entries, 0 to 16086
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  16049 non-null  object
 1   label    16049 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 376.1+ KB


In [None]:
# Class balance: number of rows per label value
data["label"].value_counts()

0    9253
1    6796
Name: label, dtype: int64

In [None]:
# Character length of every comment, computed once and reused for all three statistics
result = [len(text) for text in data['comment'].values]
print("Maximum review length: {}".format(max(result)))
print("Minimum review length: {}".format(min(result)))
print("Mean review length: {}".format(np.mean(result)))

Maximum review length: 1392
Minimum review length: 1
Mean review length: 53.209919621160196


In [None]:
# TF-IDF vectorizer built with the library defaults; it is not fitted in this cell.
# Disabled alternative settings:
#   analyzer="word", max_df=0.3, min_df=10, ngram_range=(1, 2), norm="l2"
# Disabled fit on the full data set:
#   vectorizer.fit(data["comment"])
vectorizer = TfidfVectorizer()

In [None]:
# # Vector representation of vocabulary
# word_vector = pd.Series(vectorizer.vocabulary_).sample(5, random_state=1)
# print(f"Unique word (ngram) vector extract:\n\n {word_vector}")

In [None]:
# Hold out 25% of the rows as a test set (shuffled, fixed seed)
train, test = train_test_split(data, test_size=0.25, shuffle=True, random_state=1)

# Separate the text input from the label for each partition
X_train, Y_train = train["comment"], train["label"]
X_test, Y_test = test["comment"], test["label"]
for split in (X_train, X_test):
    print(split.shape)

(12036,)
(4013,)


In [None]:
# Fit the vectorizer on the training comments only, then reuse the fitted
# vectorizer to encode the test comments with the same vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
X_train_vec.shape

(12036, 9383)

In [None]:
# Compare one original comment with its numeric TF-IDF vector representation
print(f"Original sentence:\n{X_train[3:4].values}\n")

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Feature matrix: the single row selected by the 3:4 slice, one column per vocabulary term
features = pd.DataFrame(X_train_vec[3:4].toarray(), columns=feature_names)
# Keep only the columns with a non-zero weight, i.e. terms present in this sentence
nonempty_feat = features.loc[:, (features != 0).any(axis=0)]
print(f"Vector representation of sentence:\n {nonempty_feat}")

Original sentence:
['sản_phẩm đóng_gói chất_lượng kém tiền hài_lòng sản_phẩm tiki tã mỏng ko hàng hãng niềm mua sản_phẩm']

Vector representation of sentence:
    chất_lượng  hài_lòng      hàng  ...      tiền       tã  đóng_gói
0    0.130897  0.250957  0.120297  ...  0.174725  0.42117  0.154599

[1 rows x 14 columns]


In [None]:
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import BernoulliNB, GaussianNB

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score

# Baseline classifiers to compare on the TF-IDF features.
# The tree, forest and linear SVM are seeded (random_state=1, the seed used for
# the train/test split) so that a re-run reproduces the same score table.
# The remaining estimators are left at their defaults.
lr = LogisticRegression()
dtc = DecisionTreeClassifier(random_state=1)
rfc = RandomForestClassifier(random_state=1)
svm = LinearSVC(random_state=1)
gau_nb = GaussianNB()
ber_nb = BernoulliNB()
pla = Perceptron()

# `models_name` holds the display names and must stay index-aligned with `models`.
models = [lr, dtc, svm, rfc, gau_nb, ber_nb, pla]
models_name = ["Logistic Regression", "Decision Tree", "SVM", "Random Forest", "GaussianNB", 
               "BernoulliNB", "Perceptron"]


In [None]:
def evaluate_model(estimator, X, y):
    """Score a fitted estimator on one data set.

    Args:
        estimator: Object exposing ``predict(X)``; expected to be fitted already.
        X: Feature matrix passed to ``estimator.predict``.
        y: True labels that the predictions are compared against.

    Returns:
        dict with the keys ``name`` (class name of the estimator), ``recall``,
        ``precision``, ``f1_score`` and ``confusion_matrix``, each metric being
        the result of the same-named scoring function called as ``fn(y, prediction)``.
    """
    y_pred = estimator.predict(X)

    # Insertion order fixes the key order of the returned dict
    metric_fns = {
        'recall': recall_score,
        'precision': precision_score,
        'f1_score': f1_score,
        'confusion_matrix': confusion_matrix,
    }

    summary = {'name': type(estimator).__name__}
    for key, metric in metric_fns.items():
        summary[key] = metric(y, y_pred)
    return summary

In [None]:
# Densify the TF-IDF matrices once, outside the loop: the conversion does not
# depend on the model, and repeating it per iteration re-allocated two large
# arrays for every classifier. Dense input is kept for all models because
# GaussianNB does not accept sparse matrices.
# NOTE: both dense arrays now stay in memory for the whole loop.
X_train_dense = X_train_vec.toarray()
X_test_dense = X_test_vec.toarray()

scores = []      # one metrics dict per model, in training order
list_model = []  # the fitted estimators, in the same order
for name, model in zip(models_name, models):
    print("Model: {}".format(name))

    model.fit(X_train_dense, Y_train)
    scores.append(evaluate_model(model, X_test_dense, Y_test))
    list_model.append(model)
    print("=======================================")


Model: Logistic Regression
Model: Decision Tree
Model: SVM
Model: Random Forest
Model: GaussianNB
Model: BernoulliNB
Model: Perceptron


In [None]:
# One row per model, one column per key of the dicts returned by evaluate_model
df = pd.DataFrame(scores)
df

Unnamed: 0,name,recall,precision,f1_score,confusion_matrix
0,LogisticRegression,0.867323,0.839126,0.852991,"[[2000, 287], [229, 1497]]"
1,DecisionTreeClassifier,0.7781,0.78584,0.781951,"[[1921, 366], [383, 1343]]"
2,LinearSVC,0.852839,0.839225,0.845977,"[[2005, 282], [254, 1472]]"
3,RandomForestClassifier,0.85226,0.843947,0.848083,"[[2015, 272], [255, 1471]]"
4,GaussianNB,0.365006,0.73857,0.488561,"[[2064, 223], [1096, 630]]"
5,BernoulliNB,0.705678,0.857143,0.774071,"[[2084, 203], [508, 1218]]"
6,Perceptron,0.925261,0.732233,0.817507,"[[1703, 584], [129, 1597]]"


In [None]:
# # models to test
# import re
# classifiers = [
#     LogisticRegression(solver="sag", random_state=1),
#     LinearSVC(random_state=1),
#     RandomForestClassifier(random_state=1),
#     XGBClassifier(random_state=1)
# ]
# # get names of the objects in list (too lazy for c&p...)
# names = [re.match(r"[^\(]+", name.__str__())[0] for name in classifiers]
# print(f"Classifiers to test: {names}")

Classifiers to test: ['LogisticRegression', 'LinearSVC', 'RandomForestClassifier', 'XGBClassifier']


In [None]:
# %%time
# import sklearn
# # test all classifiers and save pred. results on test data
# results = {}
# for name, clf in zip(names, classifiers):
#     print(f"Training classifier: {name}")
#     clf.fit(X_train_vec, Y_train)
#     prediction = clf.predict(X_test_vec)
#     report = sklearn.metrics.classification_report(Y_test, prediction)
#     results[name] = report

Training classifier: LogisticRegression
Training classifier: LinearSVC
Training classifier: RandomForestClassifier
Training classifier: XGBClassifier
CPU times: user 12.1 s, sys: 148 ms, total: 12.2 s
Wall time: 12.9 s


In [None]:
# # Prediction results
# for k, v in results.items():
#     print(f"Results for {k}:")
#     print(f"{v}\n")

Results for LogisticRegression:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      2287
           1       0.84      0.87      0.85      1726

    accuracy                           0.87      4013
   macro avg       0.87      0.87      0.87      4013
weighted avg       0.87      0.87      0.87      4013


Results for LinearSVC:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      2287
           1       0.84      0.85      0.85      1726

    accuracy                           0.87      4013
   macro avg       0.86      0.86      0.86      4013
weighted avg       0.87      0.87      0.87      4013


Results for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      2287
           1       0.84      0.85      0.85      1726

    accuracy                           0.87      4013
   macro avg       0.86      0.87  

**Hyperparameter tuning: LogisticRegression**

In [None]:
%%time
import pickle
# feature creation and modelling in a single function
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("lr", LogisticRegression(solver="sag",random_state=1))])

# define parameter space to test # runtime 
params = {
    "tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "tfidf__max_df": np.arange(0.2, 0.9, 0.1),
    "tfidf__min_df": np.arange(10, 100, 10),
    
}
pipe_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro")
pipe_clf.fit(X_train, Y_train)
pickle.dump(pipe_clf, open("./clf_pipe.pck", "wb"))

CPU times: user 10.2 s, sys: 585 ms, total: 10.8 s
Wall time: 4min 48s


In [None]:
# Best TF-IDF parameter combination found by the first grid search
print(pipe_clf.best_params_)

{'tfidf__max_df': 0.30000000000000004, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}


In [None]:
%%time
# feature creation and modelling in a single function
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("lr",  LogisticRegression(solver="sag",random_state=1))])

# define parameter space to test # runtime 
params = {
    "tfidf__ngram_range": [(1, 2)],
    "tfidf__max_df": [0.30000000000000004],
    "tfidf__min_df": [10],
   "lr__C": [100, 10, 1.0, 0.1, 0.01],
}
pipe_lr_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro")
pipe_lr_clf.fit(X_train, Y_train)
pickle.dump(pipe_lr_clf, open("./pipe_lr_clf.pck", "wb"))

CPU times: user 630 ms, sys: 21 ms, total: 651 ms
Wall time: 8.54 s


In [None]:
# Best parameter combination found by the second grid search (TF-IDF settings plus C)
best_params = pipe_lr_clf.best_params_
print(best_params)


{'lr__C': 1.0, 'tfidf__max_df': 0.30000000000000004, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}


In [None]:
# run pipe with optimized parameters
pipe.set_params(**best_params).fit(X_train, Y_train)
pipe_pred = pipe.predict(X_test)
report = sklearn.metrics.classification_report(Y_test, pipe_pred)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2287
           1       0.85      0.87      0.86      1726

    accuracy                           0.88      4013
   macro avg       0.87      0.88      0.88      4013
weighted avg       0.88      0.88      0.88      4013



In [None]:
# Test-set metrics of the tuned pipeline. The metric functions are called via
# the names imported earlier; `sklearn.metrics.<fn>` failed with NameError
# because the `sklearn` package name is never imported.
print("f1_score:\n ",f1_score(Y_test, pipe_pred),"\n")
print("recall_score:\n ",recall_score(Y_test, pipe_pred) ,"\n")
print("precision_score:\n ",precision_score(Y_test, pipe_pred),"\n")

f1_score:
  0.8595088520845232 

recall_score:
  0.8719582850521437 

precision_score:
  0.8474099099099099 



In [None]:
# Summary table of the tuned pipeline's test metrics. The values are computed
# from the predictions (rounded to 4 decimals) rather than typed in by hand,
# so the table cannot drift from the actual results after a re-run.
d = {
    'f1_score': [round(f1_score(Y_test, pipe_pred), 4)],
    'recall': [round(recall_score(Y_test, pipe_pred), 4)],
    'precision': [round(precision_score(Y_test, pipe_pred), 4)],
}
score = pd.DataFrame(d)
score

Unnamed: 0,f1_score,recall,precision
0,0.8595,0.8719,0.8474
