# Train và đánh giá model

In [9]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

## Đọc dữ liệu

In [10]:
# Read the six pre-split Excel files in a fixed order (train, validation, test;
# features before labels) and bind each resulting DataFrame to its name.
(
    X_train, y_train,
    X_valid, y_valid,
    X_test, y_test,
) = (
    pd.read_excel(path)
    for path in (
        "data/X_train.xlsx",
        "data/y_train.xlsx",
        "data/X_valid.xlsx",
        "data/y_valid.xlsx",
        "data/X_test.xlsx",
        "data/y_test.xlsx",
    )
)

## Vectorizer, chạy model, đánh giá và lưu model

In [11]:
# One row per experiment:
#   (encoder label, vectorizer class, n-gram label, n-gram range,
#    pickle file stem, header template for the baseline report)
# The labels and headers are printed verbatim, so their spelling (including the
# lowercase first header) is kept exactly as in the recorded outputs.
RUNS = [
    ("CountVertorizer", CountVectorizer, "ngram_range_1_1", (1, 1),
     "model_logistic_countvtr_1_1", "phương pháp {0}, tham số {1}"),
    ("CountVertorizer", CountVectorizer, "ngram_range_1_2", (1, 2),
     "model_logistic_countvtr_1_2", "Phương pháp {0}, tham số {1}"),
    ("Tf-idfVectorizer", TfidfVectorizer, "ngram_range_1_1", (1, 1),
     "model_logistic_tfidf_1_1", "Phương pháp {0}, tham số {1}"),
    ("Tf-idfVectorizer", TfidfVectorizer, "ngram_range_1_2", (1, 2),
     "model_logistic_tfidf_1_2", "Phương pháp {0}, tham số {1}"),
]

# Hyperparameter grid (adjust here). It is split into two sub-grids because
# only 'liblinear' (of the three solvers tried) supports the L1 penalty;
# crossing 'l1' with 'newton-cg'/'lbfgs' makes those fits fail and score NaN,
# which was previously hidden by warnings.filterwarnings('ignore').
C_VALUES = np.logspace(-3, 3, 7)
PARAM_GRID = [
    {"penalty": ["l2"], "C": C_VALUES, "solver": ["newton-cg", "lbfgs", "liblinear"]},
    {"penalty": ["l1"], "C": C_VALUES, "solver": ["liblinear"]},
]

# scikit-learn estimators expect 1-D targets; the label files are read as
# DataFrames, so flatten them once instead of relying on implicit conversion.
y_train_1d = np.ravel(y_train)


def train_evaluate_save(label, vectorizer_cls, ngram_label, ngram, stem, baseline_header):
    """Fit, report and pickle a baseline and a grid-searched logistic model.

    Parameters
    ----------
    label : str
        Encoder name printed in the report headers.
    vectorizer_cls : type
        Vectorizer class to instantiate (CountVectorizer or TfidfVectorizer).
    ngram_label : str
        N-gram description printed in the report headers.
    ngram : tuple of int
        Value passed as ``ngram_range`` to the vectorizer.
    stem : str
        File name stem; the baseline is written to ``<stem>.pkl`` and the
        grid search to ``<stem>_grid.pkl`` in the working directory.
    baseline_header : str
        ``str.format`` template (fields: label, ngram_label) for the
        baseline report header.

    Returns
    -------
    tuple
        ``(baseline, search)``: the fitted LogisticRegression and the fitted
        GridSearchCV object.
    """
    # Preprocessing: learn the vocabulary on the training comments only, then
    # encode the validation comments with that same vocabulary.
    vectorizer = vectorizer_cls(ngram_range=ngram)
    X_train_vec = vectorizer.fit_transform(X_train["comment"])
    X_valid_vec = vectorizer.transform(X_valid["comment"])

    # Baseline model with default hyperparameters (adjust here).
    baseline = LogisticRegression()
    baseline.fit(X_train_vec, y_train_1d)

    # Print validation scores.
    print(baseline_header.format(label, ngram_label))
    print(classification_report(y_valid, baseline.predict(X_valid_vec), digits=4))

    # Save the baseline model.
    with open(stem + ".pkl", "wb") as f:
        pickle.dump(baseline, f)

    # Grid search; GridSearchCV clones the estimator, so the fitted baseline
    # above is not modified.
    search = GridSearchCV(baseline,                # model
                          param_grid=PARAM_GRID,   # hyperparameters
                          scoring="f1_macro",      # metric for scoring
                          cv=10)                   # number of folds
    search.fit(X_train_vec, y_train_1d)

    print("Phương pháp {0}, tham số {1}, GridSearchCV".format(label, ngram_label))
    print(classification_report(y_valid, search.predict(X_valid_vec), digits=4))

    # Save the grid-searched model.
    with open(stem + "_grid.pkl", "wb") as f:
        pickle.dump(search, f)

    return baseline, search


# Run every experiment in the same order as before: CountVectorizer (1,1),
# (1,2), then TfidfVectorizer (1,1), (1,2).
for run in RUNS:
    train_evaluate_save(*run)

phương pháp CountVertorizer, tham số ngram_range_1_1
              precision    recall  f1-score   support

           0     0.8514    0.9231    0.8858      2198
           1     0.7169    0.5473    0.6207       782

    accuracy                         0.8245      2980
   macro avg     0.7842    0.7352    0.7533      2980
weighted avg     0.8161    0.8245    0.8163      2980

Phương pháp CountVertorizer, tham số ngram_range_1_1, GridSearchCV
              precision    recall  f1-score   support

           0     0.8570    0.9186    0.8867      2198
           1     0.7131    0.5691    0.6330       782

    accuracy                         0.8268      2980
   macro avg     0.7851    0.7438    0.7598      2980
weighted avg     0.8192    0.8268    0.8201      2980

Phương pháp CountVertorizer, tham số ngram_range_1_2
              precision    recall  f1-score   support

           0     0.8740    0.9217    0.8973      2198
           1     0.7402    0.6266    0.6787       782

    accur

## Load model và đánh giá trên tập test

### Load model

In [12]:

def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by a trusted run of this notebook.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# CountVectorizer models: default-parameter and grid-searched, per n-gram range.
model_logistic_countvtr_1_1 = _load_pickle('model_logistic_countvtr_1_1.pkl')
model_logistic_countvtr_1_1_grid = _load_pickle('model_logistic_countvtr_1_1_grid.pkl')
model_logistic_countvtr_1_2 = _load_pickle('model_logistic_countvtr_1_2.pkl')
model_logistic_countvtr_1_2_grid = _load_pickle('model_logistic_countvtr_1_2_grid.pkl')

# TfidfVectorizer models, same layout.
model_logistic_tfidf_1_1 = _load_pickle('model_logistic_tfidf_1_1.pkl')
model_logistic_tfidf_1_1_grid = _load_pickle('model_logistic_tfidf_1_1_grid.pkl')
model_logistic_tfidf_1_2 = _load_pickle('model_logistic_tfidf_1_2.pkl')
model_logistic_tfidf_1_2_grid = _load_pickle('model_logistic_tfidf_1_2_grid.pkl')

### Model Logistic, CountVectorizer, ngram(1, 1)

In [20]:
# Rebuild the unigram count vocabulary from the training comments (the
# vectorizer is refit here rather than loaded) and encode the test comments.
c_vectorizer = CountVectorizer(ngram_range=(1, 1)).fit(X_train["comment"])
X_test_c = c_vectorizer.transform(X_test["comment"])

# Report test-set metrics for the loaded CountVectorizer (1, 1) model.
y_pred = model_logistic_countvtr_1_1.predict(X_test_c)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8616    0.9392    0.8987      2187
           1     0.7772    0.5844    0.6671       794

    accuracy                         0.8447      2981
   macro avg     0.8194    0.7618    0.7829      2981
weighted avg     0.8391    0.8447    0.8370      2981




### Model Logistic, CountVectorizer, ngram(1, 1), GridSearchCV

In [21]:
# Rebuild the unigram count vocabulary from the training comments (the
# vectorizer is refit here rather than loaded) and encode the test comments.
c_vectorizer = CountVectorizer(ngram_range=(1, 1)).fit(X_train["comment"])
X_test_c = c_vectorizer.transform(X_test["comment"])

# Report test-set metrics for the loaded grid-searched CountVectorizer (1, 1) model.
y_pred = model_logistic_countvtr_1_1_grid.predict(X_test_c)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8646    0.9282    0.8953      2187
           1     0.7520    0.5995    0.6671       794

    accuracy                         0.8407      2981
   macro avg     0.8083    0.7639    0.7812      2981
weighted avg     0.8346    0.8407    0.8345      2981



### Model Logistic, CountVectorizer, ngram(1, 2)

In [22]:
# Rebuild the unigram+bigram count vocabulary from the training comments (the
# vectorizer is refit here rather than loaded) and encode the test comments.
c_vectorizer = CountVectorizer(ngram_range=(1, 2)).fit(X_train["comment"])
X_test_c = c_vectorizer.transform(X_test["comment"])

# Report test-set metrics for the loaded CountVectorizer (1, 2) model.
y_pred = model_logistic_countvtr_1_2.predict(X_test_c)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8757    0.9346    0.9042      2187
           1     0.7790    0.6348    0.6995       794

    accuracy                         0.8547      2981
   macro avg     0.8274    0.7847    0.8019      2981
weighted avg     0.8500    0.8547    0.8497      2981



### Model Logistic, CountVectorizer, ngram(1, 2), GridSearchCV

In [23]:
# Rebuild the unigram+bigram count vocabulary from the training comments (the
# vectorizer is refit here rather than loaded) and encode the test comments.
c_vectorizer = CountVectorizer(ngram_range=(1, 2)).fit(X_train["comment"])
X_test_c = c_vectorizer.transform(X_test["comment"])

# Report test-set metrics for the loaded grid-searched CountVectorizer (1, 2) model.
y_pred = model_logistic_countvtr_1_2_grid.predict(X_test_c)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8758    0.9351    0.9045      2187
           1     0.7802    0.6348    0.7000       794

    accuracy                         0.8551      2981
   macro avg     0.8280    0.7849    0.8022      2981
weighted avg     0.8503    0.8551    0.8500      2981



### Model Logistic, Tf-idfVectorizer, ngram(1, 1)

In [24]:
# Rebuild the unigram TF-IDF vocabulary and weights from the training comments
# (the vectorizer is refit here rather than loaded) and encode the test comments.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 1)).fit(X_train["comment"])
X_test_tf = tf_vectorizer.transform(X_test["comment"])

# Report test-set metrics for the loaded TfidfVectorizer (1, 1) model.
y_pred = model_logistic_tfidf_1_1.predict(X_test_tf)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8489    0.9634    0.9025      2187
           1     0.8397    0.5277    0.6481       794

    accuracy                         0.8474      2981
   macro avg     0.8443    0.7456    0.7753      2981
weighted avg     0.8465    0.8474    0.8348      2981



### Model Logistic, Tf-idfVectorizer, ngram(1, 1), GridSearchCV

In [25]:
# Rebuild the unigram TF-IDF vocabulary and weights from the training comments
# (the vectorizer is refit here rather than loaded) and encode the test comments.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 1)).fit(X_train["comment"])
X_test_tf = tf_vectorizer.transform(X_test["comment"])

# Report test-set metrics for the loaded grid-searched TfidfVectorizer (1, 1) model.
y_pred = model_logistic_tfidf_1_1_grid.predict(X_test_tf)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8600    0.9383    0.8974      2187
           1     0.7731    0.5793    0.6623       794

    accuracy                         0.8427      2981
   macro avg     0.8166    0.7588    0.7799      2981
weighted avg     0.8369    0.8427    0.8348      2981



### Model Logistic, Tf-idfVectorizer, ngram(1, 2)

In [26]:
# Rebuild the unigram+bigram TF-IDF vocabulary and weights from the training
# comments (the vectorizer is refit here rather than loaded) and encode the
# test comments.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 2)).fit(X_train["comment"])
X_test_tf = tf_vectorizer.transform(X_test["comment"])

# Report test-set metrics for the loaded TfidfVectorizer (1, 2) model.
y_pred = model_logistic_tfidf_1_2.predict(X_test_tf)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8229    0.9840    0.8963      2187
           1     0.9044    0.4169    0.5707       794

    accuracy                         0.8329      2981
   macro avg     0.8637    0.7004    0.7335      2981
weighted avg     0.8446    0.8329    0.8096      2981



### Model Logistic, Tf-idfVectorizer, ngram(1, 2), GridSearchCV

In [27]:
# Rebuild the unigram+bigram TF-IDF vocabulary and weights from the training
# comments (the vectorizer is refit here rather than loaded) and encode the
# test comments.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 2)).fit(X_train["comment"])
X_test_tf = tf_vectorizer.transform(X_test["comment"])

# Report test-set metrics for the loaded grid-searched TfidfVectorizer (1, 2) model.
y_pred = model_logistic_tfidf_1_2_grid.predict(X_test_tf)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8459    0.9662    0.9020      2187
           1     0.8468    0.5151    0.6406       794

    accuracy                         0.8460      2981
   macro avg     0.8463    0.7406    0.7713      2981
weighted avg     0.8461    0.8460    0.8324      2981

