In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  
from sklearn import metrics
import time
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import FunctionTransformer

In [3]:
%cd /content/drive/MyDrive/colab_notebook

/content/drive/MyDrive/colab_notebook


In [4]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


def convert_star(star): 
    if star == 'bad': 
        return 0
    elif star  == 'neutral': 
        return 1
    return 2

train_df['label'] = train_df['label'].apply(convert_star)
test_df['label'] = test_df['label'].apply(convert_star)

In [5]:
X_train = train_df['comment']
y_train = train_df['label']

X_test = test_df['comment']
y_test = test_df['label']

In [6]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)

In [7]:
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)

In [8]:
X_test = vectorizer.transform(X_test)
X_test = tfidf_transformer.fit_transform(X_test)

In [9]:
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(f'Số lượng mẫu của tập huấn luyện sau khi xử lý mất cân bằng là {len(y_resampled)}')
X_train, y_train = X_resampled, y_resampled

Số lượng mẫu của tập huấn luyện sau khi xử lý mất cân bằng là 54279


### Naive Bayes

In [10]:
nb = MultinomialNB()

start_time = time.time()
nb.fit(X_train, y_train)
train_time_nb = time.time() - start_time
y_pred_nb = nb.predict(X_test)

In [11]:
entries = []
entries.append(['Naive Bayes', accuracy_score(y_test,y_pred_nb),
                    precision_score(y_test, y_pred_nb, average = 'macro'), recall_score(y_test, y_pred_nb, average = 'macro'),
                    f1_score(y_test, y_pred_nb, average = 'macro'), train_time_nb])

### Logistic Regression

In [12]:
log = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000)

start_time = time.time()
log.fit(X_train, y_train)
train_time_lr = time.time() - start_time
y_pred_lr = log.predict(X_test)

In [13]:
entries.append(['Logistic Regression',accuracy_score(y_test,y_pred_lr),
                   precision_score(y_test, y_pred_lr, average = 'macro'), recall_score(y_test, y_pred_lr, average = 'macro'),
                    f1_score(y_test, y_pred_lr, average = 'macro'), train_time_lr])

### Decision Tree

In [14]:
tree = DecisionTreeClassifier()

start_time = time.time()
tree.fit(X_train, y_train)
train_time_dt = time.time() - start_time
y_pred_dt = tree.predict(X_test)

In [15]:
entries.append(['Decision Tree',accuracy_score(y_test,y_pred_dt),
                    precision_score(y_test, y_pred_dt, average = 'macro'), recall_score(y_test, y_pred_dt, average = 'macro'),
                    f1_score(y_test, y_pred_dt, average = 'macro'), train_time_dt])

### SGD Classifier

In [16]:
sgd = SGDClassifier()

start_time = time.time()
sgd.fit(X_train, y_train)
train_time_sgd = time.time() - start_time
y_pred_sgd = sgd.predict(X_test)

In [17]:
entries.append(['SGD Classifier',accuracy_score(y_test,y_pred_sgd),
                    precision_score(y_test, y_pred_sgd, average = 'macro'), recall_score(y_test, y_pred_sgd, average = 'macro'),
                    f1_score(y_test, y_pred_sgd, average = 'macro'), train_time_sgd])

### RandomForest

In [18]:
rf = RandomForestClassifier()

start_time = time.time()
rf.fit(X_train, y_train)
train_time_rf = time.time() - start_time
y_pred_rf = sgd.predict(X_test)

In [19]:
entries.append(['Random Forest',accuracy_score(y_test,y_pred_rf),
                    precision_score(y_test, y_pred_rf, average = 'macro'), recall_score(y_test, y_pred_rf, average = 'macro'),
                    f1_score(y_test, y_pred_rf, average = 'macro'), train_time_rf])

### Compare

In [20]:
cv_df = pd.DataFrame(entries, 
                     columns=['Model','Accuracy', 'Precision', 'Recall', 'F1', 'Time'])

In [21]:
cv_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Time
0,Naive Bayes,0.701119,0.666504,0.677984,0.667027,0.029273
1,Logistic Regression,0.731402,0.694996,0.707229,0.697152,23.916062
2,Decision Tree,0.75993,0.726429,0.734116,0.729724,34.204867
3,SGD Classifier,0.7088,0.661936,0.666888,0.663766,0.678718
4,Random Forest,0.7088,0.661936,0.666888,0.663766,227.628069
