<a href="https://colab.research.google.com/github/toan01-uet/sentiment/blob/main/stacking_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


In [2]:
# Read the cleaned training set, keeping only the text and label columns.
data = pd.read_csv(
    "/content/drive/MyDrive/DataScience/clean_train_data.csv",
    usecols=["comment", "label"],
)
data.head()

Unnamed: 0,comment,label
0,dung dc sp tot cam on shop đóng_gói sản_phẩm đ...,0
1,chất_lượng sản_phẩm tuyệt_vời son mịn đánh màu...,0
2,chất_lượng sản_phẩm tuyệt_vời k hộp k dây giày...,0
3,hơi thất_vọng chút kỳ_vọng sách hi_vọng học_tậ...,1
4,mua áo_gió màu hồng ok đợt giao áo_gió chất vả...,1


In [3]:
# Peek at the full text of the first comment (row label 0).
data.loc[0, 'comment']

'dung dc sp tot cam on shop đóng_gói sản_phẩm đẹp chất_lượng sản_phẩm tuyệt_vời'

In [4]:
# Show column dtypes and non-null counts to spot missing values before cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16087 entries, 0 to 16086
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  16049 non-null  object
 1   label    16087 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 251.5+ KB


In [5]:
# Discard every row that has a missing value in any column, then re-inspect.
data = data.dropna(axis=0, how="any")
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16049 entries, 0 to 16086
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  16049 non-null  object
 1   label    16049 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 376.1+ KB


In [6]:
# Class balance: number of rows for each label value.
data["label"].value_counts()

0    9253
1    6796
Name: label, dtype: int64

In [7]:
# Character length of every comment, computed once and reused for all three stats.
result = [len(text) for text in data['comment'].values]
print("Maximum review length: {}".format(max(result)))
print("Minimum review length: {}".format(min(result)))
print("Mean review length: {}".format(np.mean(result)))

Maximum review length: 1392
Minimum review length: 1
Mean review length: 53.209919621160196


In [8]:
from sklearn.decomposition import TruncatedSVD

# Feature-extraction pipeline: TF-IDF over 1- and 2-grams, followed by a
# truncated SVD that keeps 500 components of the TF-IDF matrix.
clf = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_df=0.3, norm="l2"),
        ),
        ('svd', TruncatedSVD(n_components=500, random_state=42)),
    ]
)

In [9]:
# Hold out 25% of the rows for testing; rows are shuffled with a fixed seed.
train, test = train_test_split(data, test_size=0.25, shuffle=True, random_state=1)

X_train, Y_train = train["comment"], train["label"]
X_test, Y_test = test["comment"], test["label"]
print(X_train.shape, X_test.shape, sep="\n")

(12036,)
(4013,)


In [10]:
# Fit the feature pipeline (`clf`) on the training comments only, then apply
# the already-fitted pipeline to the test comments so that nothing is learned
# from the test split. Each comment becomes one row of the transformed output.
X_train_vec = clf.fit_transform(X_train)
X_test_vec = clf.transform(X_test)


In [17]:
# Convert the label Series to plain NumPy arrays so they can be indexed by
# position: the fold indices used later are positional, while the Series
# still carry the shuffled row labels from the train/test split.
# np.asarray (unlike `.values`) also accepts an array, so re-running this
# cell no longer raises AttributeError.
Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Perceptron

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import recall_score, precision_score,f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix


In [13]:
from sklearn.model_selection import StratifiedKFold

ntrain = X_train_vec.shape[0]
ntest = X_test_vec.shape[0]
SEED = 42  # for reproducibility
NFOLDS = 5  # set folds for out-of-fold prediction
# random_state is only meaningful together with shuffle=True, and current
# scikit-learn releases raise ValueError when it is passed without shuffling.
# The rows were already shuffled by train_test_split, so keep the folds
# unshuffled (deterministic) and drop the seed here.
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=False)

def get_oof(clf, x_train, y_train, x_test, cv=None):
    """Compute out-of-fold (OOF) positive-class probabilities for stacking.

    For every fold produced by ``cv``, ``clf`` is fitted on the fold's training
    rows and used to predict (a) the fold's held-out rows and (b) the whole of
    ``x_test``. The held-out predictions are written into their original row
    positions; the per-fold test predictions are averaged.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is refitted
        in place on each fold, so on return it holds the last fold's fit.
    x_train : array-like with ``.shape`` and positional row indexing
        Training features; must support ``x_train[int_index_array]``.
    y_train : array-like
        Training labels. Converted with ``np.asarray`` so lists and pandas
        Series are indexed by position.
    x_test : array-like with ``.shape``
        Test features.
    cv : splitter, optional
        Object whose ``split(X, y)`` yields ``(train_idx, valid_idx)`` pairs.
        Defaults to the module-level ``kf``.

    Returns
    -------
    (oof_train, oof_test) : tuple of ndarray
        Column vectors of shape ``(n_train, 1)`` and ``(n_test, 1)`` holding
        column 1 of ``predict_proba``.
    """
    if cv is None:
        cv = kf  # fall back to the notebook-wide splitter

    # Sizes come from the arguments rather than the ntrain/ntest globals, so
    # the function stays correct for any input passed to it.
    y_train = np.asarray(y_train)
    oof_train = np.zeros((x_train.shape[0],))
    test_fold_preds = []

    for train_index, valid_index in cv.split(x_train, y_train):
        clf.fit(x_train[train_index], y_train[train_index])

        # Each training row is predicted by a model that never saw it.
        oof_train[valid_index] = clf.predict_proba(x_train[valid_index])[:, 1]
        test_fold_preds.append(clf.predict_proba(x_test)[:, 1])

    # Average the test-set predictions over the folds.
    oof_test = np.mean(test_fold_preds, axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)



**First-Level Models**

In [18]:
# Level-0 (base) learners; the first three are seeded with SEED.
lr = LogisticRegression(random_state=SEED)
dtc = DecisionTreeClassifier(random_state=SEED)
rfc = RandomForestClassifier(random_state=SEED)
gau_nb = GaussianNB()
ber_nb = BernoulliNB()

# Run the out-of-fold routine for each learner in turn. The resulting
# (train, test) prediction pairs become the features of the second level.
oof_by_model = {
    tag: get_oof(model, X_train_vec, Y_train, X_test_vec)
    for tag, model in (
        ("lr", lr),        # LogisticRegression
        ("dtc", dtc),      # DecisionTreeClassifier
        ("rfc", rfc),      # RandomForestClassifier
        ("gau", gau_nb),   # GaussianNB
        ("ber", ber_nb),   # BernoulliNB
    )
}
lr_oof_train, lr_oof_test = oof_by_model["lr"]
dtc_oof_train, dtc_oof_test = oof_by_model["dtc"]
rfc_oof_train, rfc_oof_test = oof_by_model["rfc"]
gau_oof_train, gau_oof_test = oof_by_model["gau"]
ber_oof_train, ber_oof_test = oof_by_model["ber"]
print("Training is complete")

Training is complete


In [19]:
# Collect each base model's out-of-fold training predictions side by side,
# one column per model, for inspection.
base_predictions_train = pd.DataFrame({
    'LogisticRegression': lr_oof_train.ravel(),
    'DecisionTree': dtc_oof_train.ravel(),
    'RandomForest': rfc_oof_train.ravel(),  # key used to carry a stray leading space
    'GaussianNB': gau_oof_train.ravel(),
    'BernoulliNB': ber_oof_train.ravel(),
})
base_predictions_train.head()

Unnamed: 0,LogisticRegression,DecisionTree,RandomForest,GaussianNB,BernoulliNB
0,0.091472,0.0,0.18,2.227288e-11,0.221683
1,0.937045,1.0,0.8,8.683265e-12,0.910055
2,0.582718,0.0,0.53,1.0,0.558867
3,0.911952,1.0,0.671,0.9997975,0.625036
4,0.505166,0.0,0.41125,0.0006240134,0.529703


In [23]:
# Summary statistics of each base model's out-of-fold predictions.
base_predictions_train.describe()

Unnamed: 0,LogisticRegression,DecisionTree,RandomForest,GaussianNB,BernoulliNB
count,12036.0,12036.0,12036.0,12036.0,12036.0
mean,0.421124,0.411245,0.421098,0.4621306,0.421665
std,0.345069,0.484155,0.246983,0.4881231,0.334384
min,0.00016,0.0,0.0,2.279948e-74,1.6e-05
25%,0.063126,0.0,0.2,1.260852e-13,0.093887
50%,0.405787,0.0,0.429904,0.02159354,0.367153
75%,0.748824,1.0,0.63,1.0,0.74424
max,0.999394,1.0,1.0,1.0,0.995713


**Second-Level Model (meta-model: LogisticRegression)**

In [24]:
# Second-level feature matrices: the per-model prediction columns placed side
# by side, in the same model order for train and test.
x_train = np.hstack([lr_oof_train, dtc_oof_train, rfc_oof_train, gau_oof_train, ber_oof_train])
x_test = np.hstack([lr_oof_test, dtc_oof_test, rfc_oof_test, gau_oof_test, ber_oof_test])

In [25]:
# Meta-learner: a logistic regression fitted on the base models' out-of-fold
# predictions, then used to label the test set.
lr = LogisticRegression(random_state=SEED).fit(x_train, Y_train)
pred = lr.predict(x_test)


In [28]:
# `import sklearn` on its own does not load the `metrics` submodule; import it
# explicitly so this cell does not rely on a side effect of earlier imports.
import sklearn.metrics

# Per-class precision / recall / F1 of the stacked model on the held-out test set.
print(sklearn.metrics.classification_report(Y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2287
           1       0.84      0.86      0.85      1726

    accuracy                           0.87      4013
   macro avg       0.87      0.87      0.87      4013
weighted avg       0.87      0.87      0.87      4013

