### Stacking ensemble for multi-class classification

In this notebook, we build a stacking ensemble with Random Forest, SVM, Naive Bayes, XGBoost, Logistic Regression,
and K-Nearest Neighbors as base learners, and use Logistic Regression as the stage-2 model. On the test set it achieves 69.92% accuracy and a weighted F1-score of 0.68.

In [2]:
import pandas as pd
import pickle as pkl
import numpy as np

# Never truncate wide DataFrames when they are displayed: show every column.
pd.options.display.max_columns = None

In [3]:
# Read the train / validation / test splits; each one is a tab-separated file.
train_data, val_data, test_data = [
    pd.read_csv(path, sep='\t') for path in ("train.tsv", "val.tsv", "test.tsv")
]

In [4]:
# Preview the first five training rows to check the columns that were loaded.
train_data.head(n=5)

Unnamed: 0,title,text,titletext,label
0,Ongoing dry nose and yellow boogers since arou...,[removed],Ongoing dry nose and yellow boogers since arou...,0
1,Requesting help,"I started feeling ill June 19, I got tested 2 ...","Requesting help I started feeling ill June 19,...",2
2,What do you think of my moms symptoms. Should ...,So a week ago my mom vomited twice unexplainab...,What do you think of my moms symptoms. Should ...,0
3,How should I prepare as a single person,Hi All. I’m single and live alone. All my fami...,How should I prepare as a single person Hi All...,0
4,How long were yall testing positive ?,Im at day 25 pretty much asymptomatic . Only t...,How long were yall testing positive ? Im at da...,2


In [5]:
from string import punctuation

# Translation table that deletes every character of string.punctuation in one pass.
_PUNCT_TABLE = str.maketrans('', '', punctuation)


def preprocess(seq):
    """Remove punctuation from ``seq`` and turn newlines into spaces.

    Args:
        seq: Raw text as a ``str``.

    Returns:
        A new string in which every character listed in
        ``string.punctuation`` has been dropped and every ``'\\n'`` has been
        replaced by a single space.
    """
    seq = seq.translate(_PUNCT_TABLE)
    # Strings are immutable: replace() returns a new string, so its result
    # has to be returned rather than discarded.
    return seq.replace('\n', ' ')

# For every split: derive `clean_text` from `titletext` (cast to str first), then keep
# only the model input column and the target column.
train_data, val_data, test_data = [
    frame.assign(clean_text=frame['titletext'].astype(str).apply(preprocess))[['clean_text', 'label']]
    for frame in (train_data, val_data, test_data)
]

In [6]:
# Separate each split into its input series (cleaned text) and its target series (label).
X_train, y_train = train_data['clean_text'], train_data['label']
X_val, y_val = val_data['clean_text'], val_data['label']
X_test, y_test = test_data['clean_text'], test_data['label']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# TF-IDF features with English stop words removed. The vectorizer is fitted on the
# training texts only; validation and test texts are transformed with that fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train = tfidf_vectorizer.fit_transform(X_train)
X_val, X_test = (tfidf_vectorizer.transform(texts) for texts in (X_val, X_test))

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import xgboost as xgb
from sklearn.model_selection import KFold

In [22]:
# Fixed seed for every learner that accepts one, so that re-running the notebook
# from a fresh kernel reproduces the same fitted ensemble.
RANDOM_STATE = 42

# First-stage learners, each given as a (name, estimator) pair.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=RANDOM_STATE)),
    ('svm', SVC(class_weight='balanced')),
    ('NB', MultinomialNB()),
    ('xgb', xgb.XGBClassifier(objective='multi:softmax', n_estimators=200, max_depth=10,
                              random_state=RANDOM_STATE)),
    ('lr', LogisticRegression(class_weight='balanced', solver='sag', max_iter=1000,
                              random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
]

# Second-stage model: a logistic regression that combines the first-stage learners.
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(solver='sag', random_state=RANDOM_STATE),
)

clf.fit(X_train, y_train)

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(class_weight='balanced',
                                                       n_estimators=300)),
                               ('svm', SVC(class_weight='balanced')),
                               ('NB', MultinomialNB()),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None, gamma=None,
                                              gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=...
                                              num_parallel_tree=None,
                                              objective='multi:softmax',

In [24]:
# Accuracy of the stacked model on the validation split.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is symmetric, so
# the value is the same either way, but the order is kept correct for consistency
# with the asymmetric metrics used in this notebook.
y_hat = clf.predict(X_val)
accuracy_score(y_val, y_hat)

0.6850848681834597

In [27]:
import pickle as pkl

# Persist the fitted ensemble to disk. The context manager closes the file when the
# block exits, so the pickle is fully flushed and no handle is left open in the kernel.
with open("stacking_ensemble", 'wb') as f:
    pkl.dump(clf, f)

In [10]:
# Reload the previously saved ensemble; the context manager closes the file afterwards.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load a
# "stacking_ensemble" file that you produced yourself or otherwise trust.
with open("stacking_ensemble", 'rb') as f:
    clf = pkl.load(f)

In [11]:
# Accuracy of the stacked model on the held-out test split.
# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy is symmetric, so
# the reported value is unchanged.
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6991552956465237

In [21]:
# F1 score on the test split, averaged across classes with the 'weighted' scheme.
f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

0.6758429605615863