# Stacking

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,precision_score,recall_score,f1_score

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

import scipy.stats as stats

from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

## Stacking

In [None]:
# ---- Training set ----
# `data` keeps the frame exactly as read from disk; `df` is an independent copy.
data = pd.read_csv("df_train_1.csv")
df = data.copy()
data.shape

# Feature matrix: every column except the target, one-hot encoded with the
# first level of each categorical column dropped.
X_train = pd.get_dummies(df.drop(columns='Overall_Experience'), drop_first=True)
X_train.shape

# Target vector.
y_train = df['Overall_Experience']
y_train.value_counts()

# ---- Test set ----
data_test = pd.read_csv("df_test_1.csv")
df_test = data_test.copy()
df_test.shape

# NOTE(review): the test frame is encoded independently of the training frame,
# which presumably relies on both files exposing the same columns and the same
# category levels -- confirm against the data.
X_test = pd.get_dummies(data_test, drop_first=True)
X_test.shape

# Row identifiers, used as the "ID" column of the exported prediction files.
y_test_ID = data_test['ID']

#---------------------------------------------
# Decision Tree
# Both classes are given the same weight (0.5 each); the seed is fixed.
d_tree = DecisionTreeClassifier(class_weight={0: 0.5, 1: 0.5}, random_state=7)
d_tree.fit(X_train, y_train)

# Predictions on the fitting data and on the test features.
y_train_pred, y_test_pred = d_tree.predict(X_train), d_tree.predict(X_test)

# Accuracy is taken from the dict form of the classification report.
d_tree_train_accuracy = classification_report(
    y_train,
    y_train_pred,
    output_dict=True,
)['accuracy']
print("Decision Tree train accuracy:", d_tree_train_accuracy)

# Put IDs and predictions side by side and export them without the index.
df_y_test_ID = y_test_ID.to_frame(name="ID")
df_y_test_pred = pd.DataFrame(y_test_pred, columns=["Overall_Experience"])
res = pd.concat([df_y_test_ID, df_y_test_pred], axis="columns")

res.to_csv("dtree.csv", index=False)


#---------------------------------------------
# Random Forest
# Forest with library-default hyper-parameters and a fixed seed.
r_forest = RandomForestClassifier(random_state=7)
r_forest.fit(X_train, y_train)

# Predictions on the fitting data and on the test features.
y_train_pred, y_test_pred = r_forest.predict(X_train), r_forest.predict(X_test)

# Accuracy is taken from the dict form of the classification report.
r_forest_train_accuracy = classification_report(
    y_train,
    y_train_pred,
    output_dict=True,
)['accuracy']
print("Random Forest train accuracy:", r_forest_train_accuracy)

# Put IDs and predictions side by side and export them without the index.
df_y_test_ID = y_test_ID.to_frame(name="ID")
df_y_test_pred = pd.DataFrame(y_test_pred, columns=["Overall_Experience"])
res = pd.concat([df_y_test_ID, df_y_test_pred], axis="columns")
res.to_csv("rforest.csv", index=False)

#---------------------------------------------
# Random Forest Tuned
# Base estimator handed to the grid search: entropy criterion, equal class
# weights, fixed seed.
r_forest_tuned = RandomForestClassifier(criterion = "entropy", random_state = 1, class_weight={0: 0.5, 1: 0.5})

# Grid of parameters to choose from.
# * 'auto' is left out: for classifiers it was an alias of 'sqrt' and newer
#   scikit-learn releases reject it.
# * None must be the Python object, not the string 'None' (an invalid value
#   that makes every candidate using it fail); it means "use all features".
# NOTE: an earlier experiment pinned n_estimators to [400] (recorded: 94.93%).
parameters = {
    "max_features": ['sqrt', 'log2', None],
    "min_samples_leaf": np.arange(1, 15, 5),
    "min_samples_split": np.arange(2, 20, 5),
    "n_estimators": np.arange(10, 110, 20),
}

# Scoring used to compare parameter combinations: F1 of the positive class.
# accuracy_score takes no pos_label argument, so the scorer wraps f1_score;
# this is the same metric the search previously requested via scoring='f1'.
scorer = metrics.make_scorer(f1_score, pos_label = 1)

# Run the 5-fold grid search on the training set, using all available cores.
grid_obj = GridSearchCV(r_forest_tuned, parameters, scoring = scorer, cv = 5, n_jobs=-1)
grid_obj = grid_obj.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refit the winning
# configuration on the whole training set, so no extra fit() is needed.
r_forest_tuned = grid_obj.best_estimator_

# Predict with the tuned model; the test predictions must be recomputed here,
# otherwise the export below would contain the previous model's output.
y_train_pred = r_forest_tuned.predict(X_train)
y_test_pred = r_forest_tuned.predict(X_test)

r_forest_tuned_train_accuracy = classification_report(y_train, y_train_pred, output_dict=True)['accuracy']
print("Random Forest Tuned train accuracy:", r_forest_tuned_train_accuracy)

# Merge IDs with predictions and export without the index.
df_y_test_pred = pd.DataFrame({"Overall_Experience" : y_test_pred})
df_y_test_ID = pd.DataFrame({"ID" : y_test_ID})
res = pd.concat([ df_y_test_ID, df_y_test_pred], axis=1)
res.to_csv("rforest_tuned_17.csv", index=False)

#---------------------------------------------
# XG Boost
import xgboost as xgb
from xgboost import XGBClassifier

data = pd.read_csv("df_train_1.csv")

# Drop the dependent variable from the dataframe and create the X(independent variable) matrix
X = data.drop(columns = 'Overall_Experience')
X = pd.get_dummies(X, drop_first = True)
y = data['Overall_Experience']

# Remove low variance features (variance threshold 0.1).  The fitted selector
# is kept so the same columns can be removed from the test features below.
from sklearn.feature_selection import VarianceThreshold

selection = VarianceThreshold(threshold = (0.1))
X = selection.fit_transform(X)

# Stratified 80/20 split; the 20% part is used as the early-stopping set.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y,  stratify = y, test_size = 0.2, random_state = 42)

X_train1.shape, X_test1.shape

# Create an XGBoost classifier.  n_estimators is only an upper bound: training
# stops once the eval-set metric has not improved for 5 rounds.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# constructor.  It is kept in fit() because this estimator object is reused by
# the stacking step, which fits it without an eval_set -- confirm against the
# installed xgboost version.
xgboost_1 = XGBClassifier(n_estimators=10000, learning_rate=0.05,)
xgboost_1.fit(X_train1, y_train1, early_stopping_rounds=5, eval_set=[(X_test1, y_test1)], verbose=0)

# Checking performance on the train split and on the hold-out split
y_train_pred1 = xgboost_1.predict(X_train1)
y_test_pred1 = xgboost_1.predict(X_test1)

# NOTE: the hold-out split also drove early stopping, so the "test" accuracy
# below is an optimistic estimate.
xgboost_1_train_accuracy = classification_report(y_train1, y_train_pred1, output_dict=True)['accuracy']
xgboost_1_test_accuracy = classification_report(y_test1, y_test_pred1, output_dict=True)['accuracy']

print("XGBoost train accuracy:", xgboost_1_train_accuracy)
print("XGBoost test accuracy:", xgboost_1_test_accuracy)

# Predict the real test set with this model.  The test features go through the
# same fitted variance filter so they match what the model was trained on;
# without this line the export would contain an earlier model's predictions.
y_test_pred = xgboost_1.predict(selection.transform(X_test))

# Merge IDs with predictions and export without the index.
df_y_test_pred = pd.DataFrame({"Overall_Experience" : y_test_pred})
df_y_test_ID = pd.DataFrame({"ID" : y_test_ID})
res = pd.concat([ df_y_test_ID, df_y_test_pred], axis=1)
res.to_csv("xgboost_2.csv", index=False)


#---------------------------------------------
# SVM
from sklearn.svm import SVC

# RBF-kernel support vector classifier.  `degree` is passed through unchanged;
# per the scikit-learn docs it only matters for the polynomial kernel.
svm_rbf = SVC(kernel='rbf', C=3.0, gamma='auto', degree=3)
svm_rbf.fit(X_train, y_train)

# Predictions on the fitting data and on the test features.
y_train_pred, y_test_pred = svm_rbf.predict(X_train), svm_rbf.predict(X_test)

# Accuracy is taken from the dict form of the classification report.
svm_rbf_train_accuracy = classification_report(
    y_train,
    y_train_pred,
    output_dict=True,
)['accuracy']
print("SVM train accuracy:", svm_rbf_train_accuracy)

# Put IDs and predictions side by side and export them without the index.
df_y_test_ID = y_test_ID.to_frame(name="ID")
df_y_test_pred = pd.DataFrame(y_test_pred, columns=["Overall_Experience"])
res = pd.concat([df_y_test_ID, df_y_test_pred], axis="columns")
res.to_csv("svm_rbf.csv", index=False)


#---------------------------------------------
# Stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Base learners, as (label, estimator) pairs in the order they were built.
# NOTE(review): xgboost_1 was created with n_estimators=10000 and relied on
# early stopping in its own fit() call; it is presumably refit here without
# that safeguard -- confirm the training cost is acceptable.
estimators = list({
    'd_tree': d_tree,
    'r_forest': r_forest,
    'r_forest_tuned': r_forest_tuned,
    'xgboost_1': xgboost_1,
    'svm_rbf': svm_rbf,
}.items())

# Meta-model: a logistic regression on top of the base learners.
stack = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=estimators,
)

# Fit the whole stack on the training set.
stack.fit(X_train, y_train)

# Predictions on the fitting data and on the test features.
y_train_pred, y_test_pred = stack.predict(X_train), stack.predict(X_test)

# Accuracy is taken from the dict form of the classification report.
stack_train_accuracy = classification_report(
    y_train,
    y_train_pred,
    output_dict=True,
)['accuracy']

print("Stack train accuracy:", stack_train_accuracy)

# Put IDs and predictions side by side and export them without the index.
df_y_test_ID = y_test_ID.to_frame(name="ID")
df_y_test_pred = pd.DataFrame(y_test_pred, columns=["Overall_Experience"])
res = pd.concat([df_y_test_ID, df_y_test_pred], axis="columns")

res.to_csv("stacking4.csv", index=False)