# Classification outputs

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
from scipy.stats import skew, boxcox
from scipy import sparse
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
%matplotlib inline

Load probability data 

In [2]:
df_prob = pd.read_csv('../data/xgb_prob_opt1.csv')
df = pd.read_csv('../data/input_proc2.csv') # with original label
df = pd.concat([df_prob, df.SPENDINGRESPONSE], axis=1)

In [3]:
train_size = 20000
df.SPENDINGRESPONSE[train_size:].isnull().sum()

9231

Comput accuracy to find the threshold of predicted probablity.  

In [4]:
Y = df.SPENDINGRESPONSE[:train_size]
X = df.Probability[:train_size]

In [5]:
X.describe()

count    20000.000000
mean         0.310144
std          0.022279
min          0.233361
25%          0.294703
50%          0.309643
75%          0.324759
max          0.400382
Name: Probability, dtype: float64

Split dataset to train and test. Use train dataset to find the threshold of predicted probablity, and test the results on the test dataset.

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1234)
x_train_sort = sorted(x_train, reverse=True)

In [7]:
count=0
for i in y_train:
    if i==1:
        count +=1
threh = x_train_sort[count]
threh

0.32072246

In [8]:
y_pred = [1 if x > threh else 0 for x in x_test]

Performance measures

In [9]:
print(classification_report(y_pred, y_test))

             precision    recall  f1-score   support

          0       0.74      0.75      0.74      2740
          1       0.44      0.43      0.43      1260

avg / total       0.64      0.65      0.65      4000



In [10]:
print(accuracy_score(y_pred, y_test))

0.6475


In [11]:
print(confusion_matrix(y_pred, y_test))

[[2054  686]
 [ 724  536]]


Generate final submission.

In [13]:
y_class_full = [1 if x > threh else 0 for x in df.Probability]
len(y_class_full)

29231

In [14]:
y_class_full_word = ["Spend to Improve Economy" if x==1 else "Reduce National Debt and Deficit" for x in y_class_full]

In [15]:
fnl_sub_df = pd.DataFrame({'ID': df.ID, 'Probability': df.Probability, 'Classification_1_0': y_class_full, 'Classification': y_class_full_word})
fnl_sub_df = fnl_sub_df[['ID','Probability','Classification_1_0','Classification']]
fnl_sub_df.to_csv("../data/final_submission.csv", index=False)