In [29]:
import pandas as pd
import numpy as np

In [30]:
# Load the labelled training set and discard the index column that was
# written into the CSV as 'Unnamed: 0'.
df = pd.read_csv(
    "https://raw.githubusercontent.com/dphi-official/Datasets/master/tax_payers/train_set_label.csv"
).drop(columns='Unnamed: 0')

In [31]:
# Preview the first five training rows.
df.head(n=5)

Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,PoliticalParty
0,183414,-12402,1,2,28,1,0,0,1,Democrat
1,129786,700251,2,5,63,3,0,0,0,Republican
2,268343,493411,2,4,44,1,1,0,1,Independent
3,290506,-433408,1,1,28,5,1,0,1,Republican
4,90108,907135,1,3,57,5,1,1,0,Democrat


In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [33]:
# Sanity check: list the distinct values of the 2015 filing flag.
df.loc[:, 'Filed in 2015'].unique()

array([1, 0], dtype=int64)

In [34]:
# Preprocessors: `sc` standardises the numeric features, `le` turns the
# PoliticalParty strings into integer class codes.
sc, le = StandardScaler(), LabelEncoder()

In [35]:
# Continuous features that are passed through the StandardScaler.
numeric_columns = [
    'HHI',
    'HHDL',
    'AHHAge',
]

In [36]:
# Fit the scaler on the training data and overwrite the numeric columns with
# their standardised values.
# NOTE: this mutates `df` and re-fits `sc`; re-running the cell without
# reloading `df` would fit the scaler on already-scaled values.
scaled_numeric = sc.fit_transform(df[numeric_columns])
df[numeric_columns] = scaled_numeric
df.head()

Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,PoliticalParty
0,0.322956,0.004958,1,2,-1.299476,1,0,0,1,Democrat
1,-0.295347,1.20307,2,5,0.097417,3,0,0,0,Republican
2,1.302144,0.855331,2,4,-0.660896,1,1,0,1,Independent
3,1.557672,-0.702837,1,1,-1.299476,5,1,0,1,Republican
4,-0.752815,1.550883,1,3,-0.14205,5,1,1,0,Democrat


In [37]:
# Replace the party names with integer class codes learnt by the LabelEncoder.
# NOTE: this overwrites the original strings in `df`; re-running the cell
# without reloading `df` would fit `le` on the integer codes instead.
party_codes = le.fit_transform(df['PoliticalParty'])
df['PoliticalParty'] = party_codes
df.head()

Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,PoliticalParty
0,0.322956,0.004958,1,2,-1.299476,1,0,0,1,0
1,-0.295347,1.20307,2,5,0.097417,3,0,0,0,2
2,1.302144,0.855331,2,4,-0.660896,1,1,0,1,1
3,1.557672,-0.702837,1,1,-1.299476,5,1,0,1,2
4,-0.752815,1.550883,1,3,-0.14205,5,1,1,0,0


In [38]:
# Discrete features that are one-hot encoded with pd.get_dummies.
cat_columns = [
    'Married',
    'CollegGrads',
    'Cars',
]

In [39]:
# One-hot encode the categorical features; drop_first removes the first level
# of each feature so it acts as the implicit baseline.
processed_df = pd.get_dummies(
    data=df,
    columns=cat_columns,
    drop_first=True,
)

In [40]:
# Display the encoded training frame (scaled numerics, filing flags, target, dummies).
processed_df

Unnamed: 0,HHI,HHDL,AHHAge,Filed in 2017,Filed in 2016,Filed in 2015,PoliticalParty,Married_1,Married_2,CollegGrads_1,CollegGrads_2,CollegGrads_3,CollegGrads_4,CollegGrads_5,Cars_1,Cars_2,Cars_3,Cars_4,Cars_5
0,0.322956,0.004958,-1.299476,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0
1,-0.295347,1.203070,0.097417,0,0,0,2,0,1,0,0,0,0,1,0,0,1,0,0
2,1.302144,0.855331,-0.660896,1,0,1,1,0,1,0,0,0,1,0,1,0,0,0,0
3,1.557672,-0.702837,-1.299476,1,0,1,2,1,0,1,0,0,0,0,0,0,0,0,1
4,-0.752815,1.550883,-0.142050,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798,-0.024197,1.659781,1.534222,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
799,-0.216186,-0.284568,-1.179742,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
800,-1.202949,-1.487711,0.257062,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0
801,-0.365274,-0.060686,-1.499032,0,1,1,2,0,1,0,0,0,0,0,0,0,0,1,0


In [41]:
# Split the encoded frame into the feature matrix `x` (every column except the
# target) and the label vector `y` (integer-encoded PoliticalParty).
x = processed_df.drop('PoliticalParty', axis=1).to_numpy()
y = processed_df['PoliticalParty'].to_numpy()

In [42]:
# Load the unlabelled test set whose PoliticalParty values are to be predicted.
test_data = pd.read_csv(
    'https://raw.githubusercontent.com/dphi-official/Datasets/master/tax_payers/test_set_label.csv'
)

In [43]:
# Inspect the raw test set (still carries the saved index column 'Unnamed: 0').
test_data

Unnamed: 0.1,Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015
0,17,253274,-633752,2,1,94,3,0,1,1
1,590,47107,322850,0,1,30,2,1,1,0
2,224,111874,300583,0,2,46,3,0,1,1
3,960,96670,107419,0,3,77,5,0,0,0
4,57,128669,341273,0,0,92,4,1,1,1
...,...,...,...,...,...,...,...,...,...,...
196,51,138204,144986,1,1,63,1,1,1,0
197,705,73250,-810927,0,5,40,1,1,0,1
198,542,134376,352116,0,2,86,1,1,0,0
199,929,70820,260337,0,3,78,3,1,0,1


In [44]:
# Prepare the test features with the same steps used for the training data.

# Drop the index column that was saved into the CSV.
test_data = test_data.drop(columns='Unnamed: 0')

# Reuse the scaler fitted on the training set (transform only, never re-fit).
test_data[numeric_columns] = sc.transform(test_data[numeric_columns])

# pd.get_dummies derives its columns from the values present in *this* frame.
# With drop_first=True the dropped baseline level could therefore differ from
# the one dropped for the training data, and categories missing from (or
# unique to) the test set would give xtest a different width/order than x.
# Encode every level instead, then align to the training feature columns:
# the training baseline and unseen levels are dropped, and any dummy that is
# absent from the test set becomes an all-zero column.
processed_test_data = pd.get_dummies(test_data, columns=cat_columns).reindex(
    columns=processed_df.columns.drop('PoliticalParty'),
    fill_value=0,
)
processed_test_data

Unnamed: 0,HHI,HHDL,AHHAge,Filed in 2017,Filed in 2016,Filed in 2015,Married_1,Married_2,CollegGrads_1,CollegGrads_2,CollegGrads_3,CollegGrads_4,CollegGrads_5,Cars_1,Cars_2,Cars_3,Cars_4,Cars_5
0,1.128406,-1.039655,1.334665,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0
1,-1.248594,0.568584,-1.219653,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0
2,-0.501864,0.531148,-0.581074,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0
3,-0.677158,0.206401,0.656174,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,-0.308226,0.599556,1.254843,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-0.198292,0.269559,0.097417,1,1,0,1,0,1,0,0,0,0,1,0,0,0,0
197,-0.947179,-1.337521,-0.820541,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0
198,-0.242427,0.617786,1.015376,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0
199,-0.975195,0.463487,0.696086,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0


In [45]:
# Convert the prepared test frame into the plain array passed to predict().
xtest = processed_test_data.to_numpy()
xtest

array([[ 1.12840642, -1.03965471,  1.33466542, ...,  1.        ,
         0.        ,  0.        ],
       [-1.24859419,  0.5685836 , -1.21965347, ...,  0.        ,
         0.        ,  0.        ],
       [-0.50186363,  0.53114835, -0.58107375, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.24242702,  0.61778557,  1.01537556, ...,  0.        ,
         0.        ,  0.        ],
       [-0.97519536,  0.46348681,  0.6960857 , ...,  1.        ,
         0.        ,  0.        ],
       [-0.47147189,  1.59047183, -0.90036361, ...,  0.        ,
         0.        ,  0.        ]])

In [46]:
# Random forest baseline. The model is stochastic (bootstrap samples and
# random feature subsets), so the seed is fixed to make the predictions
# reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf.fit(x, y)
pred_rf = rf.predict(xtest)

In [47]:
# Random-forest predictions for the test set, as LabelEncoder integer codes.
pred_rf

array([1, 0, 2, 1, 2, 0, 1, 2, 1, 0, 1, 1, 1, 2, 0, 2, 1, 2, 2, 0, 1, 2,
       2, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 2, 2, 2, 1, 1, 2, 2, 2, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 1, 0, 2, 0, 1, 0,
       2, 1, 2, 0, 0, 0, 2, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 2, 0, 2, 1, 0,
       0, 2, 1, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 2, 2, 2, 1, 2, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 2, 0, 0, 0, 2, 2, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 2, 1, 2, 1, 0, 1,
       0, 0, 1, 2, 0, 0, 2, 1, 1, 0, 2, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 0,
       1, 1, 1])

In [48]:
# Support-vector classifier with default hyper-parameters; fit() returns the
# estimator itself, so construction and training are chained.
svc = SVC().fit(x, y)
pred_svc = svc.predict(xtest)

In [49]:
# SVC predictions for the test set, as LabelEncoder integer codes.
pred_svc

array([1, 0, 0, 0, 2, 0, 1, 2, 2, 0, 1, 2, 1, 2, 0, 2, 0, 0, 0, 2, 0, 2,
       0, 0, 2, 1, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 2, 2, 2, 0, 2, 1, 1, 0, 1, 0, 2, 2, 1, 0,
       2, 1, 2, 0, 2, 2, 2, 1, 0, 1, 0, 1, 0, 1, 2, 0, 2, 1, 0, 2, 1, 0,
       0, 2, 2, 0, 2, 0, 1, 0, 0, 1, 0, 2, 1, 2, 2, 1, 1, 0, 1, 1, 0, 0,
       2, 0, 1, 0, 2, 1, 0, 1, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 1, 1, 0, 1,
       2, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2, 0, 0, 2, 2, 0, 2, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 2, 0, 0, 2, 0, 2, 1, 2, 1, 2, 0, 1, 2, 2, 0, 2, 2, 0, 1,
       2, 2, 1])

In [50]:
# Wrap the SVC predictions in a single-column frame named 'prediction'.
pred_svc_df = pd.DataFrame({'prediction': pred_svc})
pred_svc_df

Unnamed: 0,prediction
0,1
1,0
2,0
3,0
4,2
...,...
196,0
197,1
198,2
199,2


In [51]:
# Write the SVC predictions to disk without the row index.
pred_svc_df.to_csv(path_or_buf='predictions.csv', index=False)

In [52]:
# Read the file back to confirm it was written in the expected format.
pd.read_csv('predictions.csv').head(n=5)

Unnamed: 0,prediction
0,1
1,0
2,0
3,0
4,2


In [53]:
# Wrap the random-forest predictions in a single-column frame and save them.
pred_rf_df = pd.DataFrame(pred_rf)
pred_rf_df.columns = ['prediction']
# Fixed copy-paste bug: the original wrote pred_svc_df here, so
# predictions2.csv was a duplicate of the SVC file and the random-forest
# predictions were never saved.
pred_rf_df.to_csv('predictions2.csv', index=False)