# OUR TREM NAME

## Project: The Advertising Firm
![](https://upload.wikimedia.org/wikipedia/en/7/7b/Don_Draper_Wiki.jpg)

We are an ad agency working on personalizing ads, so that you aren't bothered by ads you don’t care about, and only see stuff you may be interested in. 

We are promoting a product that people with income <50k are not interested in. 
**We want to market to people with income greater then $50,000**

We start with census data (94-95) on demographics, age, education, country, occupation, income, etc.
The data was split into train/test in approximately 2/3, 1/3 split.

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
import data_prep
ROOT_DIR = os.path.dirname('data/')

train_file_path = os.path.abspath(os.path.join(ROOT_DIR, 'census-income.csv'))
data1=data_prep.load_file(train_file_path)
train_df=data_prep.clean_data(data1)

test_file_path = os.path.abspath(os.path.join(ROOT_DIR, 'census-income-test.csv'))
data2=data_prep.load_file(test_file_path)
test_df=data_prep.clean_data(data2)

In [6]:
train_df.columns

Index([u'target', u'detailed_hsld_smry_1', u'region_prev_res_1',
       u'marital_status_1', u'marital_status_2', u'worker_class_1',
       u'worker_class_2', u'worker_class_3', u'age', u'wage', u'dividends',
       u'no_persons_worked_for_employer', u'weeks_worked_in_yr',
       u'fill_inc_question_veteran_admin_1', u'sex_1', u'enrolled_last_wk_1',
       u'citizenship_1', u'employment_status_1', u'occupation_1',
       u'hispanic_origin_1', u'tax_filer_1', u'tax_filer_2', u'year_1',
       u'veteran_benefit_1', u'member_labour_union_1', u'race_1',
       u'unemployment_reason_1', u'family_members_under18_1',
       u'same_house_prev_yr_1', u'capital_gains_1', u'education_1',
       u'education_2', u'migration_1', u'detailed_occ_cd_A',
       u'detailed_occ_cd_B', u'detailed_occ_cd_C', u'detailed_occ_cd_D',
       u'detailed_ind_cd_A', u'detailed_ind_cd_B', u'detailed_ind_cd_C',
       u'detailed_ind_cd_D'],
      dtype='object')

In [7]:
test_df.head(3)

Unnamed: 0,target,detailed_hsld_smry_1,region_prev_res_1,marital_status_1,marital_status_2,worker_class_1,worker_class_2,worker_class_3,age,wage,...,education_2,migration_1,detailed_occ_cd_A,detailed_occ_cd_B,detailed_occ_cd_C,detailed_occ_cd_D,detailed_ind_cd_A,detailed_ind_cd_B,detailed_ind_cd_C,detailed_ind_cd_D
0,0,0,0,0,0,0,0,0,44,0,...,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,0,0,0,0,0,0,0,2,0,...,0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0,0,0,0,0,0,0,0,35,0,...,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## KNN classifier

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
def knn_classifier(X_train,Y_train,X_test,Y_test):
    k=10
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train,np.ravel(Y_train))
    print "KNN"
    #acc=acc.append(pd.DataFrame([[k,knn_model.score(X_test,np.ravel(Y_test)),np.nan]],columns=['k','knn_acc','logit_acc']))
    print "K="+str(k)+ " " + "Accuracy="+str(knn_model.score(X_test,np.ravel(Y_test)))
    return knn_model

# Decision tree Classifier

In [21]:
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

In [22]:
def dt_classifier(X_train,Y_train,X_test,Y_test):
    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=1,random_state=0)
    clf.fit(X_train,Y_train)
    scores=clf.score(X_test,Y_test)
    print "Decision tree"
    print scores
    return clf

## Random Forest Classifier

In [23]:
def rf_classifier(X_train,Y_train,X_test,Y_test):
    clf = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=1, random_state=0)
    clf.fit(X_train,Y_train)
    scores = clf.score(X_test, Y_test)
    print "Random Forest"
    print scores
    return clf

## Extra tree classifier

In [24]:
def et_classifier(X_train,Y_train,X_test,Y_test):
    clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,min_samples_split=1, random_state=0)
    clf.fit(X_train,Y_train)
    scores = clf.score(X_test,Y_test)
    print "Extra tree"
    print scores
    return clf

### Without Down-sample

In [25]:
train_df_y=train_df['target']
train_df_x=train_df.drop(['target'],axis=1,inplace=False)
test_df_y=test_df['target']
test_df_x=test_df.drop(['target'],axis=1,inplace=False)
master_test_df=test_df_x[:]
master_test_df['target']=test_df_y
model_knn=knn_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
model_dt=dt_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
model_rf=rf_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
model_et=et_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
master_test_df['target_pred_knn']=model_knn.predict(test_df_x)
master_test_df['target_pred_rf']=model_rf.predict(test_df_x)
master_test_df['target_pred_dt']=model_dt.predict(test_df_x)
master_test_df['target_pred_et']=model_et.predict(test_df_x)
#test_df_x['target_pred']=model.predict(test_df_x)
#test_df_x['target']=test_df_y
#print test_df_x.groupby(['target','target_pred'])['target'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KNN
K=10 Accuracy=0.946368251785
Decision tree
0.935658083699
Random Forest
0.942211614313
Extra tree
0.941503671191


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [35]:
print "KNN"
print master_test_df.groupby(['target','target_pred_knn'])['target'].count()
print "DT"
print master_test_df.groupby(['target','target_pred_dt'])['target'].count()
print "RF"
print master_test_df.groupby(['target','target_pred_rf'])['target'].count()
print "ET"
print master_test_df.groupby(['target','target_pred_et'])['target'].count()
print "Logistic"
print master_test_df.groupby(['target','target_pred_logistic'])['target'].count()

KNN
target  target_pred_knn
0       0                  91666
        1                   1026
1       0                   4277
        1                   1909
Name: target, dtype: int64
DT
target  target_pred_dt
0       0                 90076
        1                  2616
1       0                  3746
        1                  2440
Name: target, dtype: int64
RF
target  target_pred_rf
0       0                 90833
        1                  1859
1       0                  3855
        1                  2331
Name: target, dtype: int64
ET
target  target_pred_et
0       0                 90886
        1                  1806
1       0                  3978
        1                  2208
Name: target, dtype: int64
Logistic
target  target_pred_logistic
0       0                       91963
        1                         729
1       0                        4691
        1                        1495
Name: target, dtype: int64


In [34]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

train_df_y=train_df['target']
train_df_x=train_df.drop(['target'],axis=1,inplace=False)
test_df_y=test_df['target']
test_df_x=test_df.drop(['target'],axis=1,inplace=False)

model=sklearn.linear_model.LogisticRegression()
model.fit(train_df_x,train_df_y)
master_test_df['target_pred_logistic']=model.predict(test_df_x)
accuracy_logistic=accuracy_score(test_df_y, master_test_df['target_pred_logistic'])
print accuracy_logistic

0.945184975424


### Down-sampling

In [28]:
train_df_x=train_df.loc[train_df['target']==1,:]
rows=np.random.choice(train_df.loc[train_df['target']==0,:].index.values,4*train_df_x.shape[0],replace=False)
train_df_x=train_df_x.append(train_df.ix[set(rows)])
train_df_y=train_df_x['target']
train_df_x=train_df_x.drop(['target'],axis=1,inplace=False)
test_df_y=test_df['target']
test_df_x=test_df.drop(['target'],axis=1,inplace=False)
master_test_df_ds=test_df_x[:]
master_test_df_ds['target']=test_df_y
model_knn=knn_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
model_dt=dt_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
model_rf=rf_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
model_et=et_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
master_test_df_ds['target_pred_knn']=model_knn.predict(test_df_x)
master_test_df_ds['target_pred_rf']=model_rf.predict(test_df_x)
master_test_df_ds['target_pred_dt']=model_dt.predict(test_df_x)
master_test_df_ds['target_pred_et']=model_et.predict(test_df_x)
#model=knn_classifier(train_df_x,train_df_y,test_df_x,test_df_y)
#test_df_x['target_pred']=model.predict(test_df_x)
#test_df_x['target']=test_df_y
#print test_df_x.groupby(['target','target_pred'])['target'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KNN
K=10 Accuracy=0.902010558466
Decision tree
0.893161269443
Random Forest
0.906521167499
Extra tree
0.907330245353


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [37]:
print "KNN"
print master_test_df_ds.groupby(['target','target_pred_knn'])['target'].count()
print "DT"
print master_test_df_ds.groupby(['target','target_pred_dt'])['target'].count()
print "RF"
print master_test_df_ds.groupby(['target','target_pred_rf'])['target'].count()
print "ET"
print master_test_df_ds.groupby(['target','target_pred_et'])['target'].count()
print "Logistic"
print master_test_df_ds.groupby(['target','target_pred_logistic'])['target'].count()

KNN
target  target_pred_knn
0       0                  84871
        1                   7821
1       0                   1868
        1                   4318
Name: target, dtype: int64
DT
target  target_pred_dt
0       0                 84469
        1                  8223
1       0                  2341
        1                  3845
Name: target, dtype: int64
RF
target  target_pred_rf
0       0                 85654
        1                  7038
1       0                  2205
        1                  3981
Name: target, dtype: int64
ET
target  target_pred_et
0       0                 85938
        1                  6754
1       0                  2409
        1                  3777
Name: target, dtype: int64
Logistic
target  target_pred_logistic
0       0                       86415
        1                        6277
1       0                        2215
        1                        3971
Name: target, dtype: int64


In [30]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model=sklearn.linear_model.LogisticRegression()
model.fit(train_df_x,train_df_y)
master_test_df_ds['target_pred_logistic']=model.predict(test_df_x)
accuracy_logistic=accuracy_score(test_df_y, master_test_df_ds['target_pred_logistic'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [32]:
accuracy_logistic

0.91411638584922839

In [33]:
master_test_df_ds.head(10)

Unnamed: 0,detailed_hsld_smry_1,region_prev_res_1,marital_status_1,marital_status_2,worker_class_1,worker_class_2,worker_class_3,age,wage,dividends,...,detailed_ind_cd_A,detailed_ind_cd_B,detailed_ind_cd_C,detailed_ind_cd_D,target,target_pred_knn,target_pred_rf,target_pred_dt,target_pred_et,target_pred_logistic
0,0,0,0,0,0,0,0,44,0,2500,...,0.0,1.0,0.0,0.0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,2,0,0,...,0.0,0.0,0.0,1.0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,35,0,0,...,0.0,1.0,0.0,0.0,0,1,0,0,0,1
3,0,0,0,0,0,0,0,49,0,0,...,0.0,0.0,1.0,0.0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,13,0,0,...,0.0,0.0,0.0,1.0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,61,0,0,...,0.0,0.0,0.0,1.0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,38,0,0,...,1.0,0.0,0.0,0.0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,7,0,0,...,0.0,0.0,0.0,1.0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,30,0,1000,...,1.0,0.0,0.0,0.0,0,1,0,1,0,0


In [38]:
master_test_df['sum_pred']=master_test_df['target_pred_dt']+master_test_df['target_pred_et']+master_test_df['target_pred_rf']+master_test_df['target_pred_knn']+master_test_df['target_pred_logistic']
master_test_df_ds['sum_pred']=master_test_df_ds['target_pred_dt']+master_test_df_ds['target_pred_et']+master_test_df_ds['target_pred_rf']+master_test_df_ds['target_pred_knn']+master_test_df_ds['target_pred_logistic']

In [39]:
master_test_df['final_pred']=0
master_test_df.loc[master_test_df['sum_pred']>2,'final_pred']=1
master_test_df_ds['final_pred']=0
master_test_df_ds.loc[master_test_df_ds['sum_pred']>2,'final_pred']=1

In [40]:
print "Without Down Sample"
print master_test_df.groupby(['target','final_pred'])['target'].count()
print "Down Sample"
print master_test_df_ds.groupby(['target','final_pred'])['target'].count()

Without Down Sample
target  final_pred
0       0             91413
        1              1279
1       0              4079
        1              2107
Name: target, dtype: int64
Down Sample
target  final_pred
0       0             86371
        1              6321
1       0              2152
        1              4034
Name: target, dtype: int64


In [43]:
1.0*2107/(2107+4079)

0.34060782411897833

In [None]:
1.0*1279/()

In [44]:
1.0*4034/(4034+2152)

0.6521176850953767

In [51]:
all_pred=pd.DataFrame(master_test_df_ds.groupby(['target_pred_rf','target_pred_knn','target_pred_et','target_pred_dt','target_pred_logistic'])['target'].mean()).reset_index()

In [57]:
all_pred['sum']=all_pred['target_pred_dt']+all_pred['target_pred_rf']+all_pred['target_pred_et']+all_pred['target_pred_knn']+all_pred['target_pred_logistic']

In [55]:
all_pred.sort(columns=['target'],inplace=True,ascending=False)

  if __name__ == '__main__':


In [58]:
all_pred

Unnamed: 0,target_pred_rf,target_pred_knn,target_pred_et,target_pred_dt,target_pred_logistic,target,sum
31,1,1,1,1,1,0.521578,5
29,1,1,1,0,1,0.472222,4
27,1,1,0,1,1,0.453782,4
11,0,1,0,1,1,0.416667,3
15,0,1,1,1,1,0.374046,4
13,0,1,1,0,1,0.348624,3
23,1,0,1,1,1,0.332834,4
26,1,1,0,1,0,0.310924,3
25,1,1,0,0,1,0.307278,3
9,0,1,0,0,1,0.289144,2


In [59]:
all_pred=pd.DataFrame(master_test_df_ds.groupby(['sum_pred'])['target'].mean()).reset_index()

In [60]:
all_pred

Unnamed: 0,sum_pred,target
0,0,0.012446
1,1,0.090708
2,2,0.193727
3,3,0.212911
4,4,0.329724
5,5,0.521578
