In [1]:
#!pip install category_encoders

In [2]:
#http://drivendata.co/blog/worldbank-poverty-benchmark/

%matplotlib inline

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# show wide/long frames in full when they are displayed
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# data directory: set the WB_POVERTY_DATA_DIR environment variable to point the
# notebook at another location; when it is not set, the original local path is used
DATA_DIR = os.environ.get(
    'WB_POVERTY_DATA_DIR',
    r'C:\Users\piush\Desktop\Dataset\world_bank_poverty_data\household',
)

In [3]:
# data_paths[country][split] -> path of that country's household CSV in DATA_DIR
data_paths = {
    country: {
        split: os.path.join(DATA_DIR, '{}_hhold_{}.csv'.format(country, split))
        for split in ('train', 'test')
    }
    for country in ('A', 'B', 'C')
}

In [4]:
# read the three training CSVs (in A, B, C order), indexed by household id
a_train, b_train, c_train = (
    pd.read_csv(data_paths[country]['train'], index_col='id')
    for country in ('A', 'B', 'C')
)

##### Extract the target (`poor`) from each training set (it is dropped from the features later, before training)

In [5]:
# country A labels: the `poor` column flattened to a 1-D array
ay_train = np.ravel(a_train['poor'])

In [6]:
# country B labels: the `poor` column flattened to a 1-D array
by_train = np.ravel(b_train['poor'])


In [7]:
# country C labels: the `poor` column flattened to a 1-D array
cy_train = np.ravel(c_train['poor'])

###### Convert only the categorical values

In [8]:
import category_encoders as ce

In [9]:
# hash-encode only the object-dtype columns of country A's training frame
encoder = ce.HashingEncoder(
    cols=a_train.select_dtypes(include=['object']).columns.tolist()
)
encoder.fit(a_train, ay_train)
aX_cleaned = encoder.transform(a_train)

In [10]:
# hash-encode only the object-dtype columns of country B's training frame
encoder = ce.HashingEncoder(
    cols=b_train.select_dtypes(include=['object']).columns.tolist()
)
encoder.fit(b_train, by_train)
bX_cleaned = encoder.transform(b_train)

In [11]:
# hash-encode only the object-dtype columns of country C's training frame
encoder = ce.HashingEncoder(
    cols=c_train.select_dtypes(include=['object']).columns.tolist()
)
encoder.fit(c_train, cy_train)
cX_cleaned = encoder.transform(c_train)

In [12]:
#encoder.fit(a_train, ay_train)

In [13]:
#aX_cleaned = encoder.transform(a_train)

In [14]:
# peek at the first two encoded rows
aX_cleaned.iloc[:2]

Unnamed: 0_level_0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,nEsgxvAq,OMtioXZZ,YFMZwKrU,poor,TiwRslOh
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
46107,45,43,35,43,46,39,50,39,-16.0,21,-2.0,False,-7
82739,38,46,34,42,47,37,55,41,-10.0,12,-3.0,False,-1


In [16]:
# class balance of the target in country A
aX_cleaned.poor.value_counts()

False    4500
True     3703
Name: poor, dtype: int64

In [17]:
# from sklearn.preprocessing import LabelEncoder
# a_train = a_train.apply(LabelEncoder().fit_transform)

In [18]:
# Standardize features
def standardize(df, numeric_only=True):
    """Return a copy of ``df`` with its int64/float64 columns standardized.

    Every int64/float64 column has its mean subtracted and is divided by its
    standard deviation (as computed by ``DataFrame.std``). Columns of any
    other dtype are carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize. It is not modified.
    numeric_only : bool, default True
        Accepted for backward compatibility; it has no effect, only
        int64/float64 columns are ever standardized.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    # work on a copy so the caller's frame is left untouched
    result = df.copy()

    numeric = df.select_dtypes(include=['int64', 'float64'])
    if numeric.columns.empty:
        return result

    # a constant column has std == 0, which would turn it into NaN (0 / 0);
    # scale such columns by 1 so they come out as all zeros instead
    scale = numeric.std().replace(0, 1)

    # subtract mean and divide by std
    result[numeric.columns] = (numeric - numeric.mean()) / scale

    return result
    

def pre_process_data(df, enforce_cols=None):
    """Standardize a feature frame and optionally align its columns.

    The frame is standardized with ``standardize``, optionally forced onto a
    given set of columns, and finally has every remaining NaN replaced by 0.
    Categorical columns are not encoded here. Progress is printed as the
    shape before and after standardization.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to process. It is not modified.
    enforce_cols : sequence of column labels, optional
        When given, the result has exactly these columns in exactly this
        order: columns of ``df`` that are not listed are dropped and listed
        columns that ``df`` lacks are added, filled with 0.

    Returns
    -------
    pandas.DataFrame
        The processed frame, free of NaN.
    """
    print("Input shape:\t{}".format(df.shape))

    # work on a copy so the caller's frame is never modified, however
    # standardize treats its argument
    df = standardize(df.copy())
    print("After standardization {}".format(df.shape))

    # match test set and training set columns; reindex also fixes the column
    # ORDER, which a fitted model relies on when it is handed positional features
    if enforce_cols is not None:
        df = df.reindex(columns=list(enforce_cols), fill_value=0)

    return df.fillna(0)

In [19]:
# build each country's training features; the target column is removed first
print("Country A")
aX_train = pre_process_data(aX_cleaned.drop('poor', axis='columns'))

print("\nCountry B")
bX_train = pre_process_data(bX_cleaned.drop('poor', axis='columns'))

print("\nCountry C")
cX_train = pre_process_data(cX_cleaned.drop('poor', axis='columns'))


Country A
Input shape:	(8203, 12)
After standardization (8203, 12)

Country B
Input shape:	(3255, 31)
After standardization (3255, 31)

Country C
Input shape:	(6469, 38)
After standardization (6469, 38)


In [20]:
from sklearn.ensemble import RandomForestClassifier

def train_model(features, labels, **kwargs):
    """Fit a random forest classifier and print its in-sample accuracy.

    Parameters
    ----------
    features
        Training features, passed to ``RandomForestClassifier.fit``.
    labels
        Training labels, passed to ``RandomForestClassifier.fit``.
    **kwargs
        Extra keyword arguments forwarded to ``RandomForestClassifier``.
        They override the defaults used here (``n_estimators=50``,
        ``random_state=0``).

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    # defaults reproduce the original fixed configuration; callers may override
    params = {'n_estimators': 50, 'random_state': 0}
    params.update(kwargs)

    # instantiate and train model
    model = RandomForestClassifier(**params)
    model.fit(features, labels)

    # get a (not-very-useful) sense of performance: scored on the training data
    accuracy = model.score(features, labels)
    print("In-sample accuracy: {0:}".format(accuracy))

    return model

In [21]:
# fit country A's model
model_a = train_model(features=aX_train, labels=ay_train)

In-sample accuracy: 0.9998780933804705


In [22]:
# fit country B's model
model_b = train_model(features=bX_train, labels=by_train)

In-sample accuracy: 1.0


In [23]:
# fit country C's model
model_c = train_model(features=cX_train, labels=cy_train)

In-sample accuracy: 1.0


In [24]:
# read the three test CSVs (in A, B, C order), indexed by household id
a_test, b_test, c_test = (
    pd.read_csv(data_paths[country]['test'], index_col='id')
    for country in ('A', 'B', 'C')
)

In [25]:
# hash-encode only the object-dtype columns of country A's test frame
encoder = ce.HashingEncoder(
    cols=a_test.select_dtypes(include=['object']).columns.tolist()
)
encoder.fit(a_test)
aX_cleaned_test = encoder.transform(a_test)

In [26]:
# hash-encode only the object-dtype columns of country B's test frame
encoder = ce.HashingEncoder(
    cols=b_test.select_dtypes(include=['object']).columns.tolist()
)
encoder.fit(b_test)
bX_cleaned_test = encoder.transform(b_test)

In [27]:
# hash-encode only the object-dtype columns of country C's test frame
encoder = ce.HashingEncoder(
    cols=c_test.select_dtypes(include=['object']).columns.tolist()
)
encoder.fit(c_test)
cX_cleaned_test = encoder.transform(c_test)

In [28]:
# process the test data, aligning each frame to its country's training columns
a_test, b_test, c_test = (
    pre_process_data(encoded, enforce_cols=train_features.columns)
    for encoded, train_features in zip(
        (aX_cleaned_test, bX_cleaned_test, cX_cleaned_test),
        (aX_train, bX_train, cX_train),
    )
)

Input shape:	(4041, 12)
After standardization (4041, 12)
Input shape:	(1604, 31)
After standardization (1604, 31)
Input shape:	(3187, 38)
After standardization (3187, 38)


In [29]:
# class probabilities for every test household, one array per country
a_preds, b_preds, c_preds = (
    model.predict_proba(test_features)
    for model, test_features in (
        (model_a, a_test),
        (model_b, b_test),
        (model_c, c_test),
    )
)

In [30]:
def make_country_sub(preds, test_feat, country):
    """Build one country's part of the submission frame.

    Parameters
    ----------
    preds : 2-D array
        Predicted class probabilities; column 1 is taken as the probability
        of ``poor``.
    test_feat : pandas.DataFrame
        Test features; only its index is used, as the index of the result.
    country
        Country code written into every row (the value is not validated).

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``country`` and ``poor`` (in that order), indexed
        like ``test_feat``.
    """
    # get just the poor probabilities (proba p=1) and tag every row with the
    # country code for joining later
    return pd.DataFrame(
        {"country": country, "poor": preds[:, 1]},
        index=test_feat.index,
    )


In [31]:
# convert each country's predictions into a submission-shaped data frame
a_sub, b_sub, c_sub = (
    make_country_sub(preds, test_features, code)
    for preds, test_features, code in (
        (a_preds, a_test, 'A'),
        (b_preds, b_test, 'B'),
        (c_preds, c_test, 'C'),
    )
)

In [32]:
# stack the three per-country frames into the final submission
submission = pd.concat((a_sub, b_sub, c_sub))

In [33]:
# sanity-check the first five rows
submission.head(5)

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
418,A,0.46
41249,A,0.12
16205,A,0.42
97501,A,0.14
67756,A,0.08


In [34]:
# write the submission (index included) to the working directory
submission.to_csv(path_or_buf='submission_hashing.csv')