In [36]:
#!pip install category_encoders

In [1]:
#http://drivendata.co/blog/worldbank-poverty-benchmark/

%matplotlib inline

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# data directory
DATA_DIR = os.path.join(r'C:\Users\piush\Desktop\Dataset\world_bank_poverty_data\household')

In [2]:
data_paths = {'A': {'train': os.path.join(DATA_DIR,  'A_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR,  'A_hhold_test.csv')}, 
              
              'B': {'train': os.path.join(DATA_DIR,  'B_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR,  'B_hhold_test.csv')}, 
              
              'C': {'train': os.path.join(DATA_DIR,  'C_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR,  'C_hhold_test.csv')}}

In [3]:
# load training data
a_train = pd.read_csv(data_paths['A']['train'], index_col='id')
b_train = pd.read_csv(data_paths['B']['train'], index_col='id')
c_train = pd.read_csv(data_paths['C']['train'], index_col='id')

In [4]:
class RankCategorical(object):

    def __init__(self, columns, new_column=False, inverse=False):
        self.rank_dict = {}
        self.columns = columns
        self.new_column = new_column
        self.inverse = inverse
        
    
    def fit(self, df):

        for column in self.columns:
            count = df[column].value_counts()

            if self.inverse:
                count = count.sort_values()
            ranks = [i for i in range(1, count.shape[0] + 1)]
            count = pd.DataFrame(ranks, index=count.index.values)
            self.rank_dict[column] = count.to_dict()
        
    def transform(self, df):
        for column in self.columns:
            
            new_column_name = column
            
            if self.new_column:
                new_column_name = column + "_rankcategorical"

            missing = len(self.rank_dict) / 2
                        
            df[new_column_name] = df[column].apply(lambda x : self.rank_dict[column][0].get(x, missing))


##### Drop country and poor in the training set

In [5]:
ay_train = np.ravel(a_train.poor)

In [7]:
by_train = np.ravel(b_train.poor)


In [8]:
cy_train = np.ravel(c_train.poor)

###### Convert only the categorical values

In [12]:
for i in list(a_train.select_dtypes(include=['object']).columns.values):
    lc = RankCategorical([i], inverse=False, new_column=False)
    lc.fit(a_train)
    lc.transform(a_train)
    

In [14]:
for i in list(b_train.select_dtypes(include=['object']).columns.values):
    lc = RankCategorical([i], inverse=False, new_column=False)
    lc.fit(b_train)
    lc.transform(b_train)
    

In [15]:
for i in list(c_train.select_dtypes(include=['object']).columns.values):
    lc = RankCategorical([i], inverse=False, new_column=False)
    lc.fit(c_train)
    lc.transform(c_train)
    

In [16]:
# from sklearn.preprocessing import LabelEncoder
# a_train = a_train.apply(LabelEncoder().fit_transform)

In [17]:
# Standardize features
def standardize(df, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
    return df
    

def pre_process_data(df, enforce_cols=None):
    print("Input shape:\t{}".format(df.shape))
        

    df = standardize(df)
    print("After standardization {}".format(df.shape))
        
#     # create dummy variables for categoricals
#     df = pd.get_dummies(df)
#     print("After converting categoricals:\t{}".format(df.shape))
    

    # match test set and training set columns
    if enforce_cols is not None:
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)

        df.drop(to_drop, axis=1, inplace=True)
        df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(0, inplace=True)
    
    return df

In [18]:
print("Country A")
aX_train = pre_process_data(a_train.drop(['poor','country'], axis=1))


print("\nCountry B")
bX_train = pre_process_data(b_train.drop(['poor','country'], axis=1))

print("\nCountry C")
cX_train = pre_process_data(c_train.drop(['poor','country'], axis=1))


Country A
Input shape:	(8203, 343)
After standardization (8203, 343)

Country B
Input shape:	(3255, 440)
After standardization (3255, 440)

Country C
Input shape:	(6469, 162)
After standardization (6469, 162)


In [19]:
from sklearn.ensemble import RandomForestClassifier

def train_model(features, labels, **kwargs):
    
    # instantiate model
    model = RandomForestClassifier(n_estimators=50, random_state=0)
    
    # train model
    model.fit(features, labels)
    
    # get a (not-very-useful) sense of performance
    accuracy = model.score(features, labels)
    print("In-sample accuracy: {0:}".format(accuracy))
    
    return model

In [20]:
model_a = train_model(aX_train, ay_train)

In-sample accuracy: 1.0


In [21]:
model_b = train_model(bX_train, by_train)

In-sample accuracy: 0.9996927803379416


In [22]:
model_c = train_model(cX_train, cy_train)

In-sample accuracy: 1.0


In [23]:
# load test data
a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
c_test = pd.read_csv(data_paths['C']['test'], index_col='id')

In [24]:
for i in list(a_test.select_dtypes(include=['object']).columns.values):
    lc = RankCategorical([i], inverse=False, new_column=False)
    lc.fit(a_test)
    lc.transform(a_test)

In [25]:
for i in list(b_test.select_dtypes(include=['object']).columns.values):
    lc = RankCategorical([i], inverse=False, new_column=False)
    lc.fit(b_test)
    lc.transform(b_test)

In [26]:
for i in list(c_test.select_dtypes(include=['object']).columns.values):
    lc = RankCategorical([i], inverse=False, new_column=False)
    lc.fit(c_test)
    lc.transform(c_test)

In [27]:
# process the test data
a_test = pre_process_data(a_test, enforce_cols=aX_train.columns)
b_test = pre_process_data(b_test, enforce_cols=bX_train.columns)
c_test = pre_process_data(c_test, enforce_cols=cX_train.columns)

Input shape:	(4041, 344)
After standardization (4041, 344)
Input shape:	(1604, 441)
After standardization (1604, 441)
Input shape:	(3187, 163)
After standardization (3187, 163)


In [28]:
a_preds = model_a.predict_proba(a_test)
b_preds = model_b.predict_proba(b_test)
c_preds = model_c.predict_proba(c_test)

In [29]:
def make_country_sub(preds, test_feat, country):
    # make sure we code the country correctly
    country_codes = ['A', 'B', 'C']
    
    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds[:, 1],  # proba p=1
                               columns=['poor'], 
                               index=test_feat.index)

    
    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]


In [30]:
# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

In [31]:
submission = pd.concat([a_sub, b_sub, c_sub])

In [32]:
submission.head()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
418,A,0.46
41249,A,0.14
16205,A,0.42
97501,A,0.2
67756,A,0.78


In [34]:
submission.to_csv('submission_rank_count.csv')