In [63]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as plt

# Country codes that have household- and individual-level CSVs under the data directory.
_COUNTRIES = ('A', 'B', 'C')


def _most_frequent(values):
    """Return the most common non-null value of a Series, or NaN if it has none.

    pandas' own ``mode`` is used instead of ``scipy.stats.mode`` because recent
    SciPy releases reject non-numeric input, and because it yields a plain
    scalar rather than a value wrapped in a 1-tuple.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


def _household_level(ind):
    """Collapse individual-level rows (indexed by household ``id``) to one row per household."""
    # numeric/bool columns are averaged within each household
    per_house_num = ind.select_dtypes(['number', 'bool']).groupby(level='id').mean()

    # every other column takes its most frequent value within the household
    other = ind.select_dtypes(exclude=['number', 'bool'])
    if other.shape[1] == 0:
        return per_house_num
    per_house_cat = other.groupby(level='id').agg(_most_frequent)

    return combine_horizontal(per_house_num, per_house_cat)


def _read_split(data_dir, country, split):
    """Read one country's household and individual CSVs for ``split`` ('train' or 'test')."""
    def _read(level):
        filename = '{}_{}_{}.csv'.format(country, level, split)
        return pd.read_csv(os.path.join(data_dir, country, filename), index_col='id')

    return _read('hhold'), _read('ind')


def make_model_xy():
    """Build train features, train labels and test features for countries A, B and C.

    For each country the individual-level file is aggregated to household level
    (mean of numeric/bool columns, most frequent value of all other columns),
    pre-processed with ``pre_process_data`` and joined column-wise onto the
    pre-processed household file.

    Returns:
        dict with keys ``'ax'``, ``'ay'``, ``'at'``, ``'bx'``, ``'by'``, ``'bt'``,
        ``'cx'``, ``'cy'``, ``'ct'``: for each country the training feature
        frame (``x``), the training labels as a 1-D array taken from the
        household file's ``poor`` column (``y``), and the test feature frame
        (``t``) restricted to the training columns.
    """
    # data directory
    DATA_DIR = os.path.join('.', 'data', 'processed')

    train_features = {}
    train_labels = {}
    train_columns = {}

    # training data: all countries first, so the shape log keeps its train-then-test order
    train_raw = {c: _read_split(DATA_DIR, c, 'train') for c in _COUNTRIES}
    for country in _COUNTRIES:
        hh, ind = train_raw[country]
        ind = _household_level(ind)

        house_X = pre_process_data(hh.drop('poor', axis=1))
        ind_X = pre_process_data(ind.drop('poor', axis=1))
        features = combine_horizontal(house_X, ind_X)

        # The outer join may reorder rows or add ids that exist only in the
        # individual file; labels come from the household file, so keep the
        # features in exactly that row order.
        if not features.index.equals(hh.index):
            features = features.reindex(hh.index)

        train_features[country] = features
        # Labels are read straight from the household frame. The previous
        # pd.concat(..., keys=['id']) only produced this result because pandas
        # silently truncated the object list to the length of ``keys``.
        train_labels[country] = np.ravel(hh['poor'])
        train_columns[country] = (house_X.columns, ind_X.columns)

    # test data, forced onto the columns seen during training
    test_features = {}
    test_raw = {c: _read_split(DATA_DIR, c, 'test') for c in _COUNTRIES}
    for country in _COUNTRIES:
        hh, ind = test_raw[country]
        ind = _household_level(ind)
        house_cols, ind_cols = train_columns[country]

        house_X = pre_process_data(hh, enforce_cols=house_cols)
        ind_X = pre_process_data(ind, enforce_cols=ind_cols)
        test_features[country] = combine_horizontal(house_X, ind_X)

    result = {}
    for country in _COUNTRIES:
        prefix = country.lower()
        result[prefix + 'x'] = train_features[country]
        result[prefix + 'y'] = train_labels[country]
        result[prefix + 't'] = test_features[country]
    return result
    
    

In [64]:
def combine_horizontal(hh, ind):
    """Join two frames side by side (column-wise), aligning rows on their index."""
    frames = [hh, ind]
    return pd.concat(frames, axis=1)

In [65]:
def standardize(df, numeric_only=True):
    """Standardize the int64/float64 columns of ``df`` to zero mean and unit std.

    The frame is modified in place and also returned. Columns of any other
    dtype are left untouched. ``numeric_only`` is accepted but not used.
    """
    to_scale = df.select_dtypes(include=['int64', 'float64'])

    # subtract the column mean, then divide by the column std
    centered = to_scale.sub(to_scale.mean())
    df[to_scale.columns] = centered.div(to_scale.std())

    return df
    

def pre_process_data(df, enforce_cols=None):
    """Standardize, one-hot encode and (optionally) column-align a feature frame.

    Steps: drop the ``iid`` column if present, standardize via ``standardize``,
    expand categoricals with ``pd.get_dummies``, recode a ``poor`` column (if
    one is present) to 0/1, align to ``enforce_cols`` and fill NaN with 0.
    The shape is printed after each stage.

    Args:
        df: input feature frame.
        enforce_cols: optional sequence of column labels (typically the
            training frame's columns). When given, the result has exactly
            these columns in exactly this order; columns missing from ``df``
            are added as all-zero, extra columns are dropped.

    Returns:
        The processed DataFrame.
    """
    print("Input shape:\t{}".format(df.shape))

    if 'iid' in df.columns:
        df = df.drop('iid', axis=1)
    # One pass is enough: a second pass over already standardized columns
    # leaves them unchanged up to floating-point noise.
    df = standardize(df)
    print("After standardization {}".format(df.shape))

    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    if 'poor' in df.columns:
        df['poor'] = (df['poor'] == True).astype(int)
    print("After converting categoricals:\t{}".format(df.shape))

    # Match test set and training set columns. reindex also fixes the column
    # ORDER; dropping and appending left added columns at the end, so a model
    # that reads features by position would see them shuffled.
    if enforce_cols is not None:
        df = df.reindex(columns=enforce_cols, fill_value=0)

    return df.fillna(0)

In [66]:
from sklearn.ensemble import RandomForestClassifier

def train_model(features, labels, **kwargs):
    """Fit a random forest (50 trees, ``random_state=0``) and return it.

    Extra keyword arguments are accepted but ignored. In-sample accuracy is
    deliberately not reported here.
    """
    forest_settings = {'n_estimators': 50, 'random_state': 0}
    classifier = RandomForestClassifier(**forest_settings)

    classifier.fit(features, labels)
    return classifier



In [67]:
def make_country_sub(preds, test_feat, country):
    """Build one country's submission frame from predicted class probabilities.

    Args:
        preds: 2-D array-like of class probabilities with one row per row of
            ``test_feat``; column 1 is taken as the probability of ``poor``.
        test_feat: frame whose index supplies the row labels of the result.
        country: value written to every row of the ``country`` column.

    Returns:
        DataFrame indexed like ``test_feat`` with columns ``country`` and
        ``poor``, in that order.
    """
    # np.asarray lets plain nested lists (or frames) be sliced like an ndarray
    poor_proba = np.asarray(preds)[:, 1]

    country_sub = pd.DataFrame(data=poor_proba,
                               columns=['poor'],
                               index=test_feat.index)

    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [68]:
def format_sub(data, sub_format_path='./submission.csv'):
    """Reorder the submission CSV at ``data`` to follow the template's row order.

    The template at ``sub_format_path`` is read without its ``poor`` and
    ``country`` columns; the remaining columns must include ``id`` and
    ``order``. The file at ``data`` is merged with it on ``id``, sorted by
    ``order`` and written back to ``data`` (overwriting it) without ``order``.

    Args:
        data: path of the submission CSV to rewrite in place.
        sub_format_path: path of the template CSV that defines the ordering.
    """
    sub_format = pd.read_csv(sub_format_path).drop(['poor', 'country'], axis=1)
    submission = pd.read_csv(data)

    submission_ordered = (pd.merge(submission, sub_format, on='id')
                          .sort_values(by='order')
                          .drop('order', axis=1))

    # index=False: the merge result only has a meaningless positional index;
    # writing it would add a stray unnamed column every time this is run.
    submission_ordered.to_csv(data, index=False)

In [69]:
from sklearn.neighbors import KNeighborsClassifier

def knn_train_model(features, labels, **kwargs):
    """Fit a K Nearest Neighbors classifier (``n_neighbors=20``) and return it.

    Extra keyword arguments are accepted but ignored.
    """
    knn_settings = {'n_neighbors': 20}
    classifier = KNeighborsClassifier(**knn_settings)

    classifier.fit(features, labels)
    return classifier

In [70]:
from sklearn.tree import DecisionTreeClassifier

def dt_train_model(features, labels, **kwargs):
    """Fit a Decision Tree classifier (``random_state=11``) and return it.

    Extra keyword arguments are accepted but ignored.
    """
    tree_settings = {'random_state': 11}
    classifier = DecisionTreeClassifier(**tree_settings)

    classifier.fit(features, labels)
    return classifier

In [71]:
from sklearn.linear_model import LogisticRegression

def lr_train_model(features, labels, **kwargs):
    """Fit a Logistic Regression classifier with default settings and return it.

    Extra keyword arguments are accepted but ignored.
    """
    classifier = LogisticRegression()
    classifier.fit(features, labels)
    return classifier

In [72]:
from sklearn.naive_bayes import GaussianNB

def nb_train_model(features, labels, **kwargs):
    """Fit a Gaussian Naive Bayes classifier with default settings and return it.

    Extra keyword arguments are accepted but ignored.
    """
    classifier = GaussianNB()
    classifier.fit(features, labels)
    return classifier