In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.feature_selection import RFE, SelectFromModel
from xgboost import plot_importance, XGBClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Raise pandas' display limits so long/wide frames are rendered without truncation.
for option_name, limit in (('display.max_rows', 5000),
                           ('display.max_columns', 2000)):
    pd.set_option(option_name, limit)

In [None]:
# Relative CSV locations, addressed as data_paths[country][split][level] where
# country is 'A'/'B'/'C', split is 'train'/'test' and level is 'hhold'/'indiv'.
# Every file follows the pattern "<split>/<country>_<level>_<split>.csv".
data_paths = {
    country: {
        split: {
            level: '{0}/{1}_{2}_{0}.csv'.format(split, country, level)
            for level in ('hhold', 'indiv')
        }
        for split in ('train', 'test')
    }
    for country in ('A', 'B', 'C')
}

In [None]:
def _load_split(split):
    """Read every country's household and individual CSV for one split.

    Household files are indexed by 'id', individual files by ('id', 'iid').
    All household files are read first, then all individual files.

    Returns a dict shaped ``{'a'|'b'|'c': {'hh': DataFrame, 'in': DataFrame}}``.
    """
    countries = ('A', 'B', 'C')
    hhold = {code.lower(): pd.read_csv(data_paths[code][split]['hhold'], index_col='id')
             for code in countries}
    indiv = {code.lower(): pd.read_csv(data_paths[code][split]['indiv'], index_col=['id', 'iid'])
             for code in countries}
    return {key: {'hh': hhold[key], 'in': indiv[key]} for key in hhold}


train = _load_split('train')
test = _load_split('test')

# Flat per-country aliases pointing at the same frames held in `train` / `test`.
a_hh_train, b_hh_train, c_hh_train = (train[key]['hh'] for key in 'abc')
a_in_train, b_in_train, c_in_train = (train[key]['in'] for key in 'abc')

a_hh_test, b_hh_test, c_hh_test = (test[key]['hh'] for key in 'abc')
a_in_test, b_in_test, c_in_test = (test[key]['in'] for key in 'abc')

In [None]:
def stdize_numeric(df, numeric_only=True):
    """Standardize the int64/float64 columns of ``df`` to zero mean, unit std.

    The frame is modified in place and the same object is returned. Columns of
    any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are rescaled.
    numeric_only : bool, default True
        Not used by the implementation; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with each numeric column replaced by
        ``(column - mean) / std`` (pandas' default sample std).
    """
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    for column in numeric_columns:
        centered = df[column] - df[column].mean()
        spread = df[column].std()
        if pd.isna(spread) or spread == 0:
            # Constant or single-row column: dividing by a zero/undefined std
            # would turn every value into NaN. Such a column carries no spread,
            # so map present values to 0.0 (missing values stay missing).
            df[column] = centered * 0.0
        else:
            df[column] = centered / spread

    return df


def pre_process_data(df, enforce_cols=None):
    """Turn a raw survey frame into an all-numeric feature frame.

    Steps:
      1. int64/float64 columns are standardized with ``stdize_numeric``.
      2. object columns are frequency-encoded: each value is replaced by its
         relative frequency within that column of ``df``. Missing values get
         no frequency and end up as 0 after the final fill.
      3. Both parts are joined back together on the index.
      4. If ``enforce_cols`` is given, the result is reshaped to exactly those
         columns, in that order; columns absent from ``df`` are added as 0.
      5. Remaining NaNs are replaced by 0.

    The statistics (mean/std, frequencies) are computed from the frame passed
    in, so a test frame is encoded with its own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input; it is not modified.
    enforce_cols : sequence of column labels, optional
        Column layout the output must have (e.g. the training columns).

    Returns
    -------
    pandas.DataFrame
        Numeric feature frame without NaNs.
    """
    print("Input shape:\t{}".format(df.shape))

    # Work on explicit copies so the column assignments below never write
    # into (a view of) the caller's frame.
    num_df = stdize_numeric(df.select_dtypes(include=['int64', 'float64']).copy())

    print("Processing categorical data")

    cat_df = df.select_dtypes('object').copy()

    for column in cat_df.columns:
        counts = cat_df[column].value_counts(normalize=True)
        # value_counts drops NaN, so an index lookup per value would raise
        # KeyError on missing entries; map() leaves them as NaN instead.
        cat_df[column] = cat_df[column].map(counts)

    df = pd.merge(cat_df, num_df, left_index=True, right_index=True)

    # match test set and training set columns
    if enforce_cols is not None:
        # Drops extra columns, adds missing ones as 0 and applies the order.
        df = df.reindex(columns=enforce_cols, fill_value=0)

    print("Final columns length: ", len(df.columns))

    return df.fillna(0)

In [None]:
# Feature matrices: each country's household frame without the label ('poor')
# and the 'country' column, run through pre_process_data.
aX, bX, cX = (
    pre_process_data(train[key]['hh'].drop(columns=['poor', 'country']))
    for key in ('a', 'b', 'c')
)

# Labels: the 'poor' column flattened to a 1-D float array.
ay, by, cy = (
    np.ravel(train[key]['hh']['poor']).astype(float)
    for key in ('a', 'b', 'c')
)

In [None]:
# Stratified 80/20 hold-out split per country. A fixed seed keeps the split,
# and therefore every metric printed further down, identical across
# "Restart & Run All" executions.
SPLIT_SEED = 42

aX_train, aX_test, ay_train, ay_test = train_test_split(
    aX, ay, test_size=0.2, shuffle=True, stratify=ay, random_state=SPLIT_SEED)
bX_train, bX_test, by_train, by_test = train_test_split(
    bX, by, test_size=0.2, shuffle=True, stratify=by, random_state=SPLIT_SEED)
cX_train, cX_test, cy_train, cy_test = train_test_split(
    cX, cy, test_size=0.2, shuffle=True, stratify=cy, random_state=SPLIT_SEED)

In [None]:
def _print_binary_metrics(y_true, y_prob, set_label, auroc_label):
    """Print accuracy, confusion matrix, log loss and AUROC; return the accuracy.

    ``y_prob`` holds positive-class probabilities (1-D). Hard predictions use a
    strict ``> 0.5`` cut, which matches ``round()`` mapping exactly 0.5 to 0.
    """
    y_prob = np.asarray(y_prob)
    predictions = (y_prob > 0.5).astype(int)

    accuracy = accuracy_score(y_true, predictions)
    cf = confusion_matrix(y_true, predictions)
    loss = log_loss(y_true, y_prob, normalize=True)
    auroc = roc_auc_score(y_true, y_prob)

    print(set_label + " accuracy: ", accuracy)
    print(set_label + " CF: \n", cf)
    print(set_label + " log loss: ", loss)
    print(auroc_label, auroc)
    return accuracy


def train_model(X_train, y_train, X_test = None, y_test = None, thresh=0.04):
    """Select features with XGBoost, then fit a soft-voting ensemble on them.

    1. A default ``XGBClassifier`` is fitted on all features and its metrics
       are printed for the training data (and the hold-out data, if given).
    2. ``SelectFromModel`` keeps the features whose importance in that model
       is at least ``thresh``.
    3. A soft ``VotingClassifier`` (logistic regression, RBF SVC, XGBoost,
       LightGBM) is fitted on the selected features and, if hold-out data is
       given, evaluated on it.

    Parameters
    ----------
    X_train, y_train
        Training features and binary labels.
    X_test, y_test : optional
        Hold-out features and labels; evaluation is skipped unless both are
        provided.
    thresh : float, default 0.04
        Importance threshold handed to ``SelectFromModel``.

    Returns
    -------
    (selection, selection_model)
        The fitted feature selector and the fitted voting ensemble.
    """
    have_holdout = X_test is not None and y_test is not None

    # model for feature selection
    model = XGBClassifier()
    model.fit(X_train, y_train)

    _print_binary_metrics(y_train, model.predict_proba(X_train)[:, 1],
                          "Train set", "Train set AUROC: ")
    if have_holdout:
        _print_binary_metrics(y_test, model.predict_proba(X_test)[:, 1],
                              "Test set", "auroc: ")

    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)

    # train model
    lr = LogisticRegression()
    svc = SVC(kernel='rbf', probability=True)  # probability=True is required for soft voting
    lgbm = LGBMClassifier(max_depth=300,
                        num_leaves=127,
                        n_estimators=1000,
                        min_child_weight=1,
                        learning_rate=0.01,
                        nthread=16,
                        subsample=0.8,
                        colsample_bytree=1.0,
                        colsample_bylevel=1.0,
                        seed=42)
    xgbt = XGBClassifier(subsample=0.8, colsample_bytree=1.0, colsample_bylevel=1.0, n_estimators=200)

    estimators = [('lr', lr), ('svc', svc), ('xgb', xgbt), ('lgbm', lgbm)]

    selection_model = VotingClassifier(estimators=estimators, voting='soft', n_jobs=16)

    selection_model.fit(select_X_train, y_train)

    # eval model
    if have_holdout:
        select_X_test = selection.transform(X_test)

        # predict_proba returns one column per class; the metrics need only the
        # positive-class column. Taking the full 2-D array here made round()
        # and the metric calls fail.
        y_pred = selection_model.predict_proba(select_X_test)[:, 1]

        accuracy = _print_binary_metrics(y_test, y_pred, "Test set", "auroc: ")
        print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))

    return selection, selection_model

In [None]:
# Country A: fit selector + ensemble with importance threshold 0.004.
selection_a, model_a = train_model(
    X_train=aX_train,
    y_train=ay_train,
    X_test=aX_test,
    y_test=ay_test,
    thresh=0.004,
)

In [None]:
# Country B: fit selector + ensemble with importance threshold 0.006.
selection_b, model_b = train_model(
    X_train=bX_train,
    y_train=by_train,
    X_test=bX_test,
    y_test=by_test,
    thresh=0.006,
)

In [None]:
# Country C: fit selector + ensemble with importance threshold 0.02.
selection_c, model_c = train_model(
    X_train=cX_train,
    y_train=cy_train,
    X_test=cX_test,
    y_test=cy_test,
    thresh=0.02,
)

In [None]:
def make_country_sub(preds, test_df, country):
    """Build one country's submission frame from class probabilities.

    Parameters
    ----------
    preds : array-like of shape (n_rows, 2)
        Per-row class probabilities; column 1 is used as the 'poor'
        probability. Nested lists are accepted as well as arrays.
    test_df : pandas.DataFrame
        Frame whose index labels the rows of ``preds``.
    country : str
        Country code written into every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``['country', 'poor']``, indexed like ``test_df``.
    """
    # get just the poor probabilities (asarray so plain lists can be sliced too)
    poor_proba = np.asarray(preds)[:, 1]
    country_sub = pd.DataFrame(data=poor_proba,  # proba p=1
                               columns=['poor'],
                               index=test_df.index)

    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [None]:
# Pre-process each country's test households, forcing the column layout of the
# matching training matrix.
test_aX, test_bX, test_cX = (
    pre_process_data(test[key]['hh'], enforce_cols=train_columns)
    for key, train_columns in (
        ('a', aX.columns),
        ('b', bX.columns),
        ('c', cX.columns),
    )
)

In [None]:
# Reduce each test matrix to the features kept by that country's selector.
test_fs_aX, test_fs_bX, test_fs_cX = (
    selector.transform(features)
    for selector, features in (
        (selection_a, test_aX),
        (selection_b, test_bX),
        (selection_c, test_cX),
    )
)

In [None]:
# Class probabilities from each country's ensemble on its selected test features.
a_preds, b_preds, c_preds = (
    ensemble.predict_proba(features)
    for ensemble, features in (
        (model_a, test_fs_aX),
        (model_b, test_fs_bX),
        (model_c, test_fs_cX),
    )
)

# Per-country submission frames, indexed like the pre-processed test frames.
a_sub, b_sub, c_sub = (
    make_country_sub(probabilities, frame, code)
    for probabilities, frame, code in (
        (a_preds, test_aX, 'A'),
        (b_preds, test_bX, 'B'),
        (c_preds, test_cX, 'C'),
    )
)

In [None]:
# Sanity check: first rows of country A's submission frame.
a_sub.head()

In [None]:
# Sanity check: first rows of country B's submission frame.
b_sub.head()

In [None]:
# Sanity check: last rows of country C's submission frame.
c_sub.tail()

In [None]:
# Stack the three per-country frames row-wise, in A, B, C order.
per_country_frames = (a_sub, b_sub, c_sub)
submission = pd.concat(per_country_frames, axis=0)

In [None]:
# Sanity check: first rows of the combined submission.
submission.head()

In [None]:
# Write the combined submission to "sub.csv" in the working directory. The
# index is not disabled, so the frame's index (taken from the test household
# frames, which were read with index_col='id') is written alongside
# 'country' and 'poor'.
submission.to_csv("sub.csv")