In [33]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

import copy
import datetime
import functools 

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer, confusion_matrix, roc_curve
from sklearn.linear_model import LinearRegression

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline

In [39]:
import sys

'3.5.2 (default, Nov 23 2017, 16:37:01) \n[GCC 5.4.0 20160609]'

In [4]:
# Location of the training CSV, relative to the notebook's working directory.
input_file_path = 'data/train.csv'
# Raw training data as read from disk; a later cell passes this frame to pipeline_2.
df_initial = pd.read_csv(input_file_path)

In [5]:
def rename_columns(df):
    """Return a copy of ``df`` whose column labels use ``_`` instead of spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same data and the renamed column labels.
    """
    # ``rename`` is the API for relabelling with a mapping; ``rename_axis`` only
    # sets the *name* of the axis and rejects dict mappers in current pandas.
    # Non-string labels have no ``replace`` method, so they are left untouched.
    names_mapping = {
        col: col.replace(' ', '_')
        for col in df.columns
        if isinstance(col, str)
    }
    return df.rename(columns=names_mapping)

In [6]:
# Lookup table from a raw school-level label to a number; convert_school_level
# reads it and pipeline_2 stores the result in the `school_years` column.
# Presumably the values approximate years of schooling -- TODO confirm.
# NOTE(review): some labels intentionally or accidentally share a value
# ('secondary' / 'secondary 12' -> 12, 'basic vocational' / 'entry level college' -> 14).
school_level_mapping = {
    'kindergarten': 0,
    'primary 1 through 4': 2,
    'primary school': 4,
    'secondary-5 through 6':6,
    'secondary-7 through 8': 8,    
    'secondary-9': 9,
    '10th': 10,
    'secondary 11': 11,
    'secondary': 12,
    'secondary 12': 12,
    'basic vocational': 14,
    'entry level college': 14,
    'advanced vocational': 15,
    'college graduate': 16,
    'some post graduate': 18,
    'advanced post graduate': 20,  
}

In [7]:
def extract_relevant_professions(profession):
    """Keep 'C-level' and 'specialist technician' as they are; map anything else to 'other'."""
    kept_labels = ('C-level', 'specialist technician')
    if profession in kept_labels:
        return profession
    return 'other'


def extract_relevant_job_types(job_type):
    """Keep the two self-employment labels as they are; map anything else to 'other'."""
    kept_labels = ('self-emp-inc', 'self-emp-not-inc')
    if job_type in kept_labels:
        return job_type
    return 'other'


def extract_relevant_domestic_status(domestic_status):
    """Keep 'married 2', 'single', 'd' and 'divorce pending'; map anything else to 'other'."""
    kept_labels = ('married 2', 'single', 'd', 'divorce pending')
    if domestic_status in kept_labels:
        return domestic_status
    return 'other'


def extract_relevant_domestic_relationship_types(domestic_relationship_type):
    """Keep the four tracked relationship labels; map anything else to 'other'."""
    kept_labels = (
        'has spouse',
        'living with child',
        'never married',
        'not living with family',
    )
    if domestic_relationship_type in kept_labels:
        return domestic_relationship_type
    return 'other'


def drop_variables(df_input, variables_to_drop):
    """Return an independent copy of ``df_input`` without the given columns.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Source frame; it is never modified.
    variables_to_drop : str or iterable of labels
        Column label(s) to remove. A bare string is treated as one label
        rather than being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining columns.

    Raises
    ------
    KeyError
        If a requested label is not a column of ``df_input``.
    """
    if isinstance(variables_to_drop, str):
        variables_to_drop = [variables_to_drop]
    # One drop call for all labels instead of a deep copy plus one full-frame
    # copy per label. The trailing copy() guarantees the result shares no data
    # with the input, whatever view/copy strategy the pandas version uses.
    return df_input.drop(list(variables_to_drop), axis=1).copy()


def group_spouses(record):
    """Merge 'has husband' and 'has wife' into 'has spouse'; return other values unchanged."""
    if record in ('has husband', 'has wife'):
        return "has spouse"
    return record


def years_old(df, reference_date=None):
    """Compute each row's age in (365-day) years from its ``birth_date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``birth_date`` column that ``pd.to_datetime`` can parse.
    reference_date : date-like, optional
        Day against which the age is measured. Defaults to today (local time).
        Pass a fixed value to make the feature reproducible between runs,
        e.g. between training and prediction.

    Returns
    -------
    pandas.Series
        Whole days elapsed since ``birth_date`` divided by 365.
    """
    # A pandas Timestamp (truncated to midnight) is used instead of a plain
    # ``datetime.date``: current pandas does not support subtracting a
    # datetime64 Series from a ``date`` object.
    if reference_date is None:
        reference = pd.Timestamp.now().normalize()
    else:
        reference = pd.Timestamp(reference_date).normalize()
    elapsed = reference - pd.to_datetime(df.birth_date)
    return elapsed.dt.days / 365


def convert_school_level(val):
    """Look ``val`` up in ``school_level_mapping``; an unknown label raises ``KeyError``."""
    mapped_value = school_level_mapping[val]
    return mapped_value


def binarize_interest_earned(df):
    """Return, per row, whether ``interest_earned`` is strictly greater than zero."""
    earned = df.interest_earned
    positive_mask = earned > 0
    return positive_mask


def is_immigrant(df):
    """Return, per row, whether ``country_of_origin`` is anything other than 'u.s.'.

    The previous implementation compared with ``==`` and therefore flagged
    U.S.-born rows as immigrants, the opposite of what the name says.

    NOTE(review): a missing ``country_of_origin`` compares unequal to 'u.s.'
    and is therefore flagged True -- confirm that is the desired treatment.
    """
    return df.country_of_origin != 'u.s.'


def group_job_types(val):
    """Look ``val`` up in ``job_type_map``; an unknown key raises ``KeyError``.

    NOTE(review): ``job_type_map`` is not defined in the visible part of this
    notebook -- confirm it exists before calling this helper.
    """
    grouped_value = job_type_map[val]
    return grouped_value


def is_white(df):
    """Return, per row, whether ``ethnicity`` equals 'white and privileged'."""
    ethnicity = df.ethnicity
    matches = ethnicity == 'white and privileged'
    return matches


def is_currently_single(df):
    """Return, per row, whether ``domestic_status`` is 'single', 'd' or 'spouse passed'."""
    is_single = df.domestic_status == 'single'
    is_d = df.domestic_status == 'd'
    is_spouse_passed = df.domestic_status == 'spouse passed'
    return is_single | is_d | is_spouse_passed

In [8]:
def _one_hot_fixed(df, column, categories):
    """One-hot encode ``column`` of ``df`` against a fixed list of categories.

    Declaring the categories up front makes ``pd.get_dummies`` emit one
    indicator column per listed category even when that category does not
    occur in this batch, so every call yields the same output columns.
    """
    df[column] = pd.Categorical(df[column], categories=categories)
    return pd.get_dummies(df, prefix=column, columns=[column], prefix_sep='.')


def pipeline_2(df_in, **kwargs):
    """Turn a raw input frame into the numeric feature frame used by the model.

    Steps: normalise column names, drop unused columns, derive ``age`` from
    ``birth_date`` and ``school_years`` from ``school_level``, then collapse
    and one-hot encode the four categorical columns.

    The indicator columns are generated from fixed category lists (sorted as
    ``get_dummies`` would sort them), so the output schema does not depend on
    which labels happen to be present. Without this, transforming a small
    batch (e.g. a single row at prediction time) produced fewer columns than
    the frame the model was fitted on.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Raw frame with the original, space-separated column names. It is not
        modified.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    pandas.DataFrame
        Engineered feature frame; any column not consumed above (for example
        ``target`` when present) is passed through unchanged.
    """
    df = copy.deepcopy(df_in)
    df = rename_columns(df)
    df = drop_variables(df, ['id', 'gender', 'earned_dividends', 'country_of_origin', 'ethnicity'])

    # Get age
    df['age'] = years_old(df)
    df = drop_variables(df, ['birth_date'])

    # Convert school level in years
    df['school_years'] = df.school_level.apply(convert_school_level)
    df = drop_variables(df, ['school_level'])

    # Group spouses and extract relevant domestic relationship types
    df['domestic_relationship_type'] = df.domestic_relationship_type.apply(group_spouses)
    df['domestic_relationship_type'] = df.domestic_relationship_type.apply(extract_relevant_domestic_relationship_types)
    df = _one_hot_fixed(
        df,
        'domestic_relationship_type',
        ['has spouse', 'living with child', 'never married', 'not living with family', 'other'],
    )

    # Extract relevant professions
    df['profession'] = df.profession.apply(extract_relevant_professions)
    df = _one_hot_fixed(df, 'profession', ['C-level', 'other', 'specialist technician'])

    # Extract relevant job types
    df['job_type'] = df.job_type.apply(extract_relevant_job_types)
    df = _one_hot_fixed(df, 'job_type', ['other', 'self-emp-inc', 'self-emp-not-inc'])

    # Extract relevant domestic status
    df['domestic_status'] = df.domestic_status.apply(extract_relevant_domestic_status)
    df = _one_hot_fixed(
        df,
        'domestic_status',
        ['d', 'divorce pending', 'married 2', 'other', 'single'],
    )

    return df

In [9]:
# Run the feature pipeline on the raw training frame and display the result for inspection.
pipeline_2(df_initial)

Unnamed: 0,interest_earned,monthly_work,target,age,school_years,domestic_relationship_type.has spouse,domestic_relationship_type.living with child,domestic_relationship_type.never married,domestic_relationship_type.not living with family,domestic_relationship_type.other,...,profession.other,profession.specialist technician,job_type.other,job_type.self-emp-inc,job_type.self-emp-not-inc,domestic_status.d,domestic_status.divorce pending,domestic_status.married 2,domestic_status.other,domestic_status.single
0,0,160,1,34.060274,12,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,1
1,0,160,1,55.060274,16,0,0,1,0,0,...,1,0,1,0,0,0,0,0,1,0
2,0,200,0,38.060274,16,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,0,208,0,52.060274,14,1,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0
4,0,300,1,50.060274,14,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
5,0,140,1,21.060274,14,0,1,0,0,0,...,1,0,1,0,0,0,0,0,0,1
6,0,160,1,43.060274,6,0,0,1,0,0,...,1,0,1,0,0,1,0,0,0,0
7,0,160,0,36.060274,12,1,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0
8,0,232,1,24.060274,10,0,0,1,0,0,...,1,0,1,0,0,1,0,0,0,0
9,0,160,1,43.060274,14,0,0,1,0,0,...,1,0,1,0,0,1,0,0,0,0


In [32]:
def cut_predictions_by_threshold(y, threshold):
    """Binarise scores: 1.0 where ``y`` is strictly above ``threshold``, else 0.0.

    The previous ``np.ceil(y - threshold)`` only produced 0/1 for scores within
    one unit of the threshold; an unbounded regression output such as 2.5 or
    -0.5 became 2.0 or -1.0 instead of a class label.

    Parameters
    ----------
    y : scalar, array-like or pandas.Series
        Predicted scores.
    threshold : float
        Decision boundary; a score exactly equal to it maps to 0.0.

    Returns
    -------
    Same container kind as ``np.greater`` returns for ``y`` (Series in, Series
    out), with float dtype. NaN scores compare False and therefore map to 0.0.
    """
    return np.greater(y, threshold).astype(float)

In [35]:
# Thresholding helper with the cut-off pre-bound to 0.87; call it as cut_087(y).
cut_087 = functools.partial(cut_predictions_by_threshold, threshold=0.87)

In [38]:
# Two-step scikit-learn pipeline: feature engineering via pipeline_2, then a
# linear regression on the engineered frame. validate=False lets the
# FunctionTransformer hand the DataFrame to pipeline_2 untouched.
# The `normalize` argument is no longer accepted by LinearRegression in
# current scikit-learn; False was its default, so omitting it builds the same model.
serialized_pipeline = make_pipeline(
    FunctionTransformer(func=pipeline_2, validate=False),
    LinearRegression(),
)

In [19]:
def split_df(df, test_size=0.3, random_state=None):
    """Split ``df`` into train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``target`` column; every other column is a feature.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.3, the value that was
        previously hard-coded.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = df.drop('target', axis=1)
    y = df.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [46]:
import json, pickle

In [44]:
# Persist the ordered list of training column names as JSON.
# NOTE(review): X_train is not assigned in any cell visible here; presumably it
# comes from split_df(...) -- confirm so the notebook survives Restart & Run All.
with open('columns.json', 'w') as fh:
    json.dump(X_train.columns.tolist(), fh)

In [48]:
# Persist the pipeline object with pickle.
# NOTE(review): no fit call is visible in this notebook -- confirm the pipeline
# has been fitted before it is written out.
with open('pipeline.pickle', 'wb') as fh:
    pickle.dump(serialized_pipeline, fh)

In [49]:
# Persist the per-column dtypes of the training features with pickle.
# NOTE(review): X_train is not assigned in any cell visible here -- confirm its origin.
with open('dtypes.pickle', 'wb') as fh:
    pickle.dump(X_train.dtypes, fh)

In [51]:
# Display the dtype of each training feature column (the same object written to dtypes.pickle).
X_train.dtypes

id                             int64
birth date                    object
job type                      object
school level                  object
domestic status               object
profession                    object
domestic relationship type    object
ethnicity                     object
gender                        object
earned dividends               int64
interest earned                int64
monthly work                   int64
country of origin             object
dtype: object