In [40]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

import copy
import datetime
import functools
import json, pickle
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer, confusion_matrix, roc_curve
from sklearn.linear_model import LinearRegression
from sklearn.base import TransformerMixin

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline

In [19]:
# Load the raw training data. The path is relative to the notebook's working
# directory; later cells read the label from the `target` column of this frame.
input_file_path = 'data/train.csv'
df_initial = pd.read_csv(input_file_path)

In [20]:
def rename_columns(df):
    """Return a copy of ``df`` whose column labels use underscores, not spaces.

    ``DataFrame.rename(columns=...)`` is used instead of ``rename_axis``:
    handing a mapping to ``rename_axis`` does not relabel columns in current
    pandas releases (that method targets the axis *name*).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.
    """
    names_mapping = {col: col.replace(' ', '_') for col in df.columns}
    return df.rename(columns=names_mapping)

In [21]:
# Numeric encoding of the raw `school_level` labels; `convert_school_level`
# looks values up here and the pipeline stores the result as `school_years`.
# Several labels intentionally share a value (e.g. both 'secondary' spellings).
school_level_mapping = {
    # Kindergarten and primary
    'kindergarten': 0, 'primary 1 through 4': 2, 'primary school': 4,
    # Secondary
    'secondary-5 through 6': 6, 'secondary-7 through 8': 8, 'secondary-9': 9,
    '10th': 10, 'secondary 11': 11, 'secondary': 12, 'secondary 12': 12,
    # Vocational and college
    'basic vocational': 14, 'entry level college': 14,
    'advanced vocational': 15, 'college graduate': 16,
    # Post graduate
    'some post graduate': 18, 'advanced post graduate': 20,
}

In [55]:
def extract_relevant_professions(profession):
    """Keep the professions that get their own category; map the rest to 'other'."""
    if profession in ('C-level', 'specialist technician'):
        return profession
    return 'other'


def extract_relevant_job_types(job_type):
    """Keep the two self-employment job types; map every other value to 'other'."""
    if job_type in ('self-emp-inc', 'self-emp-not-inc'):
        return job_type
    return 'other'


def extract_relevant_domestic_status(domestic_status):
    """Keep the domestic statuses encoded individually; map the rest to 'other'."""
    kept_statuses = ('married 2', 'single', 'd', 'divorce pending')
    if domestic_status in kept_statuses:
        return domestic_status
    return 'other'


def extract_relevant_domestic_relationship_types(domestic_relationship_type):
    """Keep the relationship types encoded individually; map the rest to 'other'."""
    kept_types = (
        'has spouse',
        'living with child',
        'never married',
        'not living with family',
    )
    if domestic_relationship_type in kept_types:
        return domestic_relationship_type
    return 'other'


def drop_variables(df_input, variables_to_drop):
    """Return a new frame without the listed columns; ``df_input`` is not modified.

    All columns are removed in a single ``drop`` call, which already returns a
    new frame, instead of deep-copying the input and then copying it again once
    per dropped column.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Source frame.
    variables_to_drop : iterable of str
        Column labels to remove.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If any requested label is not a column of ``df_input``.
    """
    return df_input.drop(columns=list(variables_to_drop))


def group_spouses(record):
    """Merge 'has husband' / 'has wife' into 'has spouse'; pass other values through."""
    if record in ('has husband', 'has wife'):
        return "has spouse"
    return record


def years_old(df, reference_date=None):
    """Approximate each row's age in years from its ``birth_date`` column.

    The reference point is a ``pandas.Timestamp`` rather than a
    ``datetime.date``: current pandas does not support subtracting a
    datetime64 Series from a plain ``date`` object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``birth_date`` column parseable by ``pd.to_datetime``.
    reference_date : date-like, optional
        Day the age is measured at. Defaults to today, so results change from
        day to day; pass a fixed value for reproducible features.

    Returns
    -------
    pandas.Series
        Whole days elapsed divided by 365 (leap days are not accounted for).
    """
    if reference_date is None:
        # Midnight today: only the calendar date matters, as in the original.
        reference = pd.Timestamp.now().normalize()
    else:
        reference = pd.Timestamp(reference_date).normalize()
    elapsed = reference - pd.to_datetime(df.birth_date)
    return elapsed.dt.days / 365


def convert_school_level(val):
    """Look up ``val`` in ``school_level_mapping``.

    Raises ``KeyError`` when the label is not a key of the mapping.
    """
    encoded = school_level_mapping[val]
    return encoded


def binarize_interest_earned(df):
    """Return a boolean Series: True where ``interest_earned`` is strictly positive."""
    earned = df.interest_earned
    has_interest = earned > 0
    return has_interest


def is_immigrant(df):
    """Return a boolean Series: True where ``country_of_origin`` is not 'u.s.'.

    The previous comparison used ``==`` and therefore flagged exactly the rows
    whose origin *is* 'u.s.', the opposite of what the name promises.

    Note that a missing origin also compares as "not 'u.s.'" and yields True.
    """
    return df.country_of_origin != 'u.s.'


def group_job_types(val):
    """Look up ``val`` in the module-level ``job_type_map``.

    NOTE(review): ``job_type_map`` is not defined in any cell visible here;
    confirm it exists before relying on this helper on a fresh kernel.
    """
    grouped = job_type_map[val]
    return grouped


def is_white(df):
    """Return a boolean Series: True where ``ethnicity`` equals 'white and privileged'."""
    ethnicity = df.ethnicity
    matches = ethnicity == 'white and privileged'
    return matches


def is_currently_single(df):
    """Return a boolean Series: True where ``domestic_status`` is one of
    'single', 'd' or 'spouse passed'.
    """
    single_statuses = ['single', 'd', 'spouse passed']
    return df.domestic_status.isin(single_statuses)

def get_dummies(df, feature, relevant_labels, prefix_sep=''):
    """Replace ``feature`` with one boolean indicator column per label.

    Unlike the previous version, the caller's frame is not modified: the
    indicators are written to the frame returned by ``drop``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``feature``.
    feature : str
        Name of the column to encode; it is absent from the result.
    relevant_labels : iterable of str
        Values that each get an indicator column, appended in this order.
    prefix_sep : str, optional
        Text placed between ``feature`` and the label in the new column name.
        The default ``''`` reproduces the original naming (e.g. ``'job_typeother'``).

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns followed by the indicators.
    """
    source = df[feature]
    result = df.drop(feature, axis=1)
    for label in relevant_labels:
        result[feature + prefix_sep + label] = (source == label)
    return result


def get_domestic_dummies(df, relevant):
    """Replace ``domestic_status`` with ``'domestic_status.<label>'`` indicator columns.

    The previous version always failed on a bare ``df.drop()`` call, returned
    nothing, and ignored ``relevant``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``domestic_status`` column. It is not modified.
    relevant : iterable of str or None
        Labels to create indicators for. ``None`` selects the built-in list
        ('married 2', 'single', 'd', 'divorce pending', 'other').
        NOTE(review): the original never read this argument; using it as the
        label list is the presumed intent, confirm against callers.

    Returns
    -------
    pandas.DataFrame
        New frame without ``domestic_status`` and with one boolean column per label.
    """
    if relevant is None:
        statuses = ['married 2', 'single', 'd', 'divorce pending', 'other']
    else:
        statuses = list(relevant)
    status = df.domestic_status
    result = df.drop('domestic_status', axis=1)
    for s in statuses:
        result['domestic_status.' + s] = (status == s)
    return result

In [61]:
def pipeline_2(df_in, **kwargs):
    """Turn a raw observation frame into the feature frame used by the model.

    Steps, in order:
      1. work on a deep copy and replace spaces in column names by underscores;
      2. drop 'id', 'gender', 'earned_dividends', 'country_of_origin', 'ethnicity';
      3. derive ``age`` from ``birth_date`` and ``school_years`` from
         ``school_level``, dropping both source columns;
      4. collapse rare categories to 'other' and expand each categorical column
         into boolean indicator columns.

    ``**kwargs`` is accepted and ignored. ``df_in`` is not modified.
    """
    frame = rename_columns(copy.deepcopy(df_in))
    frame = drop_variables(
        frame, ['id', 'gender', 'earned_dividends', 'country_of_origin', 'ethnicity']
    )

    # Age replaces the birth date.
    frame['age'] = years_old(frame)
    frame = drop_variables(frame, ['birth_date'])

    # School level expressed in years replaces the label.
    frame['school_years'] = frame.school_level.apply(convert_school_level)
    frame = drop_variables(frame, ['school_level'])

    # Husbands and wives are merged before the relationship types are bucketed.
    frame['domestic_relationship_type'] = frame.domestic_relationship_type.apply(group_spouses)

    # (column, bucketing function, labels that get an indicator column).
    # The order matters: it determines the column order of the result.
    categorical_steps = (
        (
            'domestic_relationship_type',
            extract_relevant_domestic_relationship_types,
            ['has spouse', 'living with child', 'never married', 'not living with family', 'other'],
        ),
        (
            'profession',
            extract_relevant_professions,
            ['C-level', 'specialist technician', 'other'],
        ),
        (
            'job_type',
            extract_relevant_job_types,
            ['self-emp-inc', 'self-emp-not-inc', 'other'],
        ),
        (
            'domestic_status',
            extract_relevant_domestic_status,
            ['married 2', 'single', 'd', 'divorce pending', 'other'],
        ),
    )
    for column, bucket, labels in categorical_steps:
        frame[column] = frame[column].apply(bucket)
        frame = get_dummies(frame, column, labels)

    return frame

In [62]:
class PreProcessingTransformer(TransformerMixin):
    """Pipeline step that applies ``pipeline_2``; ``fit`` learns nothing."""

    def __init__(self):
        pass

    def fit(self, *_):
        """Accept and ignore any arguments; return this transformer."""
        return self

    def transform(self, df, *_):
        """Run ``pipeline_2`` on ``df``, printing a progress line and the output columns."""
        print("Running pipeline")
        processed = pipeline_2(df)
        print(processed.columns)
        return processed

In [63]:
def cut_predictions_by_threshold(y, threshold):
    """Turn continuous scores into 0.0 / 1.0 labels.

    A score strictly greater than ``threshold`` becomes 1.0, anything else 0.0.
    The previous ``np.ceil(y - threshold)`` only did this for scores within one
    unit of the threshold: a regression output such as 1.6 with threshold 0.5
    became 2.0, and scores far below the threshold became negative.

    Parameters
    ----------
    y : array-like, pandas.Series or scalar
        Predicted scores.
    threshold : float
        Decision threshold.

    Returns
    -------
    Same container kind as ``np.greater(y, threshold)`` (ndarray, Series or
    NumPy scalar), with float dtype. NaN scores map to 0.0.
    """
    above = np.greater(y, threshold)
    return above.astype(float)

In [71]:
# Preprocessing + regression, with `pipeline_2` wrapped as a stateless transformer.
# `validate=False` lets the raw DataFrame reach `pipeline_2` unchanged.
# The `normalize` argument is no longer passed to LinearRegression: it has been
# removed from scikit-learn, and False was its default, so the model is the same.
serialized_pipeline = make_pipeline(
    FunctionTransformer(func=pipeline_2, validate=False),
    LinearRegression(),
)

In [65]:
# Class-based variant of the same pipeline.
# NOTE(review): this rebinds `serialized_pipeline`. On a top-to-bottom run it
# replaces the FunctionTransformer version defined in the previous cell, whereas
# the recorded fit output shows the FunctionTransformer version; keep only one.
# The `normalize` argument is no longer passed to LinearRegression: it has been
# removed from scikit-learn, and False was its default, so the model is the same.
serialized_pipeline = make_pipeline(
    PreProcessingTransformer(),
    LinearRegression(),
)

In [27]:
# Split the loaded frame into labels and features, working on copies so that
# `df_initial` keeps its `target` column.
y_train = copy.deepcopy(df_initial.target)
X_train = copy.deepcopy(df_initial)
# `pop` removes `target` from X_train in place and returns the removed column,
# which is what this cell displays.
X_train.pop('target')

0       1
1       1
2       0
3       0
4       1
5       1
6       1
7       0
8       1
9       1
10      1
11      0
12      1
13      1
14      1
15      1
16      1
17      1
18      0
19      1
20      1
21      0
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
       ..
8134    1
8135    1
8136    1
8137    1
8138    1
8139    1
8140    1
8141    1
8142    1
8143    1
8144    1
8145    1
8146    1
8147    1
8148    1
8149    1
8150    1
8151    1
8152    1
8153    1
8154    1
8155    1
8156    1
8157    0
8158    1
8159    1
8160    1
8161    1
8162    1
8163    1
Name: target, Length: 8164, dtype: int64

In [72]:
# Fit preprocessing and regression on the full training frame.
# NOTE(review): the displayed `target` values are 0/1 integers while the final
# step is a LinearRegression; confirm a regressor is intended for this label.
serialized_pipeline.fit(X_train, y_train)

  """


Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function pipeline_2 at 0x7fb202013048>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          validate=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [34]:
# A single hand-written observation keyed by the raw column names (with
# spaces), used below to smoke-test prediction.
obs = {
    'birth date': '1983-12-26',
    'job type': 'private',
    'school level': 'secondary',
    'domestic status': 'single',
    'profession': 'mechanic',
    'domestic relationship type': 'never married',
    'ethnicity': 'afro american',
    'gender': 'Female',
    'earned dividends': 0,
    'interest earned': 0,
    'monthly work': 160,
    'country of origin': 'u.s.',
    'id': 1,
}

In [35]:
# Echo the sample observation as this cell's output.
obs

{'birth date': '1983-12-26',
 'country of origin': 'u.s.',
 'domestic relationship type': 'never married',
 'domestic status': 'single',
 'earned dividends': 0,
 'ethnicity': 'afro american',
 'gender': 'Female',
 'id': 1,
 'interest earned': 0,
 'job type': 'private',
 'monthly work': 160,
 'profession': 'mechanic',
 'school level': 'secondary'}

In [37]:
# Wrap the observation in a one-row frame whose columns follow X_train's
# column order.
res = pd.DataFrame([obs], columns=X_train.columns)

In [79]:
# Score the one-row frame with the fitted pipeline; the result is shown as a
# list in the next cell.
a = serialized_pipeline.predict(res)

  """


In [81]:
# Show the prediction as a plain Python list.
a.tolist()

[1.0096318962496686]

In [74]:
# Save the ordered feature column names as JSON.
# presumably read by the code under app/ to rebuild input frames; verify there.
with open('app/columns.json', 'w') as fh:
    json.dump(X_train.columns.tolist(), fh)

In [75]:
# Persist the fitted pipeline.
# pickle stores functions and classes by reference (module + name), so the
# preprocessing callable used by the pipeline must be importable under the same
# name wherever this file is loaded.
with open('app/pipeline.pickle', 'wb') as fh:
    pickle.dump(serialized_pipeline, fh)

In [76]:
# Persist the dtype of every training feature column.
# presumably so incoming observations can be cast to matching dtypes; verify
# against the loader.
with open('app/dtypes.pickle', 'wb') as fh:
    pickle.dump(X_train.dtypes, fh)

In [26]:
def split_df(df, test_size=0.3, random_state=None):
    """Split ``df`` into train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``target`` column; every other column is a feature.
    test_size : float, optional
        Passed to ``train_test_split``; defaults to the original 0.3.
    random_state : int or None, optional
        Seed passed to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an integer for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop('target', axis=1)
    labels = df.target
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    return X_tr, X_te, y_tr, y_te