In [1]:
import os

# Directory holding the competition CSV files. Set the HOME_CREDIT_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# local path is used. os.path.join(..., '') guarantees a trailing separator,
# because the loading cell builds file names with plain `home + '<name>.csv'`.
home = os.path.join(
    os.environ.get(
        'HOME_CREDIT_DATA_DIR',
        '/Users/mjnitz02/.kaggle/competitions/home-credit-default-risk/'),
    '')

In [2]:
import pandas
import numpy
import sklearn
import sklearn.feature_selection
from sklearn import preprocessing

## Import Data

In [3]:
# Read the three CSV tables from the `home` directory, in the order
# application_train -> bureau -> previous_application.
df_application_train, df_bureau, df_previous_application = [
    pandas.read_csv(home + csv_name)
    for csv_name in (
        'application_train.csv',
        'bureau.csv',
        'previous_application.csv',
    )
]

## Identify different groups

In [4]:
# Distinct SK_ID_CURR values found in the training application table.
target_ids = df_application_train.SK_ID_CURR.unique()

In [5]:
# Distinct SK_ID_CURR values of training applications that also occur in
# df_bureau. A membership filter replaces the original inner merge, which
# repeated every application row (with all of its columns) once per matching
# bureau row only to read SK_ID_CURR back out. The IDs are returned in order of
# first appearance in df_application_train, matching the left-key order an
# inner merge is documented to preserve.
target_bureau_ids = df_application_train.loc[
    df_application_train['SK_ID_CURR'].isin(df_bureau['SK_ID_CURR']),
    'SK_ID_CURR'].unique()

In [6]:
# Training applications whose SK_ID_CURR is listed in target_bureau_ids.
target_with_bureau = df_application_train.loc[
    df_application_train['SK_ID_CURR'].isin(target_bureau_ids)]

In [7]:
# Distinct SK_ID_CURR values of training applications that also occur in
# df_previous_application. A membership filter replaces the original inner
# merge, which repeated every application row (with all of its columns) once
# per matching previous-application row only to read SK_ID_CURR back out. The
# IDs are returned in order of first appearance in df_application_train,
# matching the left-key order an inner merge is documented to preserve.
target_previous_ids = df_application_train.loc[
    df_application_train['SK_ID_CURR'].isin(df_previous_application['SK_ID_CURR']),
    'SK_ID_CURR'].unique()

In [8]:
# Training applications whose SK_ID_CURR is listed in target_previous_ids.
target_with_previous = df_application_train.loc[
    df_application_train['SK_ID_CURR'].isin(target_previous_ids)]

In [9]:
# Distinct SK_ID_CURR values of training applications that occur in BOTH
# df_bureau and df_previous_application. The original chained two inner merges,
# so each ID was expanded to (bureau rows x previous-application rows) copies of
# the full application row before .unique() collapsed them again. Two
# membership masks select the same IDs, in order of first appearance in
# df_application_train, without building that intermediate frame.
target_bureau_previous_ids = df_application_train.loc[
    df_application_train['SK_ID_CURR'].isin(df_bureau['SK_ID_CURR'])
    & df_application_train['SK_ID_CURR'].isin(df_previous_application['SK_ID_CURR']),
    'SK_ID_CURR'].unique()

In [10]:
# Training applications whose SK_ID_CURR is listed in target_bureau_previous_ids.
target_with_bureau_and_previous = df_application_train.loc[
    df_application_train['SK_ID_CURR'].isin(target_bureau_previous_ids)]

## Feature Evaluation

In [23]:
def examine_variable(df, variable):
    """Summarise one column of ``df`` against its ``TARGET`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``variable`` and a ``TARGET`` column.
    variable : str
        Name of the column to examine.

    Returns
    -------
    dict
        ``variable_type`` (dtype of the raw column), ``mutual_information``
        (result of ``get_mutual_information``), ``stats`` (result of
        ``get_basic_statistics``), ``number_of_nan`` and
        ``count_unique_values`` (both computed on the raw column).

    Notes
    -----
    Object-dtype columns are label-encoded to integer codes before the mutual
    information and statistics are computed, so ``stats`` for those columns
    describes the codes, not the original labels. Missing values in such a
    column are encoded as their own category.
    """
    print('processing: %s' % variable)

    # Sentinel label standing in for missing values during encoding.
    missing_label = '__missing__'

    raw = df[variable]
    series = raw
    target = df['TARGET']

    if raw.dtype == numpy.object_:
        # An object column with NaN mixes str labels with float NaN; depending
        # on the scikit-learn version LabelEncoder then fails to sort the
        # classes or handles NaN inconsistently. Filling with a string sentinel
        # keeps every label a str and gives missing values a stable code.
        label_encoder = preprocessing.LabelEncoder()
        encoded = label_encoder.fit_transform(raw.fillna(missing_label).values)
        # Keep the original index so the encoded series lines up with `target`.
        series = pandas.Series(encoded, index=raw.index)

    # NOTE(review): the integer codes are handed to get_mutual_information
    # as-is; confirm whether they should be flagged as a discrete feature.
    return {
        'variable_type': raw.dtype,
        'mutual_information': get_mutual_information(series, target),
        'stats': get_basic_statistics(series),
        'number_of_nan': count_nan_values(raw),
        'count_unique_values': count_unique_values(raw)
    }

In [12]:
def get_mutual_information(series, target, discrete_features='auto', random_state=0):
    """Estimate the mutual information between one feature and the target.

    Parameters
    ----------
    series : pandas.Series
        Feature values; must be numeric (or already encoded).
    target : pandas.Series
        Class labels, positionally aligned with ``series`` (same length).
    discrete_features : 'auto', bool or array-like, optional
        Forwarded to ``mutual_info_classif``. Pass ``True`` for label-encoded
        categorical features. Defaults to ``'auto'``, the library default.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``mutual_info_classif``. A fixed default makes repeated
        runs return the same estimate; pass ``None`` for unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        One-element array holding the estimated mutual information.

    Notes
    -----
    Rows where ``series`` is missing are excluded, because
    ``mutual_info_classif`` rejects NaN input. If no observed rows remain,
    the underlying scikit-learn call raises.
    """
    # Positional mask so series and target need not share an index.
    observed = series.notnull().values
    features = series.values[observed].reshape(-1, 1)
    labels = target.values[observed]
    return sklearn.feature_selection.mutual_info_classif(
        features,
        labels,
        discrete_features=discrete_features,
        random_state=random_state)

In [13]:
def get_basic_statistics(series):
    """Return the minimum, maximum and mean of a numeric series.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to summarise.

    Returns
    -------
    dict
        Keys ``'minimum'``, ``'maximum'`` and ``'average'``.

    Notes
    -----
    Uses pandas reductions, which skip missing values. The builtin
    ``min``/``max``/``sum`` on a list containing NaN return NaN or an
    order-dependent value, so any column with gaps produced meaningless
    statistics. An empty or all-missing series yields NaN for every entry.
    """
    return {
        'minimum': series.min(),
        'maximum': series.max(),
        'average': series.mean(),
    }

In [14]:
def count_nan_values(series):
    """Return how many entries of ``series`` are missing (NaN/None)."""
    missing_mask = series.isna()
    return missing_mask.sum()

In [15]:
def count_unique_values(series):
    """Return the number of distinct values in ``series``.

    The count is the length of ``series.unique()``.
    """
    distinct_values = series.unique()
    return len(distinct_values)

In [26]:
# Examine the columns at positions 2 through 7 of target_with_bureau (the
# slice skips the first two columns) and store each summary under its column
# name.
results = {}
for index, column in enumerate(target_with_bureau.columns[2:8], start=2):
    results[column] = examine_variable(target_with_bureau, column)

processing: NAME_CONTRACT_TYPE
processing: CODE_GENDER
processing: FLAG_OWN_CAR
processing: FLAG_OWN_REALTY
processing: CNT_CHILDREN
processing: AMT_INCOME_TOTAL


In [27]:
results

{'NAME_CONTRACT_TYPE': {'variable_type': dtype('O'),
  'mutual_information': array([0.00098732]),
  'stats': {'minimum': 0, 'maximum': 1, 'average': 0.09401080112793227},
  'number_of_nan': 0,
  'count_unique_values': 2},
 'CODE_GENDER': {'variable_type': dtype('O'),
  'mutual_information': array([0.00706695]),
  'stats': {'minimum': 0, 'maximum': 2, 'average': 0.33648208098189314},
  'number_of_nan': 0,
  'count_unique_values': 3},
 'FLAG_OWN_CAR': {'variable_type': dtype('O'),
  'mutual_information': array([0.00720169]),
  'stats': {'minimum': 0, 'maximum': 1, 'average': 0.3483230926293498},
  'number_of_nan': 0,
  'count_unique_values': 2},
 'FLAG_OWN_REALTY': {'variable_type': dtype('O'),
  'mutual_information': array([0.02660635]),
  'stats': {'minimum': 0, 'maximum': 1, 'average': 0.6901260384605167},
  'number_of_nan': 0,
  'count_unique_values': 2},
 'CNT_CHILDREN': {'variable_type': dtype('int64'),
  'mutual_information': array([0.00256865]),
  'stats': {'minimum': 0, 'maximum