In [None]:
# https://towardsdatascience.com/automated-feature-engineering-in-python-99baf11cc219

In [None]:
import pandas as pd
import numpy as np
import featuretools as ft
# Raise pandas' text display width to 5000 characters so that wide frames
# printed later in the notebook are not wrapped.
pd.set_option('display.width', 5000)

In [None]:
# Explicit column dtypes handed to read_csv for the loans table.
loans_dtypes = {
    'client_id': np.int64,
    'loan_type': str
}


def dateparse(x):
    """Parse ``YYYY-MM-DD`` date text into pandas datetimes.

    Uses ``pd.to_datetime`` instead of the ``pd.datetime`` alias, which was
    deprecated in pandas 1.0 and removed in pandas 2.0.

    Parameters
    ----------
    x : str or array-like of str
        A single date string or a whole column of them, formatted as
        ``%Y-%m-%d``.

    Returns
    -------
    pandas.Timestamp or pandas.DatetimeIndex
        A Timestamp for scalar input, a DatetimeIndex for array input.

    Raises
    ------
    ValueError
        If a value does not match the ``%Y-%m-%d`` format.
    """
    # to_datetime is vectorised, so it handles both a scalar and a full column.
    return pd.to_datetime(x, format='%Y-%m-%d')

# Read the three source tables. The listed date columns are converted while
# reading via `dateparse`; only the loans table is given explicit dtypes
# (`loans_dtypes` is not applied to payments).
clients = pd.read_csv("../data/clients.csv", sep=",")
loans = pd.read_csv(
    "../data/loans.csv",
    sep=",",
    dtype=loans_dtypes,
    parse_dates=['loan_start', 'loan_end'],
    date_parser=dateparse,
)
payments = pd.read_csv(
    "../data/payments.csv",
    sep=",",
    parse_dates=['payment_date'],
    date_parser=dateparse,
)

In [None]:
# Preview the first rows of the clients table.
clients.head()

In [None]:
# Preview the first rows of the loans table.
loans.head()

In [None]:
# Baseline for comparison: the same kind of aggregation written by hand in
# pandas, without featuretools.
import pandas as pd

# Per-client mean / max / min of loan_amount; the result is indexed by
# client_id and its columns are named <statistic>_loan_amount.
stats = (
    loans.groupby('client_id')['loan_amount']
    .agg(['mean', 'max', 'min'])
    .add_suffix('_loan_amount')
)

# Left-join the statistics onto the clients table so every client is kept.
stats = clients.merge(stats, how='left', left_on='client_id', right_index=True)

stats.head(10)

In [None]:
# Create a new, empty featuretools EntitySet with the id 'clients'; the
# tables are registered on it in the following cells.
es = ft.EntitySet(id = 'clients')

In [None]:
# Register the clients table as the 'clients' entity, using `client_id` as
# the index and `joined` as the time index.
es = es.entity_from_dataframe(
    entity_id='clients',
    dataframe=clients,
    index='client_id',
    time_index='joined',
)

In [None]:
# Register the loans table as the 'loans' entity with `loan_id` as its index.
# NOTE(review): no time_index is passed for this entity, although the loans
# table carries parsed date columns (loan_start / loan_end) — confirm that
# leaving it out is intended.
es = es.entity_from_dataframe(
    entity_id='loans',
    dataframe=loans,
    index='loan_id',
)

In [None]:
# Register the payments table as the 'payments' entity. The table has no
# unique key yet, so make_index=True asks featuretools to create one named
# `payment_id`. `payment_date` is the time index and `missed` is explicitly
# typed as categorical.
es = es.entity_from_dataframe(
    entity_id='payments',
    dataframe=payments,
    index='payment_id',
    make_index=True,
    time_index='payment_date',
    variable_types={'missed': ft.variable_types.Categorical},
)

In [None]:
# Print the entity set after the three tables have been registered.
print(es)

In [None]:
# Display the 'payments' entity as stored in the entity set.
es['payments']

In [None]:
# Link clients to their loans: clients.client_id (first argument, the parent
# side) is referenced by loans.client_id (second argument, the child side).
r_client_previous = ft.Relationship(
    es['clients']['client_id'],
    es['loans']['client_id'],
)
es = es.add_relationship(r_client_previous)

# Link loans to their payments in the same way, through loan_id.
r_payments = ft.Relationship(
    es['loans']['loan_id'],
    es['payments']['loan_id'],
)
es = es.add_relationship(r_payments)

# Show the entity set, now including both relationships.
es

In [None]:
# Run deep feature synthesis for the 'clients' entity, restricted to a
# hand-picked set of aggregation and transform primitives.
features, feature_names = ft.dfs(
    entityset=es,
    target_entity='clients',
    agg_primitives=['mean', 'max', 'percent_true', 'last'],
    trans_primitives=['percentile', 'month', 'year', 'hour'],
)

# List the names of the generated feature columns.
print(features.columns.values)

In [None]:
# Run deep feature synthesis again, this time letting featuretools pick its
# default primitives, limited to a feature depth of 2. This rebinds
# `features` and `feature_names` from the previous cell.
features, feature_names = ft.dfs(
    entityset=es,
    target_entity='clients',
    max_depth=2,
)

# Remove the income and credit_score columns from the feature matrix
# (presumably because the clients table already carries them — verify).
features = features.drop(columns=['income', 'credit_score'])
print(features.columns.values)

In [None]:
# Check what kind of object dfs returned for the feature matrix.
type(features)

In [None]:
# Attach the generated features to the clients table. The feature matrix is
# matched on its index against clients.client_id, and the left join keeps
# every client row.
clients_full = clients.merge(
    features,
    how='left',
    left_on='client_id',
    right_index=True,
)

In [None]:
# Preview the first rows of the merged clients + features table.
clients_full.head()

In [None]:
# Modelling part: use H2O (XGBoost and AutoML below).
# h2o.init() must run before any H2OFrame is created or any model is trained.
import h2o
h2o.init()

In [None]:
# Convert the merged pandas DataFrame (clients_full) into an H2OFrame so that
# H2O can split it and train on it.
hf = h2o.H2OFrame(clients_full)

In [None]:
# Target column to predict.
Y = "credit_score"

# Predictors: every column of the H2O frame except the target.
# NOTE(review): this keeps all remaining columns, including client_id —
# confirm the identifier is meant to be used as a predictor.
x = hf.names
x.remove(Y)

In [None]:
# Split the frame into a training part (ratio 0.7) and a test part (the
# remainder); the seed makes the split repeatable.
train, test = hf.split_frame(ratios=[0.7], seed=42)

In [None]:
from h2o.estimators.xgboost import H2OXGBoostEstimator

# Train an XGBoost model with 3-fold cross-validation on the training split;
# the test split is supplied as the validation frame.
xgb = H2OXGBoostEstimator(seed=1, nfolds=3)
xgb.train(
    x=x,
    y=Y,
    training_frame=train,
    validation_frame=test,
)

# Print the model summary and its metrics.
print(xgb)

In [None]:
from h2o.automl import H2OAutoML

# Let H2O AutoML search for models for up to 240 seconds on the training split.
# A seed is set so this step is seeded like the split and the XGBoost model.
# Because the search is bounded by wall-clock time, the set of models that
# get trained can still differ between runs.
autoML = H2OAutoML(max_runtime_secs=240, seed=1)
autoML.train(
    x=x,
    y=Y,
    training_frame=train,
)

# Show the ranking of the models AutoML trained.
leaderboard = autoML.leaderboard
print(leaderboard)