In [7]:
# Run this if featuretools is not already installed
# !pip install -U featuretools

In [24]:
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft

# ignore warnings from pandas
import warnings
warnings.filterwarnings('ignore')

In [39]:
# Load the three source tables from the Datasets/ folder.
# Date columns are parsed into datetimes while reading.
clients = pd.read_csv(
    'Datasets/clients.csv',
    parse_dates=['joined'],
)
loans = pd.read_csv(
    'Datasets/loans.csv',
    parse_dates=['loan_start', 'loan_end'],
)
payments = pd.read_csv(
    'Datasets/payments.csv',
    parse_dates=['payment_date'],
)

In [43]:
# Preview the first five client records
clients.head(n=5)

Unnamed: 0,client_id,joined,income,credit_score
0,46109,2002-04-16,172677,527
1,49545,2007-11-14,104564,770
2,41480,2013-03-11,122607,585
3,46180,2001-11-06,43851,562
4,25707,2006-10-06,211422,621


In [44]:
# Peek at 10 randomly chosen loans.
# A fixed random_state makes the same rows appear on every re-run,
# so the displayed preview is reproducible after Restart & Run All.
loans.sample(10, random_state=42)

Unnamed: 0,client_id,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate
381,38537,other,9457,1,10276,2004-07-09,2006-06-01,2.37
92,25707,cash,12623,1,11790,2004-09-15,2007-01-23,4.09
15,46109,credit,7339,1,10328,2001-09-24,2003-08-31,1.29
24,49545,cash,10422,1,10709,2008-11-04,2010-09-14,0.05
44,41480,home,14290,0,10888,2003-11-01,2006-04-29,3.76
139,35089,other,773,1,11595,2014-09-26,2016-04-23,7.63
382,38537,other,13130,0,10688,2013-07-12,2015-11-03,0.46
272,44601,other,6129,1,10289,2013-10-07,2015-05-11,2.76
281,44601,other,6232,1,11183,2003-10-17,2006-04-04,6.83
160,35214,other,14767,0,11539,2006-03-08,2007-12-24,1.58


In [45]:
# Peek at 10 randomly chosen payments.
# A fixed random_state makes the same rows appear on every re-run,
# so the displayed preview is reproducible after Restart & Run All.
payments.sample(10, random_state=42)

Unnamed: 0,loan_id,payment_amount,payment_date,missed
701,11411,589,2013-04-08,1
785,11079,1955,2007-03-10,1
1228,10908,874,2010-12-01,0
2992,11618,2443,2011-01-12,1
2669,11238,1611,2006-03-23,1
1615,11733,1139,2006-03-28,1
84,10856,1629,2005-12-03,0
544,11987,1714,2011-06-12,1
3104,10166,2279,2000-06-02,0
3061,10697,1720,2012-03-13,1


# EntitySet
Create a new, empty EntitySet; the clients, loans, and payments tables are added to it as entities in the cells below.

In [40]:
# Start with an empty EntitySet identified as 'clients'; entities are added below
es = ft.EntitySet(id='clients')

In [42]:
# Register the clients dataframe as the 'clients' entity.
# Existing columns are reused: client_id as the index, joined as the time index.
es = es.entity_from_dataframe(
    entity_id='clients',
    dataframe=clients,
    index='client_id',
    time_index='joined',
)

In [46]:
# Register the loans dataframe as the 'loans' entity.
# Existing columns are reused: loan_id as the index, loan_start as the time index.
# 'repaid' is explicitly declared Categorical instead of relying on type inference.
es = es.entity_from_dataframe(
    entity_id='loans',
    dataframe=loans,
    index='loan_id',
    time_index='loan_start',
    variable_types={'repaid': ft.variable_types.Categorical},
)

In [47]:
# Register the payments dataframe as the 'payments' entity.
# The table has no unique identifier column, so make_index=True asks
# featuretools to create one, named 'payment_id'.
# 'missed' is explicitly declared Categorical instead of relying on type inference.
es = es.entity_from_dataframe(
    entity_id='payments',
    dataframe=payments,
    make_index=True,
    index='payment_id',
    time_index='payment_date',
    variable_types={'missed': ft.variable_types.Categorical},
)

In [48]:
# Show the EntitySet summary: its entities and any relationships defined so far
es

Entityset: clients
  Entities:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    No relationships

In [None]:
# Look up the 'loans' entity in the EntitySet and display it
es['loans']