In [2]:
import pandas as pd
import numpy as np

# automated feature engineering
import featuretools as ft


In [4]:
# Catalogue of the primitives featuretools provides; the frame has
# `name`, `type` and `description` columns.
primitives = ft.list_primitives()

# Widen text columns so the primitive descriptions are not truncated.
pd.options.display.max_colwidth = 100

# A notebook only renders a cell's LAST expression automatically. The
# aggregation table used to be a bare expression in the middle of the cell and
# was therefore silently discarded; display it explicitly. `display` is
# injected into the namespace by the IPython/Jupyter kernel.
display(primitives[primitives['type'] == 'aggregation'])

# Last expression of the cell: rendered as the cell output.
primitives[primitives['type'] == 'transform']

Unnamed: 0,name,type,description
19,cum_sum,transform,Calculates the sum of previous values of an instance for each value in a time-dependent entity.
20,mod,transform,Creates a transform feature that divides two features.
21,longitude,transform,Returns the second value on the tuple base feature.
22,time_since,transform,Calculates time since the cutoff time.
23,years,transform,Transform a Timedelta feature into the number of years.
24,not,transform,"For each value of the base feature, negates the boolean value."
25,weeks,transform,Transform a Timedelta feature into the number of weeks.
26,haversine,transform,Calculate the approximate haversine distance in miles between two LatLong variable types.
27,second,transform,Transform a Datetime feature into the second.
28,and,transform,"For two boolean values, determine if both values are 'True'."


为了方便计算，这里将训练集和测试集数据合并在了一起。

In [None]:
# Load every table. reset_index(drop=True) replaces whatever index read_csv
# produced with a fresh 0..n-1 RangeIndex.
app_train = pd.read_csv("app_train_new.csv").reset_index(drop = True)
app_test = pd.read_csv("app_test_new.csv").reset_index(drop = True)
bureau = pd.read_csv("bureau_new.csv").reset_index(drop = True)
bureau_balance = pd.read_csv("bureau_balance_new.csv").reset_index(drop = True)
cash = pd.read_csv('POS_CASH_balance_new.csv').reset_index(drop = True)
credit = pd.read_csv('credit_card_balance_new.csv').reset_index(drop = True)
previous = pd.read_csv('previous_application_new.csv').reset_index(drop = True)
installments = pd.read_csv('installments_payments_new.csv').reset_index(drop = True)

# Tag each row with its origin so the combined frame can be split back into
# train and test later.
app_train['set'] = 'train'
app_test['set'] = 'test'
# Test rows get a NaN TARGET (presumably the test file carries no labels --
# confirm), so both frames expose the same column before being stacked.
app_test["TARGET"] = np.nan

# Stack train on top of test. DataFrame.append was removed in pandas 2.0;
# pd.concat with ignore_index=True produces the same result.
app = pd.concat([app_train, app_test], ignore_index = True)

# Name -> frame registry used by the clean-up and conversion steps.
# (The original was missing the comma after `bureau_balance`, a SyntaxError.)
data_sets = dict(
    app=app,
    bureau=bureau,
    bureau_balance=bureau_balance,
    cash=cash,
    credit=credit,
    previous=previous,
    installments=installments,
)

# "Unnamed: 0" is the name pandas gives an unnamed leading CSV column
# (typically an index written out by to_csv); drop it where present.
# Only the registry entry is replaced -- the module-level variables
# (`app`, `bureau`, ...) keep pointing at the undropped frames.
for ds_name, ds in data_sets.items():
    if "Unnamed: 0" in ds.columns:
        data_sets[ds_name] = ds.drop("Unnamed: 0", axis=1)

## 将日期数据转化成时间序列

因为所有日期相关的列都是相对时间，我们首先选定一个特定的日期作为参考日（这里选定 2016-01-01），然后将所有日期相关的列都根据该参考日转化为时间序列：

In [None]:
# Arbitrary reference date that all relative day/month offsets are added to.
start_date = pd.Timestamp("2016-01-01")

# pandas no longer accepts the ambiguous 'M' unit in to_timedelta. Express a
# month as the average Gregorian month length in days (365.2425 / 12), which
# is the value the legacy 'M' unit stood for.
DAYS_PER_MONTH = 365.2425 / 12

for ds_name, ds in data_sets.items():
    # Relative-time columns already converted for this table; they are dropped
    # together once the scan is done. (Dropping one at a time from the
    # never-rebound `ds` left every column but the last one in place.)
    converted_cols = []
    for col in ds.columns:
        if col.startswith("DAYS_"):
            new_col = col.replace("DAYS_", "TSD_")
            offsets = pd.to_timedelta(ds[col], unit='D')
        elif col.startswith("MONTHS_"):
            new_col = col.replace("MONTHS_", "TSM_")
            offsets = pd.to_timedelta(ds[col] * DAYS_PER_MONTH, unit='D')
        else:
            continue

        # As before, the timestamp column is added to `ds` in place, so the
        # frame object itself gains the TSD_/TSM_ column.
        ds[new_col] = start_date + offsets
        converted_cols.append(col)
        # Report the real new column name (the old message replaced "DAYS"
        # rather than "DAYS_" and printed e.g. "TSD__CREDIT").
        print(ds_name, col, "has been converted into time series", new_col)

    if converted_cols:
        # Rebinding an existing key does not resize the dict, so this is safe
        # while iterating over data_sets.items().
        data_sets[ds_name] = ds.drop(columns=converted_cols)


In [None]:
# Build the EntitySet from the frames held in `data_sets`, not from the raw
# module-level variables: only the `data_sets` entries are guaranteed to have
# the "Unnamed: 0" column removed and the TSD_*/TSM_* timestamp columns that
# serve as time indexes below.
#
# NOTE(review): `app_types` and `previous_types` are not defined in the cells
# visible here -- presumably variable-type override dicts defined elsewhere;
# confirm they exist before a Restart & Run All.
es = ft.EntitySet(id = 'clients')

# Entities with a unique index
es = es.entity_from_dataframe(entity_id = 'app', dataframe = data_sets['app'],
                              index = 'SK_ID_CURR', variable_types = app_types)

es = es.entity_from_dataframe(entity_id = 'bureau', dataframe = data_sets['bureau'],
                              index = 'SK_ID_BUREAU', time_index='TSD_CREDIT')

es = es.entity_from_dataframe(entity_id = 'previous', dataframe = data_sets['previous'],
                              index = 'SK_ID_PREV', time_index = 'TSD_DECISION',
                              variable_types = previous_types)

# Entities that do not have a unique index: make_index=True asks featuretools
# to create the index column named by `index`.
es = es.entity_from_dataframe(entity_id = 'bureau_balance', dataframe = data_sets['bureau_balance'],
                              make_index = True, index = 'bb_index',
                              time_index = 'TSM_BALANCE')

es = es.entity_from_dataframe(entity_id = 'cash', dataframe = data_sets['cash'],
                              make_index = True, index = 'cash_index',
                              time_index = 'TSM_BALANCE')

es = es.entity_from_dataframe(entity_id = 'installments', dataframe = data_sets['installments'],
                              make_index = True, index = 'installments_index',
                              time_index = 'TSD_ENTRY_PAYMENT')

es = es.entity_from_dataframe(entity_id = 'credit', dataframe = data_sets['credit'],
                              make_index = True, index = 'credit_index',
                              time_index = 'TSM_BALANCE')

In [None]:
# Link the entities. Every link joins a parent entity to a child entity through
# a key column that carries the same name on both sides:
#   app      -> bureau, previous             via SK_ID_CURR
#   bureau   -> bureau_balance               via SK_ID_BUREAU
#   previous -> cash, installments, credit   via SK_ID_PREV
# ft.Relationship takes the parent variable first and the child variable second.
(r_app_bureau, r_bureau_balance, r_app_previous,
 r_previous_cash, r_previous_installments, r_previous_credit) = (
    ft.Relationship(es[parent][key], es[child][key])
    for parent, child, key in [
        ('app', 'bureau', 'SK_ID_CURR'),
        ('bureau', 'bureau_balance', 'SK_ID_BUREAU'),
        ('app', 'previous', 'SK_ID_CURR'),
        ('previous', 'cash', 'SK_ID_PREV'),
        ('previous', 'installments', 'SK_ID_PREV'),
        ('previous', 'credit', 'SK_ID_PREV'),
    ]
)

# Register all six relationships on the entity set in one call.
es = es.add_relationships([
    r_app_bureau,
    r_bureau_balance,
    r_app_previous,
    r_previous_cash,
    r_previous_installments,
    r_previous_credit,
])