In [1]:
from sklearn.preprocessing import LabelEncoder
from xverse.transformer import WOE
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

In [2]:
# Put ../scripts on the import path so modules located there (e.g. the
# feature_engineering module imported in the next cell) can be found.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/data.csv')

In [3]:
from feature_engineering import (create_aggregate_features,extract_temporal_features,
    one_hot_encode,
    label_encode,
    handle_missing_values,
    normalize_features,
    standardize_features,
    calculate_woe_iv
)

In [4]:
# Build aggregate features keyed on CustomerId from the Amount column.
# Per the printed preview, the result has one row per customer with
# Total_/Average_/Std_Transaction_Amount and Transaction_Count columns;
# Std_Transaction_Amount is NaN in the rows where Transaction_Count is 1.
data_with_aggregates = create_aggregate_features(
    data,
    customer_id_col='CustomerId',
    amount_col='Amount',
)

# Plain-text preview of the first five rows.
print(data_with_aggregates.head(n=5))

        CustomerId  Total_Transaction_Amount  Average_Transaction_Amount  \
0     CustomerId_1                  -10000.0               -10000.000000   
1    CustomerId_10                  -10000.0               -10000.000000   
2  CustomerId_1001                   20000.0                 4000.000000   
3  CustomerId_1002                    4225.0                  384.090909   
4  CustomerId_1003                   20000.0                 3333.333333   

   Transaction_Count  Std_Transaction_Amount  
0                  1                     NaN  
1                  1                     NaN  
2                  5             6558.963333  
3                 11              560.498966  
4                  6             6030.478146  


In [5]:
# Run the project's temporal-feature helper on the loaded frame using its
# default arguments.
# NOTE(review): presumably derives features from TransactionStartTime --
# confirm against feature_engineering.extract_temporal_features.
data_with_temporal_features = extract_temporal_features(data)

# Plain-text preview of the first five rows.
print(data_with_temporal_features.head(n=5))

         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory    ChannelId   Amount  Value TransactionStartT

In [6]:
# One-hot encode the loaded frame with the helper's default column selection
# (no columns are passed explicitly here). The printed preview shows indicator
# columns such as ChannelId_ChannelId_1 in the result.
data_one_hot_encoded = one_hot_encode(data)

# Plain-text preview of the first five rows.
print(data_one_hot_encoded.head(n=5))


         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId     ProductId   Amount  Value TransactionStartTime  \
0  CustomerId_4406  ProductId_10   1000.0   1000  2018-11-15 02:18:49   
1  CustomerId_4406   ProductId_6    -20.0     20  2018-11-15 02:19:08   
2  CustomerId_4683   ProductId_1    500.0    500  2018-11-15 02:44:21   
3   CustomerId_988  ProductId_21  20000.0  21800  2018-11-15 03:32:55   
4   CustomerId_988   ProductId_6   -644.0    644  2018-11-15 03:34:21   

   PricingStrategy  ...  ChannelId_ChannelId_1  ChannelId_ChannelId_2  \
0            

In [7]:
# Label-encode the listed columns. The helper returns two values: the encoded
# frame and `encoders` (presumably the fitted per-column encoders -- confirm
# against feature_engineering.label_encode).
# Note: CountryCode is included even though it prints as a number (256) in the
# un-encoded previews; after encoding it shows as 0 in the preview rows.
data_label_encoded, encoders = label_encode(
    data,
    categorical_columns=[
        'CurrencyCode',
        'CountryCode',
        'ProductCategory',
        'ChannelId',
        'ProviderId',
    ],
)

# Plain-text preview of the first five rows.
print(data_label_encoded.head(n=5))


         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId  CurrencyCode  CountryCode  ProviderId     ProductId  \
0  CustomerId_4406             0            0           5  ProductId_10   
1  CustomerId_4406             0            0           3   ProductId_6   
2  CustomerId_4683             0            0           5   ProductId_1   
3   CustomerId_988             0            0           0  ProductId_21   
4   CustomerId_988             0            0           3   ProductId_6   

   ProductCategory  ChannelId   Amount  Value TransactionStartTime  \
0   

In [8]:
# Run the project's missing-value handler on the loaded frame with its default
# arguments; the fill/drop strategy is whatever
# feature_engineering.handle_missing_values defaults to. The result is stored
# but not displayed in this cell.
data_filled = handle_missing_values(data)

In [9]:
# Run the project's normalization helper with its default arguments.
# NOTE(review): the input is the originally loaded `data`, not `data_filled`
# from the missing-value cell -- confirm this is intended.
data_normalized = normalize_features(data)

In [10]:
# Run the project's standardization helper with its default arguments.
# NOTE(review): the input is the originally loaded `data`, not `data_filled`
# or `data_normalized` -- confirm this is intended.
data_standardized = standardize_features(data)