In [2]:
#from myScripts import featureengineering_OOP_pipelines
import pandas as pd
import pandas as pd
import numpy as np
import datetime

from matplotlib import pyplot as plt
from functools import reduce
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion, Pipeline 
#from myScripts.PandasFeatureUnion import PandasFeatureUnion

### Custom Transformer Classes (with pipelines)

In [3]:
class simpleFormating():
    def __init__(self, path_to_analysis_dataset="Analysis_dataset.csv"):
        self.path_to_analysis_dataset = path_to_analysis_dataset
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        ## Drop unnamed column
        analysis_df = pd.read_csv(self.path_to_analysis_dataset).drop('Unnamed: 0', axis = 1)
        
        ## Convert signup and purchase times to pandas datetime
        analysis_df.signup_time = pd.to_datetime(analysis_df.signup_time, format = '%m/%d/%Y %H:%M')
        analysis_df.purchase_time = pd.to_datetime(analysis_df.purchase_time, format = '%m/%d/%Y %H:%M')
        
        ## Fill missing values with NA
        analysis_df = analysis_df.fillna('NA')
        return analysis_df
    
    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

analysis_df = simpleFormating().fit_transform("Analysis_dataset.csv")
analysis_df

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,285108,2015-07-15 04:36:00,2015-09-10 14:17:00,31,HZAKVUFTDOSFD,Direct,Chrome,M,49,2.818400e+09,0,United States
1,131009,2015-01-24 12:29:00,2015-04-13 04:53:00,31,XGQAJSOUJIZCC,SEO,IE,F,21,3.251268e+09,0,United Kingdom
2,328855,2015-03-11 00:54:00,2015-04-05 12:23:00,16,VCCTAYDCWKZIY,Direct,IE,M,26,2.727760e+09,0,United States
3,229053,2015-01-07 13:19:00,2015-01-09 10:12:00,29,MFFIHYNXCJLEY,SEO,Chrome,M,34,2.083420e+09,0,Korea Republic of
4,108439,2015-02-08 21:11:00,2015-04-09 14:26:00,26,WMSXWGVPNIFBM,Ads,FireFox,M,33,3.207913e+09,0,Brazil
...,...,...,...,...,...,...,...,...,...,...,...,...
119995,173124,2015-01-22 05:04:00,2015-05-14 09:46:00,46,BKLQTHLSBBFNT,SEO,FireFox,M,24,3.825335e+09,0,
119996,167468,2015-07-02 21:05:00,2015-08-25 13:29:00,44,MQYOCEZHCTIIJ,SEO,Chrome,F,35,1.697438e+08,0,
119997,379065,2015-03-14 19:43:00,2015-05-06 15:09:00,48,TPVDXCUWUGJFV,Ads,Chrome,M,23,3.778213e+09,1,
119998,19916,2015-03-15 09:51:00,2015-05-23 01:37:00,24,KTXGKQGOVLTAR,SEO,Chrome,M,32,4.270132e+09,0,


In [25]:
### Calculate ratio of fraudulent transaction by each categorical variable
class calculateRatioFraud():
    def __init__(self, sel_var=None):
        self.sel_var = sel_var

    def fit(self,X,y=None):
        return self 

    def transform(self,X):
        # copy the original df to tmp
        tmp = X.copy()
        # group by the variable of interest (country) and class
        tmp = tmp.groupby([self.sel_var, 'class']).user_id.nunique()\
        .unstack(level = 1)\
        .reset_index()\
        .rename(columns = {0:'Not Fraud', 1: 'Fraud'}).fillna(0.0)     
        # create two new variables in tmp df
        tmp['ratio_fraud_' + self.sel_var] = tmp['Fraud']/(tmp['Fraud'] + tmp['Not Fraud'])
        tmp['num_trans_' + self.sel_var] = tmp['Fraud'] + tmp['Not Fraud']        
        return X[['user_id', self.sel_var]]\
            .merge(tmp[[self.sel_var, 'ratio_fraud_' + self.sel_var, 'num_trans_' + self.sel_var]], on = self.sel_var)
        
    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

fraud_by_country = calculateRatioFraud('country').fit_transform(analysis_df)
fraud_by_country


Unnamed: 0,user_id,country,ratio_fraud_country,num_trans_country
0,285108,United States,0.096830,46184.0
1,328855,United States,0.096830,46184.0
2,178528,United States,0.096830,46184.0
3,182104,United States,0.096830,46184.0
4,97677,United States,0.096830,46184.0
...,...,...,...,...
119995,173124,,0.085142,17418.0
119996,167468,,0.085142,17418.0
119997,379065,,0.085142,17418.0
119998,19916,,0.085142,17418.0


In [27]:
### Calculate time between sign up and purchase
class calculateTimeLatency():
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        X['purchase_time'] = pd.to_datetime(X['purchase_time'])
        X['signup_time'] = pd.to_datetime(X['signup_time'])
        X['time_latency'] = (X['purchase_time'] - X['signup_time']).dt.total_seconds()/60/60
        return X[['user_id','time_latency']]
    
    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
    
latency_df = calculateTimeLatency().fit_transform(analysis_df)
latency_df

Unnamed: 0,user_id,time_latency
0,285108,1377.683333
1,131009,1888.400000
2,328855,611.483333
3,229053,44.883333
4,108439,1433.250000
...,...,...
119995,173124,2692.700000
119996,167468,1288.400000
119997,379065,1267.433333
119998,19916,1647.766667


In [21]:
### Encode multiple columns at once
class MultiColumnLabelEncoder:
    
    def __init__(self, columns = None):
        self.columns = columns 
        
    def fit(self,X,y=None):
        return self 

    def transform(self,X):
        '''
        Transform specified columns, if no specification, transform all columns
        '''
        output = X.copy()
        #output = output.apply(lambda col: preprocessing.LabelEncoder().\
        #fit_transform(col.astype(str)), axis=0, result_type='expand')
        if self.columns is not None:
            for col in self.columns:
                output[:,col] = preprocessing.LabelEncoder().fit_transform(output[:,col])
        else:
            for col in range(output.shape[1]):
                output[:,col] = preprocessing.LabelEncoder().fit_transform(output[:,col])
        
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
    

In [22]:
### Select specified features
class subsetFeatures():
    def __init__(self, cols = ['user_id', 'purchase_value', 'class']):
        self.cols = cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.cols]
    
    def fit_transform(self, X, y=None):
        return self.fit(X,y).transform(X)


In [23]:
### Delete specified columns
class customSelector():
    
    def __init__(self, column):
        self.colnum = colnum
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return np.delete(X, self.colnum, axis=1)
    
    def fit_transform(self, X,y=None):
        return self.fit(X,y).transform(X)

### Feature Engineering Pipeline

In [26]:
pd.set_option('display.max_columns', None)

feature_names = ['user_id','sex','ratio_fraud_sex','num_trans_sex',
                       'device_id','ratio_fraud_device_id','num_trans_device_id',
                       'age','ratio_fraud_age', 'num_trans_age',
                       'country','ratio_fraud_country','num_trans_country',
                       'browser','ratio_fraud_browser','num_trans_browser',
                       'source', 'ratio_fraud_source','num_trans_source',
                       'time_latency','purchase_value','class']

PL = Pipeline([
    ('simple_formating',simpleFormating('Analysis_dataset.csv')),
    ('features1',FeatureUnion([
        ('fraud_by_sex',calculateRatioFraud('sex')),
        ('fraud_by_device_id',calculateRatioFraud('device_id')),
        ('fraud_by_age',calculateRatioFraud('age')),
        ('fraud_by_country',calculateRatioFraud('country')),
        ('fraud_by_browser',calculateRatioFraud('browser')),
        ('fraud_by_source',calculateRatioFraud('source')),
        ('time_latency', calculateTimeLatency()),
        ('subsets', subsetFeatures())
    ])),
    ('select_features',customSelector([4,8,12,16,20,24,26])),   # all columns before encoding
    ('encode', MultiColumnLabelEncoder([1,10,13,16]))          # 'country', 'sex', 'browser', 'source'
    #('to_feature_df', concatToFinalFeatures([1,10,13,16]))
])


product = PL.fit_transform('Analysis_dataset.csv') # 33 columns including user_id and device_id
feature_df = pd.DataFrame(product).set_axis(feature_names, axis=1, inplace=False).set_index(['user_id', 'device_id'])
feature_df = feature_df.astype('float')
feature_df

Unnamed: 0_level_0,Unnamed: 1_level_0,sex,ratio_fraud_sex,num_trans_sex,ratio_fraud_device_id,num_trans_device_id,age,ratio_fraud_age,num_trans_age,country,ratio_fraud_country,num_trans_country,browser,ratio_fraud_browser,num_trans_browser,source,ratio_fraud_source,num_trans_source,time_latency,purchase_value,class
user_id,device_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
285108,HZAKVUFTDOSFD,1.0,0.095442,70126.0,0.0,2.0,49.0,0.056534,1079.0,167.0,0.096830,46184.0,0.0,0.099441,48652.0,1.0,0.105643,24242.0,1377.683333,31.0,0.0
328855,HZAKVUFTDOSFD,1.0,0.095442,70126.0,0.0,2.0,49.0,0.056534,1079.0,167.0,0.096830,46184.0,0.0,0.099441,48652.0,1.0,0.105643,24242.0,1888.400000,31.0,0.0
229053,XGQAJSOUJIZCC,1.0,0.095442,70126.0,0.0,1.0,49.0,0.056534,1079.0,167.0,0.096830,46184.0,0.0,0.099441,48652.0,1.0,0.105643,24242.0,611.483333,16.0,0.0
108439,VCCTAYDCWKZIY,1.0,0.095442,70126.0,0.0,1.0,49.0,0.056534,1079.0,167.0,0.096830,46184.0,0.0,0.099441,48652.0,1.0,0.105643,24242.0,44.883333,29.0,0.0
70424,MFFIHYNXCJLEY,1.0,0.095442,70126.0,0.0,1.0,49.0,0.056534,1079.0,167.0,0.096830,46184.0,0.0,0.099441,48652.0,1.0,0.105643,24242.0,1433.250000,26.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224644,KRJEGKMJSDPUN,0.0,0.091671,49874.0,0.0,1.0,70.0,0.000000,1.0,113.0,0.085142,17418.0,3.0,0.091429,2975.0,0.0,0.091696,47461.0,2692.700000,46.0,0.0
399886,BKLQTHLSBBFNT,0.0,0.091671,49874.0,0.0,1.0,72.0,0.000000,2.0,113.0,0.085142,17418.0,3.0,0.091429,2975.0,0.0,0.091696,47461.0,1288.400000,44.0,0.0
389123,MQYOCEZHCTIIJ,0.0,0.091671,49874.0,0.0,1.0,72.0,0.000000,2.0,113.0,0.085142,17418.0,3.0,0.091429,2975.0,0.0,0.091696,47461.0,1267.433333,48.0,1.0
100771,KTXGKQGOVLTAR,0.0,0.091671,49874.0,0.0,1.0,76.0,0.000000,1.0,113.0,0.085142,17418.0,3.0,0.091429,2975.0,0.0,0.091696,47461.0,1647.766667,24.0,0.0


### Unused classes