In [1]:
# !pip install xgboost

In [2]:
import pandas as pd
import numpy as np

# sklearn
from sklearn import metrics
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold

# models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# misc
import os
import math
import gc

In [14]:
# Load the four CSV inputs from ../data, in order: the training folds,
# the extra training data, the test set, and the sample submission file.
train, extra_train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_folds.csv',
        '../data/extra_data.csv',
        '../data/test_eda.csv',
        '../data/SampleSubmission.csv',
    )
)

In [15]:
# Remove the 'Unnamed: 0' column (the name pandas gives an unlabeled CSV
# column, typically an index that was saved to disk) from the test frame.
# `errors='ignore'` makes the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError. Rebinding instead of
# `inplace=True` leaves the originally loaded frame untouched.
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [16]:
# Remove the 'Unnamed: 0' column (the name pandas gives an unlabeled CSV
# column, typically an index that was saved to disk) from the training frame.
# `errors='ignore'` makes the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError. Rebinding instead of
# `inplace=True` leaves the originally loaded frame untouched.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

In [17]:
# Preview the first two rows of the training frame.
train.head(n=2)

Unnamed: 0,MERCHANT_CATEGORIZED_AT,MERCHANT_NAME,MERCHANT_CATEGORIZED_AS,PURCHASE_VALUE,PURCHASED_AT,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY,USER_AGE,USER_GENDER,USER_HOUSEHOLD,USER_INCOME,USER_ID,Transaction_ID,PURCHASEDAT_MONTH,PURCHASEDAT_DAY,PURCHASEDAT_HOUR,PURCHASEDAT_MINUTE,PURCHASEDAT_DAYOFWEEK,PAID_AT,kfold
0,2022-05-04 10:25:50.588042+00:00,UONSDA CHURCH MOGERE MWAYO TITHE,Rent / Mortgage,1700,2022-05-04 13:56:00+00:00,False,25,Male,3,10000,ID_ZX4DCF4K,ID_04mk78fa,5,4,13,56,2,afternoon,1
1,2021-10-25 16:18:38.586837+00:00,PARK N GO,Transport & Fuel,100,2021-10-24 14:12:00+00:00,False,25,Female,4,90000,ID_U9WZMGJZ,ID_04xkfb07,10,24,14,12,6,afternoon,9


In [18]:
# Preview the first two rows of the test frame.
test.head(n=2)

Unnamed: 0,MERCHANT_CATEGORIZED_AT,MERCHANT_NAME,PURCHASE_VALUE,PURCHASED_AT,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY,USER_AGE,USER_GENDER,USER_HOUSEHOLD,USER_INCOME,USER_ID,Transaction_ID
0,2022-06-01 10:25:16.7131+00,KCB PAYBILL AC,150000,2022-05-05 08:29:00+00,True,,Male,5,150000,ID_O8P8YS18,ID_00x9h2yx
1,2022-03-16 13:05:51.851102+00,IPAY LTD,7394,2019-10-05 16:02:00+00,False,,Female,1,10000,ID_40L9OTIM,ID_01db594f


In [20]:
# Show every training row whose merchant is 'IPAY LTD' (a merchant that also
# appears in the test preview), selected with a boolean mask.
train.loc[train['MERCHANT_NAME'].eq('IPAY LTD')]

Unnamed: 0,MERCHANT_CATEGORIZED_AT,MERCHANT_NAME,MERCHANT_CATEGORIZED_AS,PURCHASE_VALUE,PURCHASED_AT,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY,USER_AGE,USER_GENDER,USER_HOUSEHOLD,USER_INCOME,USER_ID,Transaction_ID,PURCHASEDAT_MONTH,PURCHASEDAT_DAY,PURCHASEDAT_HOUR,PURCHASEDAT_MINUTE,PURCHASEDAT_DAYOFWEEK,PAID_AT,kfold
64,2022-03-16 13:08:19.703288+00:00,IPAY LTD,Going out,7394,2019-10-05 16:02:00+00:00,False,25,Female,1,10000,ID_40L9OTIM,ID_71xbqh8j,10,5,16,2,5,afternoon,8


In [23]:
# Build a per-merchant summary table: one row per MERCHANT_NAME with
#   - MERCHANT_CATEGORIZED_AS: the minimum category value in the group,
#   - PAID_AT: the maximum PAID_AT value in the group,
#   - PURCHASE_VALUE: the mean purchase value in the group.
# NOTE(review): on string columns 'min'/'max' select the lexicographically
# first/last value, not the most frequent one. If the intent is "the category /
# time of day at which most purchases are made", a mode-based aggregation would
# be needed -- confirm before relying on these two columns.
merchants_data = (
    train
    .groupby(['MERCHANT_NAME'], as_index=False)
    .agg(
        MERCHANT_CATEGORIZED_AS=('MERCHANT_CATEGORIZED_AS', 'min'),
        PAID_AT=('PAID_AT', 'max'),
        PURCHASE_VALUE=('PURCHASE_VALUE', 'mean'),
    )
)

In [24]:
# Display the first five rows of the training frame (the default row count).
train.head(n=5)

Unnamed: 0,MERCHANT_CATEGORIZED_AT,MERCHANT_NAME,MERCHANT_CATEGORIZED_AS,PURCHASE_VALUE,PURCHASED_AT,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY,USER_AGE,USER_GENDER,USER_HOUSEHOLD,USER_INCOME,USER_ID,Transaction_ID,PURCHASEDAT_MONTH,PURCHASEDAT_DAY,PURCHASEDAT_HOUR,PURCHASEDAT_MINUTE,PURCHASEDAT_DAYOFWEEK,PAID_AT,kfold
0,2022-05-04 10:25:50.588042+00:00,UONSDA CHURCH MOGERE MWAYO TITHE,Rent / Mortgage,1700,2022-05-04 13:56:00+00:00,False,25,Male,3,10000,ID_ZX4DCF4K,ID_04mk78fa,5,4,13,56,2,afternoon,1
1,2021-10-25 16:18:38.586837+00:00,PARK N GO,Transport & Fuel,100,2021-10-24 14:12:00+00:00,False,25,Female,4,90000,ID_U9WZMGJZ,ID_04xkfb07,10,24,14,12,6,afternoon,9
2,2022-05-20 14:17:30.917297+00:00,SAFARICOM OFFERS TUNUKIWA,Data & WiFi,20,2022-05-27 12:46:00+00:00,False,25,Male,3,10000,ID_ZX4DCF4K,ID_051urwuw,5,27,12,46,4,afternoon,6
3,2022-04-12 15:59:14.139347+00:00,ZILLIONS CREDIT LIMITED,Loan Repayment,3800,2022-02-07 12:16:00+00:00,False,25,Male,5,252000,ID_3JA0MAFB,ID_0e1om7rz,2,7,12,16,0,afternoon,8
4,2022-05-04 10:24:26.709045+00:00,MICHAEL MUSEMBI,Bills & Fees,60,2022-05-04 15:28:00+00:00,False,25,Male,3,10000,ID_ZX4DCF4K,ID_0kfcoawb,5,4,15,28,2,afternoon,2


In [None]:
# creating age and family relationships