In [1]:
import pandas as pd
import numpy as np

# sklearn
from sklearn import metrics
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold

import warnings
warnings.filterwarnings('ignore')

#### Using original Train data

In [5]:
df = pd.read_csv('../data/Train.csv')
df_test = pd.read_csv('../data/Test.csv')
sample_submission = pd.read_csv('../data/SampleSubmission.csv')
df.head(2)


Unnamed: 0,MERCHANT_CATEGORIZED_AT,MERCHANT_NAME,MERCHANT_CATEGORIZED_AS,PURCHASE_VALUE,PURCHASED_AT,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY,USER_AGE,USER_GENDER,USER_HOUSEHOLD,USER_INCOME,USER_ID,Transaction_ID
0,2022-05-04 10:25:50.588042+00,UONSDA CHURCH MOGERE MWAYO TITHE,Rent / Mortgage,1700,2022-05-04 13:56:00+00,False,,Male,3,10000,ID_ZX4DCF4K,ID_04mk78fa
1,2021-10-25 16:18:38.586837+00,PARK N GO,Transport & Fuel,100,2021-10-24 14:12:00+00,False,25.0,Female,4,90000,ID_U9WZMGJZ,ID_04xkfb07


In [6]:
df_test.head(2)

Unnamed: 0,MERCHANT_CATEGORIZED_AT,MERCHANT_NAME,PURCHASE_VALUE,PURCHASED_AT,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY,USER_AGE,USER_GENDER,USER_HOUSEHOLD,USER_INCOME,USER_ID,Transaction_ID
0,2022-06-01 10:25:16.7131+00,KCB PAYBILL AC,150000,2022-05-05 08:29:00+00,True,,Male,5,150000,ID_O8P8YS18,ID_00x9h2yx
1,2022-03-16 13:05:51.851102+00,IPAY LTD,7394,2019-10-05 16:02:00+00,False,,Female,1,10000,ID_40L9OTIM,ID_01db594f


In [7]:
sample_submission.head(2)

Unnamed: 0,Transaction_ID,Bills & Fees,Data & WiFi,Education,Emergency fund,Family & Friends,Going out,Groceries,Health,Loan Repayment,Miscellaneous,Rent / Mortgage,Shopping,Transport & Fuel
0,ID_00x9h2yx,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ID_01db594f,0,0,0,0,0,0,0,0,0,0,0,0,0


##### Dealing with missing values.
Recall there are some missing values in the Train csv file. 


In [8]:
# We'll fill the missing values with the median just as we did in the eda notebook.
df_median = np.median(df['USER_AGE'].dropna())
df['USER_AGE'] = df['USER_AGE'].fillna(df_median)

dftest_median = np.median(df_test['USER_AGE'].dropna())
df_test['USER_AGE'] = df_test['USER_AGE'].fillna(dftest_median)

In [9]:
# dealing with missing values for USER_GENDER
# We'll fill in these values using the MALE gender as it is more common
df["USER_GENDER"] = df["USER_GENDER"].apply(lambda x: "Male" if pd.isna(x) else x)
df_test["USER_GENDER"] = df_test["USER_GENDER"].apply(lambda x: "Male" if pd.isna(x) else x)

In [10]:
use_cols = [c for c in df.columns if c not in ('MERCHANT_CATEGORIZED_AS', 'kfold', 'USER_ID', 'Transaction_ID', 'PURCHASED_AT', 'MERCHANT_CATEGORIZED_AT')]
obj = [c for c in use_cols if df[c].dtype == 'object']
num = ['USER_INCOME', 'PURCHASE_VALUE']


In [11]:
y = df.MERCHANT_CATEGORIZED_AS
X = df.drop('MERCHANT_CATEGORIZED_AS', axis=1)

In [12]:
# label encoding
le = LabelEncoder()
y = le.fit_transform(y)

In [13]:
le.classes_

array(['Bills & Fees', 'Data & WiFi', 'Education', 'Emergency fund',
       'Family & Friends', 'Going out', 'Groceries', 'Health',
       'Loan Repayment', 'Miscellaneous', 'Rent / Mortgage', 'Shopping',
       'Transport & Fuel'], dtype=object)

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1)


In [15]:
x_train = X_train[use_cols].copy()
x_valid = X_valid[use_cols].copy()

In [16]:
x_train.head(2)

Unnamed: 0,MERCHANT_NAME,PURCHASE_VALUE,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY,USER_AGE,USER_GENDER,USER_HOUSEHOLD,USER_INCOME
140,FATUMA OSORE,90,False,25.0,Male,3,10000
303,FAMILY BANK PESA PAP,10000,True,25.0,Male,3,200000


function for comparing all different approaches we'll use.

In [17]:
def score_dataset(x_train, x_valid, y_train, y_valid):
    # train
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs = -1)
    model.fit(x_train,y_train)
    preds = model.predict(x_valid)
    print(metrics.log_loss(y_valid, preds, labels=le.classes_))
    return metrics.log_loss(y_valid, preds)
    

In [27]:
# copying the data to avoid changing the data
xtrain = x_train.copy()
xvalid = x_valid.copy()

# Applying ordinal encoder on each categorical column
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=9999)
xtrain[obj] = ord_enc.fit_transform(xtrain[obj])
xvalid[obj] = ord_enc.transform(xvalid[obj])

# get a score
# print(score_dataset(xtrain, xvalid, y_train, y_valid))

In [28]:
model = RandomForestClassifier()
model.fit(xtrain, y_train)
preds = model.predict_proba(xvalid)
print(metrics.log_loss(y_valid, preds, labels=le.classes_))

0.0


In [30]:
# applying model above to our test
df_test = df_test[use_cols].copy()
# df_test[obj] = ord_enc.transform(df_test[obj])
test_preds = model.predict_proba(df_test)
test_preds

array([[0.19 , 0.01 , 0.01 , ..., 0.   , 0.05 , 0.   ],
       [0.15 , 0.   , 0.   , ..., 0.   , 0.01 , 0.   ],
       [0.04 , 0.   , 0.   , ..., 0.   , 0.04 , 0.075],
       ...,
       [0.19 , 0.15 , 0.   , ..., 0.   , 0.   , 0.17 ],
       [0.34 , 0.   , 0.   , ..., 0.01 , 0.01 , 0.   ],
       [0.   , 0.24 , 0.   , ..., 0.   , 0.04 , 0.01 ]])

In [31]:
sample_submission[['Bills & Fees', 'Data & WiFi', 'Education',
       'Emergency fund', 'Family & Friends', 'Going out', 'Groceries',
       'Health', 'Loan Repayment', 'Miscellaneous', 'Rent / Mortgage',
       'Shopping', 'Transport & Fuel']]= test_preds

In [32]:
sample_submission.to_csv('../data/submissions/ss_baseline_org.csv', index=False)

Our model with no feature engineering got a score of 3.546501152092195