# NOVO BANCO - Data Science Challenge
---
---

Part II - In this part, we will deep dive into modelling the problem.

In [2]:
import catboost
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer

import xlearn as xl

In [3]:
# Read the featurized export and discard its leftover `_c0` column.
featurized_csv = "../data/featurized-data/part-00000-f7657991-b75e-4f7e-a808-d3635d9d5028-c000.csv"
df = pd.read_csv(featurized_csv).drop(columns=["_c0"])

In [4]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(500000, 38)

In [5]:
# Preview the first rows to eyeball the raw and engineered columns.
df.head()

Unnamed: 0,hour_of_day,place_category,banner_device,user,id,click,hour,C1,banner_pos,site_id,...,day_of_week,site_list,app_list,place_id,device_site,place_domain,banner_device_click_count,place_category_click_count,hour_of_day_click_count,user_hour_count
0,12,5378d02807d7df22,1,5b1faad59799c86d,7989800953858054144,0,2014-10-21T12:00:00.000+01:00,1005,0,c7fe2ca6,...,Tuesday,c7fe2ca6,ecad2386,c7fe2ca6ecad2386,a99f214ac7fe2ca6,c637fa9e7801e8d9,333118,4,27609,25121
1,12,50e219e009481d60,1,4f93c170,2477345577451911680,0,2014-10-28T12:00:00.000Z,1005,0,85f751fd,...,Tuesday,85f751fd,793e01a6,85f751fd793e01a6,4f93c17085f751fd,c4e18dd62347f47a,333118,700,27609,25121
2,12,50e219e009481d60,1,9f221a98,13014468333848920064,0,2014-10-21T12:00:00.000+01:00,1005,0,85f751fd,...,Tuesday,85f751fd,197b4f7f,85f751fd197b4f7f,9f221a9885f751fd,c4e18dd62347f47a,333118,700,27609,25121
3,12,50e219e009481d60,1,f60c2fd3af62faf4,15212394161559316480,0,2014-10-21T12:00:00.000+01:00,1005,0,85f751fd,...,Tuesday,"85f751fd,57fe1b20,85f751fd,a7853007,a7853007,a...","7d1d81cf,ecad2386,de97da65,ecad2386,ecad2386,e...",85f751fd7d1d81cf,a99f214a85f751fd,c4e18dd633da2e74,333118,700,27609,25121
4,12,50e219e009481d60,1,3bb1ddd76a31c752,4842744885012688896,0,2014-10-21T12:00:00.000+01:00,1005,0,85f751fd,...,Tuesday,"85f751fd,85f751fd,85f751fd,85f751fd,85f751fd,8...","3f2a6cbb,3f2a6cbb,3f2a6cbb,3f2a6cbb,7d1d81cf,3...",85f751fd7d1d81cf,a99f214a85f751fd,c4e18dd633da2e74,333118,700,27609,25121


In [7]:
# Cast every column to str so each one can be fed to the string-input feature hashers below.
# This also turns the `click` label into a string; the libffm writer converts it back with int().
df = df.astype(str)

In [8]:
# Confirm the cast: every column should now report dtype `object`.
df.dtypes

hour_of_day                   object
place_category                object
banner_device                 object
user                          object
id                            object
click                         object
hour                          object
C1                            object
banner_pos                    object
site_id                       object
site_domain                   object
site_category                 object
app_id                        object
app_domain                    object
app_category                  object
device_id                     object
device_ip                     object
device_model                  object
device_type                   object
device_conn_type              object
C14                           object
C15                           object
C16                           object
C17                           object
C18                           object
C19                           object
C20                           object
C

In [9]:
# Separate the prediction target (`click`) from the explanatory columns.
target_column = "click"
X = df.drop(columns=target_column)
Y = df[target_column]

In [10]:
# 60/20/20 split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% as the validation set. Both splits use the same fixed seed.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=1)

In [11]:
# Stand-alone hasher with 250 output features.
# NOTE(review): no later cell visible in this notebook references `hasher`; the per-column
# hashers built inside the ColumnTransformer are the ones actually used — confirm and remove.
hasher = FeatureHasher(n_features=250, input_type='string')

In [12]:
# Hash each original column independently into its own block of `hash_vector_size`
# features, so the transformed matrix has n_orig_features * hash_vector_size columns.
# NOTE(review): FeatureHasher(input_type='string') is documented to expect every sample
# to be an iterable of strings, while each cell here is a single string. Confirm against
# the installed scikit-learn version that values are hashed whole rather than character
# by character (or rejected outright).
hash_vector_size = 10
n_orig_features = X.shape[1]
column_hashers = [
    (f't_{i}', FeatureHasher(n_features=hash_vector_size, input_type='string'), i)
    for i in range(n_orig_features)
]
ct = ColumnTransformer(column_hashers)

In [14]:
# Fit the column hashers on the training split only, then reuse the fitted
# transformer unchanged for the validation and test splits.
X_train_hashed = ct.fit_transform(X_train)

X_val_hashed, X_test_hashed = (ct.transform(split) for split in (X_val, X_test))

In [15]:
# Wrap each hashed matrix in a DataFrame (fresh positional index and integer column labels).
X_train_df, X_val_df, X_test_df = (
    pd.DataFrame(data=hashed)
    for hashed in (X_train_hashed, X_val_hashed, X_test_hashed)
)

In [24]:
# The validation features and labels must have the same number of rows.
for split_part in (X_val_df, y_val):
    print(split_part.shape)

(100000, 370)
(100000,)


In [27]:
def transform_to_libffm_format(df, label, set_name, output_dir='../data/hashed_data'):
    """Write a feature frame and its labels to a libffm-style text file.

    Each output line has the form ``<label> <field>:<feature>:<value> ...``. The
    column position is used as both the field index and the feature index, and
    every column is written (zeros included).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; one output line is produced per row.
    label : pandas.Series
        Target values, matched to ``df`` by position (not by index). Each value
        must be convertible with ``int()``.
    set_name : str
        Name of the split; the file is written to ``<output_dir>/<set_name>_ffm.txt``.
    output_dir : str, optional
        Directory that receives the file. Defaults to the notebook's
        ``../data/hashed_data`` folder.

    Raises
    ------
    ValueError
        If ``df`` and ``label`` do not have the same number of rows.
    """
    n_rows = df.shape[0]
    if len(label) != n_rows:
        raise ValueError(
            'df has {0} rows but label has {1} entries'.format(n_rows, len(label))
        )

    out_path = '{0}/{1}_ffm.txt'.format(output_dir, set_name)
    feature_rows = df.to_numpy()
    label_values = label.to_numpy()

    with open(out_path, 'w') as f:
        # Iterate over *all* rows: the previous `range(n_rows - 1)` silently
        # dropped the last sample of every split.
        for row_values, y_data in zip(feature_rows, label_values):
            tokens = [str(int(y_data))]
            # .tolist() yields native Python scalars so values print as e.g. "1.0".
            tokens.extend(
                '{0}:{0}:{1}'.format(idx, value)
                for idx, value in enumerate(row_values.tolist())
            )
            f.write(' '.join(tokens) + '\n')

In [None]:
# Export the three splits to libffm text files, in train / validation / test order.
splits_to_export = (
    (X_train_df, y_train, "train"),
    (X_val_df, y_val, "validation"),
    (X_test_df, y_test, "test"),
)
for split_features, split_labels, split_name in splits_to_export:
    transform_to_libffm_format(split_features, split_labels, split_name)

In [None]:
# Train a field-aware factorization machine on the exported libffm files.
ffm_model = xl.create_ffm()

ffm_model.setTrain('../data/hashed_data/train_ffm.txt')
# The validation split is exported under the set name "validation", so the file on
# disk is validation_ffm.txt (the previous 'val_ffm.txt' path is never written).
ffm_model.setValidate('../data/hashed_data/validation_ffm.txt')

# Binary classification trained with AdaGrad; `lr` is the learning rate and
# `lambda` the regularization strength.
param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'opt':'adagrad'}

# Fit and persist the trained model next to the notebook.
ffm_model.fit(param, "./model.out")

```
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 4 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (../data/hashed_data/train_ffm.txt.bin) NOT found. Convert text file to binary file.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (../data/hashed_data/validation_ffm.txt.bin) NOT found. Convert text file to binary file.
[------------] Number of Feature: 370
[------------] Number of Field: 370
[------------] Time cost for reading problem: 24.29 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 4.18 MB
[------------] Time cost for model initial: 0.01 (sec)
[ ACTION     ] Start to train ...
[------------] Epoch      Train log_loss       Test log_loss     Time cost (sec)
[   10%      ]     1            0.447161            0.442559              378.41
[   20%      ]     2            0.446667            0.443344              786.56
[   30%      ]     3            0.446627            0.442743              787.46
[   40%      ]     4            0.446607            0.442987              790.77
[   50%      ]     5            0.446530            0.443004              781.02
[   60%      ]     6            0.446582            0.443627              797.63
[ ACTION     ] Early-stopping at epoch 1, best loss: 0.442559
[ ACTION     ] Start to save model ...
[------------] Model file: ./model.out
[------------] Time cost for saving model: 0.01 (sec)
[ ACTION     ] Finish training
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 4346.19 (sec)
```                        

In [None]:
# Run xLearn's built-in cross-validation with the same hyper-parameters as the fit above.
# It works on the training file registered via setTrain and does not save a model.
ffm_model.cv(param)

```
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[ WARNING    ] Cross-validation doesn't support early-stopping. xLearn has already close early-stopping.
[ WARNING    ] The --cv (cross-validation) has been set, and xLearn will not dump model checkpoint to disk.
[------------] xLearn uses 4 threads for training task.
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (../data/hashed_data/train_ffm.txt_0.bin) NOT found. Convert text file to binary file.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (../data/hashed_data/train_ffm.txt_1.bin) NOT found. Convert text file to binary file.
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (../data/hashed_data/train_ffm.txt_2.bin) NOT found. Convert text file to binary file.
[------------] Number of Feature: 370
[------------] Number of Field: 370
[------------] Time cost for reading problem: 24.44 (sec)
[ ACTION     ] Initialize model ...
[------------] Model size: 4.18 MB
[------------] Time cost for model initial: 0.01 (sec)
[ ACTION     ] Start to train ...
[ ACTION     ] Cross-validation: 1/3:
[------------] Epoch      Train log_loss       Test log_loss     Time cost (sec)
[   10%      ]     1            0.448196            0.445102               98.68
[   20%      ]     2            0.447608            0.444641              529.65
[   30%      ]     3            0.447557            0.444740              541.95
[   40%      ]     4            0.447543            0.445519              539.96
[   50%      ]     5            0.447465            0.444939              526.43
[   60%      ]     6            0.447491            0.444660              519.66
[   70%      ]     7            0.447424            0.444868              551.03
[   80%      ]     8            0.447436            0.444594              523.23
[   90%      ]     9            0.447461            0.444767              517.59
[  100%      ]    10            0.447436            0.444678              535.04
[ ACTION     ] Cross-validation: 2/3:
[------------] Epoch      Train log_loss       Test log_loss     Time cost (sec)
[   10%      ]     1            0.446959            0.447511              105.78
[   20%      ]     2            0.446394            0.447596              558.62
[   30%      ]     3            0.446346            0.447117              542.80
[   40%      ]     4            0.446242            0.447270              518.49
[   50%      ]     5            0.446251            0.447451              518.06
[   60%      ]     6            0.446201            0.447306              521.50
[   70%      ]     7            0.446196            0.447400              516.16
[   80%      ]     8            0.446211            0.447320              515.31
[   90%      ]     9            0.446224            0.447440              533.32
[  100%      ]    10            0.446274            0.444540              543.43
[ ACTION     ] Cross-validation: 3/3:
[------------] Epoch      Train log_loss       Test log_loss     Time cost (sec)
[   10%      ]     1            0.448939            0.447421              102.78
[   20%      ]     2            0.446645            0.447236              528.62
[   30%      ]     3            0.446449            0.447117              518.46
[   40%      ]     4            0.446542            0.447420              518.68
[   50%      ]     5            0.446551            0.447233              518.12
[   60%      ]     6            0.446332            0.447765              541.50
[   70%      ]     7            0.446139            0.447430              546.16
[   80%      ]     8            0.446121            0.447220              533.24
[   90%      ]     9            0.446321            0.447343              512.76
[  100%      ]    10            0.446124            0.444324              543.35
[------------] Average log_loss: 0.446967
[ ACTION     ] Finish Cross-Validation
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 15287 (sec)

```

In [4]:
# Score the held-out test file with the saved FFM model; the second argument of
# predict() is the path the predictions are written to.
# NOTE(review): the training cell saves the model to "./model.out", while this cell loads
# "../notebooks/model/model.out" — confirm both resolve to the same file before a fresh run.
ffm_model.setTest("../data/hashed_data/test_ffm.txt")
ffm_model.predict("../notebooks/model/model.out", "../data/hashed_data/output.txt")

```
----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[------------] xLearn uses 4 threads for prediction task.
[ ACTION     ] Load model ...
[------------] Load model from ../notebooks/model/model.out
[------------] Loss function: cross-entropy
[------------] Score function: ffm
[------------] Number of Feature: 370
[------------] Number of K: 4
[------------] Number of field: 370
[------------] Time cost for loading model: 0.01 (sec)
[ ACTION     ] Read Problem ...
[------------] First check if the text file has been already converted to binary format.
[------------] Binary file (../data/hashed_data/test_ffm.txt.bin) NOT found. Convert text file to binary file.
[------------] Time cost for reading problem: 6.40 (sec)
[ ACTION     ] Start to predict ...
[------------] The test loss is: 0.447106
[ ACTION     ] Clear the xLearn environment ...
[------------] Total time cost: 12.54 (sec)
```