## RabattCorner

In [1]:
from fastai.tabular import *

### Dataset

Read dataset

In [2]:
PATH='./'

In [3]:
table_names = ['train_starting2018jan']

In [4]:
tables = [pd.read_csv(f'{PATH}{fname}.csv', low_memory=False) for fname in table_names]

In [5]:
df = tables[0]

In [6]:
df.head()

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2018,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2018,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2018,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2018,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2018,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,10,0.0


### Remove NAN values

Next we'll fill in missing values to avoid complications with NA's. NA (not available) is how Pandas indicates missing values; many models have problems when missing values are present, so it's always important to think about how to deal with them. In these cases, we are picking an arbitrary signal value that doesn't otherwise appear in the data.

In [7]:
df['postal0'] = df.postal0.fillna(-1)
df['postal1'] = df.postal1.fillna(-1)
df['postal2'] = df.postal2.fillna(-1)
df['postal3'] = df.postal3.fillna(-1)
df['gender'] = df.gender.fillna(0)
df['birthday'] = df.birthday.fillna(1000)

In [8]:
df.isnull().values.any()

False

In [9]:
dep_var = 'sum_amount'
cat_names = ['start_year', 'start_month', 'user_id', 'signup_year', 'signup_month', 'postal0', 'postal1', 'postal2', 'postal3', 'gender', 'language', 'birthday', 'partner_id']
cont_names = ['previous_visit_count']
procs = [FillMissing, Categorify, Normalize]

In [10]:
len(df)

28385490

Validation set

In [11]:
df.iloc[24330420:26357955]

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
24330420,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
24330421,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
24330422,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
24330423,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
24330424,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0
24330425,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,26,0,0.0
24330426,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,31,0,0.0
24330427,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,32,0,0.0
24330428,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,34,2,0.0
24330429,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,46,0,0.0


Test set:

In [12]:
df.iloc[26357955:]

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
26357955,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
26357956,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
26357957,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
26357958,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
26357959,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0
26357960,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,26,0,0.0
26357961,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,31,0,0.0
26357962,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,32,0,0.0
26357963,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,34,2,0.0
26357964,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,46,0,0.0


In [13]:
val_idx = list(range(24330420, 26357955))

In [14]:
val_idx[0]

24330420

In [15]:
val_idx[-1]

26357954

In [16]:
df.iloc[val_idx]

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
24330420,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
24330421,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
24330422,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
24330423,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
24330424,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0
24330425,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,26,0,0.0
24330426,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,31,0,0.0
24330427,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,32,0,0.0
24330428,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,34,2,0.0
24330429,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,46,0,0.0


In [17]:
test = TabularList.from_df(df.iloc[26357955:].copy(), path=PATH, cat_names=cat_names, cont_names=cont_names)

In [18]:
data = (TabularList.from_df(df, path=PATH, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(val_idx)
                           .label_from_df(cols=dep_var)
                           .add_test(test)
                           .databunch())

In [19]:
data.show_batch(rows=10)

start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,target
2018,8,11974,2016,12,-1.0,-1.0,-1.0,-1.0,2.0,1,1000.0,637,-0.0718,0.0
2018,5,12712,2017,1,8.0,3.0,7.0,2.0,1.0,1,1000.0,291,-0.0718,0.0
2018,10,8146,2016,3,5.0,0.0,2.0,3.0,2.0,1,1000.0,470,-0.0718,0.0
2018,1,26658,2018,1,-1.0,-1.0,-1.0,-1.0,0.0,1,1000.0,664,-0.0718,0.0
2019,2,4087,2015,3,6.0,3.0,7.0,0.0,2.0,1,1000.0,343,-0.0718,0.0
2018,5,16153,2017,4,8.0,1.0,2.0,5.0,2.0,1,1000.0,143,-0.0718,0.0
2018,8,7734,2016,2,4.0,1.0,5.0,3.0,1.0,1,1000.0,618,-0.0718,0.0
2018,6,19507,2017,6,8.0,8.0,5.0,6.0,1.0,1,1981.0,317,-0.0718,0.0
2018,11,2050,2013,11,8.0,3.0,6.0,0.0,1.0,1,1000.0,145,-0.0718,0.0
2018,11,21471,2017,11,-1.0,-1.0,-1.0,-1.0,0.0,1,1000.0,729,-0.0718,0.0


In [23]:
#learn = tabular_learner(data, layers=[200,100], metrics=accuracy)
learn = tabular_learner(data, layers=[200,100])

In [24]:
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,time
0,3.971291,11.784896,1:02:19


In [25]:
row = df.iloc[0]

In [26]:
row

start_year              2018.0
start_month                1.0
user_id                    1.0
signup_year             2012.0
signup_month              12.0
postal0                    8.0
postal1                    0.0
postal2                    5.0
postal3                    5.0
gender                     1.0
language                   1.0
birthday                1984.0
partner_id                 7.0
previous_visit_count       2.0
sum_amount                 0.0
Name: 0, dtype: float64

In [27]:
learn.predict(row)

(FloatItem [0.033429], tensor([0.0334]), tensor([0.0334]))

In [30]:
df.iloc[28384949]

start_year               2019.0
start_month                 2.0
user_id                 44146.0
signup_year              2018.0
signup_month               11.0
postal0                    -1.0
postal1                    -1.0
postal2                    -1.0
postal3                    -1.0
gender                      2.0
language                    1.0
birthday                 1000.0
partner_id                269.0
previous_visit_count        0.0
sum_amount                  0.0
Name: 28384949, dtype: float64

In [31]:
row = df.iloc[-1]

In [32]:
row

start_year               2019.0
start_month                 2.0
user_id                 44208.0
signup_year              2018.0
signup_month               11.0
postal0                     3.0
postal1                     8.0
postal2                     0.0
postal3                     0.0
gender                      1.0
language                    1.0
birthday                 1945.0
partner_id                869.0
previous_visit_count        0.0
sum_amount                  0.0
Name: 28385489, dtype: float64

In [33]:
learn.predict(row)

(FloatItem [0.065716], tensor([0.0657]), tensor([0.0657]))

In [41]:
row = df.iloc[28384946]

In [42]:
row

start_year               2019.00
start_month                 2.00
user_id                 44146.00
signup_year              2018.00
signup_month               11.00
postal0                    -1.00
postal1                    -1.00
postal2                    -1.00
postal3                    -1.00
gender                      2.00
language                    1.00
birthday                 1000.00
partner_id                261.00
previous_visit_count        0.00
sum_amount                 98.21
Name: 28384946, dtype: float64

In [43]:
learn.predict(row)

(FloatItem [-0.036665], tensor([-0.0367]), tensor([-0.0367]))

In [46]:
row = df.iloc[28353871]

In [47]:
row

start_year               2019.00
start_month                 2.00
user_id                 41551.00
signup_year              2018.00
signup_month               10.00
postal0                     8.00
postal1                     1.00
postal2                     7.00
postal3                     3.00
gender                      2.00
language                    1.00
birthday                 1966.00
partner_id                786.00
previous_visit_count        9.00
sum_amount                 25.93
Name: 28353871, dtype: float64

In [48]:
learn.predict(row)

(FloatItem [0.286134], tensor([0.2861]), tensor([0.2861]))

In [51]:
row = df.iloc[28026399]

In [52]:
row

start_year               2019.00
start_month                 2.00
user_id                 30548.00
signup_year              2018.00
signup_month                2.00
postal0                     3.00
postal1                     1.00
postal2                     2.00
postal3                     3.00
gender                      1.00
language                    1.00
birthday                 1000.00
partner_id                747.00
previous_visit_count        0.00
sum_amount                164.61
Name: 28026399, dtype: float64

In [53]:
learn.predict(row)

(FloatItem [0.193161], tensor([0.1932]), tensor([0.1932]))

In [54]:
row = df.iloc[28026398]

In [55]:
row

start_year               2019.0
start_month                 2.0
user_id                 30548.0
signup_year              2018.0
signup_month                2.0
postal0                     3.0
postal1                     1.0
postal2                     2.0
postal3                     3.0
gender                      1.0
language                    1.0
birthday                 1000.0
partner_id                744.0
previous_visit_count        0.0
sum_amount                  0.0
Name: 28026398, dtype: float64

In [56]:
learn.predict(row)

(FloatItem [0.013558], tensor([0.0136]), tensor([0.0136]))