## Initial imports

In [9]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to library source are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [10]:
from fastai.structured import *
from fastai.column_data import *
# Abbreviate printed arrays: anything longer than 50 elements is summarised,
# showing 20 items at each edge.
np.set_printoptions(threshold=50, edgeitems=20)

# Directory holding the input CSVs. The trailing slash matters: file names are
# appended to this string directly when the tables are read.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
PATH='/home/paperspace/data/rc/'

## Dataset

In [11]:
# Base names (without the .csv extension) of the files to load from PATH.
table_names = ['train', 'validation']

In [12]:
# Read one DataFrame per entry of table_names, in the same order.
# low_memory=False makes pandas parse each file in a single pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
tables = [pd.read_csv(f'{PATH}{fname}.csv', low_memory=False) for fname in table_names]

In [13]:
from IPython.display import HTML, display

In [14]:
# Show the first rows of every loaded table as a quick sanity check.
for t in tables: display(t.head())

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,8,0.0


Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0


In [15]:
# Unpack in table_names order: 'train' first, then 'validation'
# (the validation table is referred to as `test` from here on).
train, test = tables

In [16]:
# First five rows of the training table.
train.head(5)

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,8,0.0


In [18]:
# Last five rows of the training table (also reveals the total row count via the index).
train.tail(5)

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
48660835,2018,12,44208,2018,11,3.0,8.0,0.0,0.0,1.0,1,1945.0,861,0,0.0
48660836,2018,12,44208,2018,11,3.0,8.0,0.0,0.0,1.0,1,1945.0,862,0,0.0
48660837,2018,12,44208,2018,11,3.0,8.0,0.0,0.0,1.0,1,1945.0,864,0,0.0
48660838,2018,12,44208,2018,11,3.0,8.0,0.0,0.0,1.0,1,1945.0,868,0,0.0
48660839,2018,12,44208,2018,11,3.0,8.0,0.0,0.0,1.0,1,1945.0,869,0,0.0


In [19]:
# First five rows of the validation table (held in `test`).
test.head(5)

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0


## Remove NAN values

Next we'll fill in missing values to avoid complications with NA's. NA (not available) is how Pandas indicates missing values; many models have problems when missing values are present, so it's always important to think about how to deal with them. In these cases, we are picking an arbitrary signal value that doesn't otherwise appear in the data.

In [21]:
# Sentinel used for each column that may contain missing values:
# -1 for the four postal digits, 0 for gender, 1000 for birthday.
NA_FILL_VALUES = (
    ('postal0', -1),
    ('postal1', -1),
    ('postal2', -1),
    ('postal3', -1),
    ('gender', 0),
    ('birthday', 1000),
)

# Apply the same fills to both frames, validation/test first and then train.
for df in (test, train):
    for column, fill_value in NA_FILL_VALUES:
        df[column] = df[column].fillna(fill_value)

In [22]:
# True if any cell of the validation/test frame is still missing after the fills.
test.isnull().values.any()

False

In [23]:
# True if any cell of the training frame is still missing after the fills.
train.isnull().values.any()

False

In [24]:
# Columns treated as categorical (converted to pandas categories further down).
cat_vars = [
    'start_year', 'start_month',
    'user_id',
    'signup_year', 'signup_month',
    'postal0', 'postal1', 'postal2', 'postal3',
    'gender', 'language', 'birthday',
    'partner_id',
]

# Columns treated as continuous (cast to float32 further down).
contin_vars = [
    'previous_visit_count',
]

In [25]:
# Name of the dependent (target) column.
# NOTE(review): the proc_df calls further down pass the literal 'sum_amount'
# instead of this variable — keep the two in sync.
dep = 'sum_amount'

In [26]:
# Turn every categorical column of the training frame into an ordered
# pandas category, one column at a time.
for v in cat_vars:
    as_category = train[v].astype('category')
    train[v] = as_category.cat.as_ordered()

In [27]:
# Give the validation/test frame the category definitions established on train
# (presumably so both frames share identical category codes — verify against fastai's apply_cats).
# NOTE(review): in the processed test frame displayed later, start_year is 0 for the
# 2019 rows, presumably because 2019 never occurs in train and is therefore treated
# as missing — confirm this is intended for a time-based hold-out.
apply_cats(test, train)

In [28]:
# Continuous columns: replace missing values with 0 and store as float32,
# handling the training frame first and then the validation/test frame.
for v in contin_vars:
    for frame in (train, test):
        frame[v] = frame[v].fillna(0).astype('float32')

In [29]:
# Inspect the training frame after the categorical/continuous conversions.
train.head()

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2.0,0.0
1,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0.0,0.0
2,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1.0,0.0
3,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0.0,0.0
4,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,8.0,0.0


In [30]:
# Row count of the full training frame; the trailing `n` echoes it as the cell output.
n = len(train); n

48660840

Run this to run on a sample:

In [31]:
#idxs = get_cv_idxs(n, val_pct=150000/n)
#train_samp = train.iloc[idxs]
#samp_size = len(train_samp); samp_size

Run this to run on the full dataset:

In [32]:
#samp_size = n
#train_samp = train

In [33]:
#train_samp.head(5)

In [34]:
# Split the training frame into model inputs `df` and the target `y`; `nas` and
# `mapper` are kept so the same preprocessing can be replayed on the test frame.
# With do_scale=True the continuous column appears standardised in the df.head() output.
df, y, nas, mapper = proc_df(train, 'sum_amount', do_scale=True)
# NOTE(review): the rows displayed earlier all have sum_amount == 0.0, and np.log(0)
# is -inf, so `yl` presumably contains -inf entries. If confirmed, switch to a
# zero-safe transform such as np.log1p here and np.expm1 in inv_y — the two must
# be changed together.
yl = np.log(y)

  


In [35]:
# Process the validation/test frame, passing in the `mapper` and `nas` obtained from
# the training call (presumably so the same scaling and NA handling are applied —
# verify against fastai's proc_df). The target is discarded (`_`).
# Note that `nas` and `mapper` are rebound to the values returned by this call.
df_test, _, nas, mapper = proc_df(test, 'sum_amount', do_scale=True,
                                  mapper=mapper, na_dict=nas)

In [36]:
# Processed training inputs: categoricals as integer codes, continuous column scaled.
df.head(5)

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count
0,1,1,1,1,12,9,2,7,7,2,1,48,1,1.853765
1,1,1,1,1,12,9,2,7,7,2,1,48,2,-0.059458
2,1,1,1,1,12,9,2,7,7,2,1,48,3,0.897153
3,1,1,1,1,12,9,2,7,7,2,1,48,4,-0.059458
4,1,1,1,1,12,9,2,7,7,2,1,48,5,7.593433


In [37]:
# Processed validation/test inputs, for comparison with the training frame above.
df_test.head(5)

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count
0,0,1,1,1,12,9,2,7,7,2,1,48,1,1.853765
1,0,1,1,1,12,9,2,7,7,2,1,48,2,-0.059458
2,0,1,1,1,12,9,2,7,7,2,1,48,3,0.897153
3,0,1,1,1,12,9,2,7,7,2,1,48,4,-0.059458
4,0,1,1,1,12,9,2,7,7,2,1,48,5,10.463267


Validation set indexes

In [45]:
# Earlier ratio-based split, kept for reference; it relies on `samp_size`,
# which is only defined by the (commented-out) sampling cells.
#train_ratio = 0.75
# train_ratio = 0.9
#train_size = int(samp_size * train_ratio); train_size
#val_idx = list(range(train_size, len(df)))


# Hold out the trailing rows of the processed training frame for validation:
# as many rows as the validation table has (assumes len(df) == len(train) — TODO confirm).
train_size = len(train) - len(test)
val_idx = list(range(train_size, len(df)))

In [46]:
# NOTE(review): this echoes the entire index list; showing val_idx[:5] together
# with len(val_idx) would keep the notebook output readable.
val_idx

[46633305,
 46633306,
 46633307,
 46633308,
 46633309,
 46633310,
 46633311,
 46633312,
 46633313,
 46633314,
 46633315,
 46633316,
 46633317,
 46633318,
 46633319,
 46633320,
 46633321,
 46633322,
 46633323,
 46633324,
 46633325,
 46633326,
 46633327,
 46633328,
 46633329,
 46633330,
 46633331,
 46633332,
 46633333,
 46633334,
 46633335,
 46633336,
 46633337,
 46633338,
 46633339,
 46633340,
 46633341,
 46633342,
 46633343,
 46633344,
 46633345,
 46633346,
 46633347,
 46633348,
 46633349,
 46633350,
 46633351,
 46633352,
 46633353,
 46633354,
 46633355,
 46633356,
 46633357,
 46633358,
 46633359,
 46633360,
 46633361,
 46633362,
 46633363,
 46633364,
 46633365,
 46633366,
 46633367,
 46633368,
 46633369,
 46633370,
 46633371,
 46633372,
 46633373,
 46633374,
 46633375,
 46633376,
 46633377,
 46633378,
 46633379,
 46633380,
 46633381,
 46633382,
 46633383,
 46633384,
 46633385,
 46633386,
 46633387,
 46633388,
 46633389,
 46633390,
 46633391,
 46633392,
 46633393,
 46633394,
 46633395,

## DL

We're ready to put together our models.

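Root-mean-squared percent error (RMSPE) is the metric Kaggle used for this competition: $\sqrt{\operatorname{mean}\left(\left((y - \hat{y})/y\right)^2\right)}$, evaluated on the original scale, i.e. after undoing the log transform of the target and of the predictions.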

In [28]:
def inv_y(a):
    """Undo the log transform applied to the target: returns ``exp(a)``."""
    return np.exp(a)

def exp_rmspe(y_pred, targ):
    """Root-mean-squared percentage error on the original (un-logged) scale.

    Args:
        y_pred: predictions on the log scale.
        targ: targets on the log scale (``-inf`` encodes an original value of 0).

    Returns:
        ``sqrt(mean(((t - p) / t) ** 2))`` where ``t`` and ``p`` are the
        back-transformed targets and predictions, as a Python float.

    Entries whose back-transformed target is exactly 0 are excluded: the
    percentage error is undefined there, and a single such entry used to turn
    the whole metric into ``inf``/``nan`` through a division by zero. If no
    entry has a non-zero target the metric is undefined and ``nan`` is returned.
    """
    # Broadcast first so one boolean mask can index both arrays consistently.
    targ, pred = np.broadcast_arrays(inv_y(targ), inv_y(y_pred))
    nonzero = targ != 0
    if not nonzero.any():
        return float('nan')
    pct_var = (targ[nonzero] - pred[nonzero]) / targ[nonzero]
    return math.sqrt((pct_var**2).mean())

# Largest value of the log-transformed training target.
max_log_y = np.max(yl)
# Output range with 20% headroom above that maximum
# (presumably passed to the learner to bound its predictions — verify at the call site).
y_range = (0, max_log_y*1.2)