## Initial imports

In [3]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
# Load (or reload) the autoreload extension, then ask it to re-import edited
# modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [4]:
from fastai.structured import *
from fastai.column_data import *
# Keep printed NumPy arrays short: arrays with more than 50 elements are
# summarised, showing 20 items at each edge.
np.set_printoptions(threshold=50, edgeitems=20)

# Directory holding the CSV files. The trailing slash is required because the
# file names are appended to this string directly when the tables are loaded.
PATH='/home/paperspace/data/rc/'

## Dataset

In [5]:
# Base names (without extension) of the CSV files to load. The order matters:
# the loaded tables are later unpacked positionally as test, train, validation.
table_names = ['test', 'train', 'validation']

In [6]:
# Read every CSV listed in table_names from PATH, keeping the same order.
# low_memory=False makes pandas read each file in one pass rather than in
# chunks, so column dtypes are inferred from the whole column.
tables = [
    pd.read_csv(f'{PATH}{name}.csv', low_memory=False)
    for name in table_names
]

In [7]:
from IPython.display import HTML, display

In [8]:
# Show the first rows of every loaded table, in load order.
for t in tables:
    display(t.head())

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0


Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,8,0.0


Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0


In [9]:
# Give each loaded table its own name. The unpacking order mirrors table_names
# (['test', 'train', 'validation']), so the two must be kept in sync.
test, train, validation = tables

### Fill in NaN values

Next we'll fill in missing values to avoid complications with NAs. NA (not available) is how Pandas indicates missing values; many models have problems when missing values are present, so it's always important to think about how to deal with them. Here we pick, for each affected column, an arbitrary sentinel value that is not expected to otherwise appear in the data: `-1` for the `postal0`–`postal3` columns, `0` for `gender` and `1000` for `birthday`.

In [10]:
# Sentinel written into each column wherever a value is missing.
fill_values = {
    'postal0': -1,
    'postal1': -1,
    'postal2': -1,
    'postal3': -1,
    'gender': 0,
    'birthday': 1000,
}

# Apply the same replacements to all three tables, column by column.
for df in (test, train, validation):
    for col, sentinel in fill_values.items():
        df[col] = df[col].fillna(sentinel)

In [11]:
test.isnull().values.any()

False

In [12]:
train.isnull().values.any()

False

In [14]:
validation.isnull().values.any()

False

In [15]:
# Columns handled as categorical variables (converted to the pandas 'category'
# dtype further down).
cat_vars = [
    'start_year', 'start_month',
    'user_id',
    'signup_year', 'signup_month',
    'postal0', 'postal1', 'postal2', 'postal3',
    'gender', 'language', 'birthday',
    'partner_id',
]

# Columns handled as continuous variables (cast to float32 further down).
contin_vars = ['previous_visit_count']

In [16]:
dep = 'sum_amount'

In [17]:
# Turn every categorical column of train into an ordered pandas categorical.
for v in cat_vars:
    as_category = train[v].astype('category')
    train[v] = as_category.cat.as_ordered()

In [18]:
# Convert the categorical columns of `test` in place, using `train` as the
# template so that both frames share the same category definitions.
# NOTE(review): values that occur in `test` but not in `train` presumably
# become missing after this call — confirm against the fastai implementation.
# NOTE(review): `validation` is not passed through apply_cats in the cells
# visible here — confirm whether that is intentional.
apply_cats(test, train)

In [19]:
# Continuous columns: replace missing values with 0 and store them as float32,
# first in train and then in test.
for v in contin_vars:
    for frame in (train, test):
        frame[v] = frame[v].fillna(0).astype('float32')

In [20]:
train.head()

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2.0,0.0
1,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0.0,0.0
2,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1.0,0.0
3,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0.0,0.0
4,2017,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,8.0,0.0


In [22]:
n = len(train); n

48660840

Run this cell to work on a random sample of 150,000 training rows:

In [25]:
# Ask for a fraction of the n rows that corresponds to 150000 of them,
# then pick those rows out of train by position.
idxs = get_cv_idxs(n, val_pct=150000/n)
train_samp = train.iloc[idxs]

# Report how many rows ended up in the sample.
samp_size = len(train_samp)
samp_size

150000

Alternatively, run this cell instead of the previous one to work on the full dataset (it overwrites `samp_size` and `train_samp`):

In [None]:
# Use every row of train instead of a sample. This overwrites the samp_size
# and train_samp values set by the sampling cell, so run only one of the two.
samp_size = n
train_samp = train

In [30]:
train_samp.head(2)

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
41924962,2018,9,23091,2017,11,6.0,4.0,4.0,0.0,1.0,1,1968.0,629,0.0,0.0
24074582,2017,12,32864,2018,3,8.0,6.0,3.0,6.0,1.0,1,1982.0,751,0.0,0.0


In [36]:
# Split the sample into model inputs and target: `df` receives the processed
# feature columns and `y` the raw target values. The target column name comes
# from `dep` so it is defined in a single place.
df, y, nas, mapper = proc_df(train_samp, dep, do_scale=True)

# The target contains zeros (see the sum_amount values displayed above), and
# np.log(0) evaluates to -inf with a divide-by-zero warning, which makes the
# transformed target unusable. log(1 + y) is finite for y == 0 and maps it to 0.
# Predictions made on this scale must be converted back with np.expm1.
yl = np.log1p(y)

  


In [39]:
df

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count
41924962,2,9,4659,6,11,7,6,6,2,2,1,32,153,-0.047079
24074582,1,12,6006,7,3,9,8,5,8,2,1,46,223,-0.047079
39935211,2,8,4787,6,11,9,2,6,11,2,1,1,177,-0.047079
45944530,2,11,4539,6,11,4,2,2,9,3,1,45,51,-0.047079
31969081,2,4,5275,7,1,1,1,1,1,2,1,1,227,-0.047079
23796880,1,12,5065,7,1,9,2,5,10,2,1,1,116,-0.047079
13825859,1,7,5630,7,2,4,3,9,10,3,1,1,95,-0.047079
36405358,2,6,6567,7,8,9,6,10,10,2,1,1,294,-0.047079
1128525,1,1,3826,6,4,7,2,5,2,3,1,61,151,-0.047079
5312410,1,3,4263,6,10,6,9,5,9,2,1,32,51,-0.047079


In [25]:
# Build the modelling frame: the categorical and continuous feature columns
# plus the dependent variable, copied so later edits do not touch the source.
# NOTE(review): this is built from `test`; the commented-out line below used
# `train` instead — confirm which frame is intended here.
#joined = train[cat_vars+contin_vars+[dep]].copy()
joined = test[cat_vars+contin_vars+[dep]].copy()

In [26]:
joined.head()

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0


In [27]:
joined_test = validation;

In [28]:
# At this point `joined_test` is still the very same DataFrame object as
# `validation` (it was assigned without a copy). Select the model columns and
# copy FIRST, so the assignment below only touches the copy; zeroing the
# target before copying would also overwrite the real `sum_amount` values
# stored in `validation`.
joined_test = joined_test[cat_vars+contin_vars+[dep]].copy()

# Placeholder target for the frame we predict on.
joined_test[dep] = 0.0

In [29]:
joined_test.head()

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2,0.0
1,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0,0.0
2,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1,0.0
3,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0,0.0
4,2019,1,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11,0.0


In [30]:
# Turn every categorical column of joined into an ordered pandas categorical.
for v in cat_vars:
    as_category = joined[v].astype('category')
    joined[v] = as_category.cat.as_ordered()

In [31]:
# Continuous columns: replace missing values with 0 and store them as float32,
# first in joined and then in joined_test.
for v in contin_vars:
    for frame in (joined, joined_test):
        frame[v] = frame[v].fillna(0).astype('float32')

In [32]:
joined.head()

Unnamed: 0,start_year,start_month,user_id,signup_year,signup_month,postal0,postal1,postal2,postal3,gender,language,birthday,partner_id,previous_visit_count,sum_amount
0,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,7,2.0,0.0
1,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,12,0.0,0.0
2,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,16,1.0,0.0
3,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,19,0.0,0.0
4,2019,2,1,2012,12,8.0,0.0,5.0,5.0,1.0,1,1984.0,20,11.0,0.0


In [33]:
# Split joined into model inputs (`df`) and the 'sum_amount' target (`y`);
# `nas` and `mapper` are the extra values proc_df returns alongside them.
# NOTE(review): do_scale=True presumably standardises the continuous columns —
# confirm against the fastai proc_df documentation.
df, y, nas, mapper = proc_df(joined, 'sum_amount', do_scale=True)
# The log transform of the target is left disabled here. Note that sum_amount
# contains zeros in the rows displayed above, for which np.log would give -inf.
#yl = np.log(y)