In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import gc
import pickle

## Load data

In [2]:
%%time
# Narrow unsigned integer dtypes are assigned up front to reduce memory use.
dtypes = dict(
    click_id='uint32',
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)
# The attributed time is deliberately not read: it is not a feature of the
# test data, so it cannot be used as a feature here either.
train_df = pd.read_csv(
    'input/train.csv',
    sep=',',
    header=0,
    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'],
    dtype=dtypes,
    parse_dates=['click_time'],
)

Wall time: 4min 46s


In [3]:
%%time
# Read the test clicks with the same dtype map as the training data;
# here click_id is read in place of is_attributed.
test_df = pd.read_csv(
    'input/test.csv',
    sep=',',
    header=0,
    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'],
    dtype=dtypes,
    parse_dates=['click_time'],
)
# Run a garbage-collection pass once both frames have been read.
gc.collect();

Wall time: 30.9 s


7

In [4]:
# Progress marker printed once the loading cells above have run.
print("Loaded...")

Loaded...


## EDA

In [6]:
# Report how many rows each data set contains.
print("Train: {} rows".format(len(train_df)))
print("Test:  {} rows".format(len(test_df)))

Train: 184903890 rows
Test:  18790469 rows


In [11]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0
3,45745,14,1,13,478,2017-11-06 14:34:52,0
4,161007,3,1,13,379,2017-11-06 14:35:08,0


### Sanity checks
Make sure there are no NaN values.

In [7]:
# Count missing values per column of the training frame.
train_df.isna().sum(axis=0)

ip               0
app              0
device           0
os               0
channel          0
click_time       0
is_attributed    0
dtype: int64

In [8]:
# Count missing values per column of the test frame.
test_df.isna().sum(axis=0)

click_id      0
ip            0
app           0
device        0
os            0
channel       0
click_time    0
dtype: int64

Are the click times sorted?

In [10]:
# Check that click_time never decreases from one row to the next in either frame.
# `Series.is_monotonic` was deprecated in pandas 1.5 and removed in pandas 2.0;
# `is_monotonic_increasing` is the equivalent, supported attribute.
train_df.click_time.is_monotonic_increasing, test_df.click_time.is_monotonic_increasing

(True, True)

The `click_time` columns are sorted in ascending (non-decreasing) order in both the train and test data frames.