In [2]:
# Create training and validation data sets that mirror the relationship
#   between the training data and the test data

# Based on my script
#   https://www.kaggle.com/aharless/training-and-validation-data
# which is based on Konrad's script
#   https://www.kaggle.com/konradb/validation-set
# and Alexander Firsov's discussion thread
#   https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/51877

import numpy as np
import pandas as pd
import gc

# Data specifications
# Only these columns are loaded from train.csv.  'click_time' has no entry in
# `dtypes` and no parse_dates is requested, so pandas infers its type and it
# is not converted to a datetime here.
columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        }

# File locations, defined once so the four reads and two writes cannot drift apart.
TRAIN_CSV = "/home/kai/data/kaggle/talkingdata/data/train.csv"
PICKLE_DIR = "/home/kai/data/kaggle/talkingdata/data/training-and-validation-data-pickle/"

# Number of leading rows of train.csv used as training data.
TRAINING_NROWS = 122071523

# Validation data is taken from three separate windows of train.csv.
# Each entry is (label used in the progress message,
#                first file line to read -- line 0 is the header,
#                number of rows to read).
VALIDATION_CHUNKS = [
    ("first",  144708153, 7705357),
    ("second", 161974466, 6291379),
    ("third",  174976527, 6901686),
]


def read_train_rows(nrows, first_line=None):
    """Read a contiguous window of rows from TRAIN_CSV.

    Parameters
    ----------
    nrows : int
        Number of data rows to read.
    first_line : int or None
        File line (0-based, line 0 being the header) at which reading starts.
        File lines 1 .. first_line-1 are skipped so that the header is kept.
        None reads from the top of the file.

    Returns
    -------
    pandas.DataFrame
        The requested rows, restricted to `columns` and typed per `dtypes`.
    """
    skip = None if first_line is None else range(1, first_line)
    return pd.read_csv( TRAIN_CSV,
                        skiprows=skip,
                        nrows=nrows,
                        usecols=columns,
                        dtype=dtypes)


# Training data
print( "Extracting training data...")
training = read_train_rows(TRAINING_NROWS)

# Validation data
validation_chunks = []
for label, first_line, nrows in VALIDATION_CHUNKS:
    print( "Extracting " + label + " chunk of validation data...")
    validation_chunks.append(read_train_rows(nrows, first_line))

# One concat with a fresh 0..n-1 index replaces the incremental concats and
# the later reset_index(drop=True); the resulting frame is the same.
validation = pd.concat(validation_chunks, ignore_index=True)
del validation_chunks
gc.collect()

print( "\nTraining data:")
print( training.shape )
print( training.head() )
print( "Saving training data...")
# The '.gz' suffix lets to_pickle infer gzip compression from the file name.
training.to_pickle(PICKLE_DIR + 'training.pkl.gz')

print( "\nValidation data:")
print( validation.shape )
print( validation.head() )
print( "Saving validation data...")
validation.to_pickle(PICKLE_DIR + 'validation.pkl.gz')

print("\nDone")

Extracting training data...
Extracting first chunk of validation data...
Extracting second chunk of validation data...
Extracting third chunk of validation data...

Training data:
(122071523, 7)
       ip  app  device  os  channel           click_time  is_attributed
0   83230    3       1  13      379  2017-11-06 14:32:21              0
1   17357    3       1  19      379  2017-11-06 14:33:34              0
2   35810    3       1  13      379  2017-11-06 14:34:12              0
3   45745   14       1  13      478  2017-11-06 14:34:52              0
4  161007    3       1  13      379  2017-11-06 14:35:08              0
Saving training data...

Validation data:
(20898422, 7)
       ip  app  device  os  channel           click_time  is_attributed
0   38877    2       1  19      477  2017-11-09 04:00:00              0
1  104271   14       1  18      489  2017-11-09 04:00:00              0
2   92922    3       1  13      280  2017-11-09 04:00:00              0
3    6505    3       1  13   

In [8]:
# Show the last rows of the training frame (tail() returns five rows by default)
training.tail()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
122071518,108229,15,1,19,245,2017-11-08 16:00:00,0
122071519,73487,12,2,22,326,2017-11-08 16:00:00,0
122071520,238206,14,1,19,379,2017-11-08 16:00:00,0
122071521,27482,26,1,32,121,2017-11-08 16:00:00,0
122071522,99150,2,2,19,122,2017-11-08 16:00:00,0
