# Import Data

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
import time
import os.path

In [2]:
# group = pd.read_csv('Group_xx.csv') # format of output
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')
validation_raw = pd.read_csv('validation.csv')

In [3]:
def down_sample_train(train, n = 1):
    # down sample click = 0 data so as 'len(click = 0) = n* len(click = 1)'
    
    click_0 = train[ train.click ==0 ]
    click_1 = train[ train.click ==1 ]
    
    np.random.seed(666)
    sampled_click_index = np.random.choice( click_0.index, len(click_1)*n, replace = False)
    sampled_click_df = click_0.loc[ sampled_click_index ]
    
    new_train = pd.concat([sampled_click_df, click_1], axis = 0)
    
    return new_train
    

def split_useragent( df ):
    # split column useragent into 'OS' and 'browser'
    
    user_agent = df['useragent']
    OS = [agent.split('_')[0] for agent in user_agent]
    browser = [agent.split('_')[1] for agent in user_agent]
    df['OS'] = OS
    df['browser'] = browser
    df = pd.concat([df, pd.get_dummies( df['OS'], prefix = 'OS')], axis = 1)
    df = pd.concat([df, pd.get_dummies( df['browser'], prefix = 'browser')], axis = 1)
    df = df.drop(['useragent','OS', 'browser'], axis = 1)
    
    return df 


def split_usertag(df):
    # split user tags to dummy variables 
    
    splitting_tags = df['usertag'].str.get_dummies(sep=',')
    df = pd.concat([df, splitting_tags], axis = 1)
    df = df.drop('usertag', axis = 1)
    
    return df

def create_dummies(df, dummy_variables):
    # one-hot encoding for categorical variables
    
    for feat in dummy_variables:
        
        df = pd.concat([df, pd.get_dummies(df[feat], prefix=feat)],axis=1)
        df = df.drop(feat,axis=1)
    
    return df

def preprocess(df, downSample = False):
    # pipeline for sampling data and transform features 
    
    if downSample:
        df = down_sample_train(df)
    
    # extract OS and browser from useragent
    df = split_useragent(df)
    
    # compute slotsize
    df['slotsize'] = df['slotwidth'] * df['slotheight']

    df = split_usertag(df)

    df = df.drop(['bidid', 'userid','IP','domain','url', 'urlid', 'slotid','slotformat','creative',\
                                 'keypage', 'slotwidth', 'slotheight'], axis = 1)
    
    # create dummies
    dummy_variables = ['region', 'city', 'adexchange', 'slotvisibility', 'advertiser']
    df = create_dummies(df, dummy_variables)
    
    return df

def remove_non_seen_cols(train,df, isTest = False):
    # due to sampling, many values may not seen before, only kept the seen data
    
    missing_col = np.setdiff1d(np.array(train.columns), np.array(df.columns))
    
    for col in missing_col:
        df[col] = 0
    
    if isTest:
        df = df[train.drop(['bidprice', 'click', 'payprice'], axis = 1).columns]
    else:
        df = df[train.columns]
    
    return df


In [4]:
class Time_Tracking():
    
    start_time = None
    
    def start_tracking(self):
        
        self.start_time = time.time()
    
    def stop_tracking(self):
        
        print("Time used:", round(((time.time() - self.start_time)/60),2), ' minutes')

In [5]:
t = Time_Tracking()
t.start_tracking()

train = preprocess(train_raw, downSample = True)
valid = preprocess(validation_raw)
valid = remove_non_seen_cols( train, valid )
test = preprocess(test_raw)
test = remove_non_seen_cols( train, test, isTest=True)

t.stop_tracking()

Time used: 1.03  minutes


# Save to pickle

In [6]:
n_bytes = 2**31
max_bytes = 2**31 - 1


def write_to_pickle(file_path, data):

    bytes_out = pickle.dumps(data)
    with open(file_path, 'wb') as f_out:
        for idx in range(0, n_bytes, max_bytes):
            f_out.write(bytes_out[idx:idx+max_bytes])

write_to_pickle('train_df_after_preprocessing.pkl', train)
write_to_pickle('valid_df_after_preprocessing.pkl', valid)
write_to_pickle('test_df_after_preprocessing.pkl', test)

# How to load pickle

In [7]:
def load_pickle(file_path):
    bytes_in = bytearray(0)
    input_size = os.path.getsize(file_path)
    with open(file_path, 'rb') as f_in:
        for _ in range(0, input_size, max_bytes):
            bytes_in += f_in.read(max_bytes)
    
    return pickle.loads(bytes_in)

In [8]:
#  read pickle        
train_ = load_pickle('train_df_after_preprocessing.pkl')
test_ = load_pickle('test_df_after_preprocessing.pkl')
valid_ = load_pickle('valid_df_after_preprocessing.pkl')

In [9]:
if_train_valid = (train == train_)
if_valid_valid = (valid == valid_)
if_test_valid = (test == test_)

print(if_train_valid)
print(if_valid_valid)
print(if_test_valid)

         click  weekday  hour  slotprice  bidprice  payprice  OS_android  \
367179    True     True  True       True      True      True        True   
1858024   True     True  True       True      True      True        True   
54000     True     True  True       True      True      True        True   
1073484   True     True  True       True      True      True        True   
1185737   True     True  True       True      True      True        True   
115944    True     True  True       True      True      True        True   
592831    True     True  True       True      True      True        True   
2080586   True     True  True       True      True      True        True   
1860949   True     True  True       True      True      True        True   
1724755   True     True  True       True      True      True        True   
1630318   True     True  True       True      True      True        True   
2088154   True     True  True       True      True      True        True   
2004240   Tr