In [57]:
import pandas as pd
import zipfile
import imblearn
from sklearn.model_selection import train_test_split
import numpy as np
from typing import List

In [6]:
# Read the train and test csvs out of the zip archive into a dict keyed by file name.
# The archive is opened in a `with` block so the file handle is released once both frames are loaded.
with zipfile.ZipFile('data/adtracking.zip') as zip_file:
    dfs = {text_file.filename: pd.read_csv(zip_file.open(text_file.filename))
           for text_file in zip_file.infolist()
           if text_file.filename in ('train.csv', 'test.csv')}

In [10]:
# The classes are severely imbalanced, so we start by undersampling the majority class to even them out.
# We may come back and compare against SMOTE or other approaches depending on model performance.
dfs['train.csv']['is_attributed'].value_counts()

0    184447044
1       456846
Name: is_attributed, dtype: int64

We undersample first because while there is a huge class imbalance, there's still a lot of data in the minority class.  This means that the simple approach has a good chance of being effective while reducing required compute resources.

Undersampling before the split means a smaller dataset but no chance of data leakage, and since the minority class is large the drawback isn't too big.

In [13]:
# Preview the first rows of the training data to see which columns are available
dfs['train.csv'].head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0


In [15]:
# Split the full training frame into features (X) and label (y).
# Both the label and attributed_time are removed from the features; the label is kept as a one-column DataFrame.
X_full = dfs['train.csv'].drop(['is_attributed', 'attributed_time'], axis=1)
y_full = dfs['train.csv'].loc[:, ['is_attributed']]

In [22]:
# Instantiate a random undersampler that shrinks the majority class.
# random_state is fixed (same seed as the train/validation split) so the sampled rows are reproducible across runs.
undersample = imblearn.under_sampling.RandomUnderSampler(sampling_strategy='majority', random_state=1233)
# Apply the undersampling transformation
X_us, y_us = undersample.fit_resample(X_full, y_full)

In [25]:
# Hold out 20% of the undersampled rows for validation; the remainder is used for training.
# The fixed random_state keeps the split identical between runs.
test_size = 0.2
X_train, X_val, y_train, y_val = train_test_split(
    X_us,
    y_us,
    test_size=test_size,
    random_state=1233,
)

Below we'll add our functions to create features.

We'll add quite a few things:
1) From click time, we want to extract the day / hour, ensuring that our times are converted to our timezone
2) Want to find unique users by combinations of our base categories (ip, app, device, os, channel).  May want to hash these - check if feature space is large enough
3) Click counts within next X (1? 2? both?) hour(s) by 'user' category - may want to get this for multiple 'user' categories
4) Time to next click by 'user' category - may want to get this for multiple 'user' categories
5) Time from last click by 'user' category - may want to get this for multiple 'user' categories
6) Avg. attributed ratio of past clicks by 'user' category - may want to get this for multiple 'user' categories
7) Counts of clicks by 'user' category - may want to get this for multiple 'user' categories

In [60]:
def add_hour_day_from_clicktime(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds the hour and day columns as ints from the click_time column.

    The click_time column is parsed once with pd.to_datetime and the parsed values are
    reused for both new columns.  No timezone conversion is applied here.

    Args:
        df (pd.DataFrame): Input dataframe containing a 'click_time' column.  Copied - not changed.

    Returns:
        pd.DataFrame: Copy of the input dataframe with 'hour' (hour of day) and 'day'
            (day of month) added as uint8 columns.
    """
    df2 = df.copy()
    # Parse the timestamps a single time; parsing is the expensive part on large frames
    click_times = pd.to_datetime(df2['click_time'])
    df2['hour'] = click_times.dt.hour.astype('uint8')
    df2['day'] = click_times.dt.day.astype('uint8')
    return df2

In [58]:
# Column combinations used to approximate 'unique users'; each inner list is one grouping.
# add_groupby_user_features groups by every column except the last and aggregates the last one,
# so the order of the columns inside each list matters.
# The combinations are written out by hand (rather than generated) so that individual ones
# can be commented out easily, and because most combinations are not wanted anyway.
u_user_lists = [
    # IP paired with each of the other base columns
    ['ip', 'channel'],
    ['ip', 'device'],
    ['ip', 'os'],
    ['ip', 'app'],
    # IP with the time features - only usable once the time features have been added
    ['ip', 'day', 'hour'],
    # A grouping without IP, in case IP is less important
    ['app', 'channel'],
    # Triplet(s)
    ['ip', 'app', 'os'],
    # Quartet(s)
    ['ip', 'device', 'os', 'app'],
    # All 5 base columns together are excluded as these will be used for grouping
]

# Aggregations applied to every grouping above
grouping_functions = ['nunique', 'cumcount']

In [59]:
def add_groupby_user_features(df: pd.DataFrame, grouping_categories: List[List[str]], grouping_functions: List[str]) -> pd.DataFrame:
    """ Takes an input dataframe, list of groupings to use, and a list of grouping functions (currently just allows for nunique and/or cumcount).
        Adds the grouped values to a copy of the input dataframe.

    For each grouping, the rows are grouped by every column except the last one, and the
    last column is the one that is aggregated:
      * 'nunique'  - number of distinct values of the last column within the row's group
      * 'cumcount' - running 0-based count of the rows seen so far within the row's group, in row order
    Each new column is named "<col1>_<col2>_..._<function>".

    The new columns are written positionally, so the returned frame keeps the same row order
    AND the same index as the input (it stays aligned with a separately held label frame).

    Args:
        df (pd.DataFrame): Input dataframe e.g. X_train
        grouping_categories (List[List[str]]): List containing lists of columns to group by as strings.
            Every inner list needs at least two columns (one or more keys plus the aggregated column).
        grouping_functions (List[str]): List containing strings of functions to aggregate with (must be nunique and/or cumcount at the moment)

    Returns:
        pd.DataFrame: Copy of input dataframe with the new aggregated columns added on.

    Raises:
        ValueError: If a grouping has fewer than two columns, or a grouping function is not supported.
    """
    featured = df.copy()

    for u_list in grouping_categories:
        if len(u_list) < 2:
            raise ValueError(f"Grouping {u_list} needs at least two columns: the key column(s) followed by the column to aggregate.")
        # All columns but the last are the group keys; the last one is aggregated
        *key_cols, target_col = u_list

        for grouping_function in grouping_functions:
            new_col_name = "_".join(u_list) + "_" + grouping_function
            grouped = featured[u_list].groupby(by=key_cols)[target_col]
            if grouping_function == 'nunique':
                # transform broadcasts the per-group result back onto every row, which avoids
                # a merge (a merge would throw away the original index of the frame)
                new_values = grouped.transform('nunique')
            elif grouping_function == 'cumcount':
                new_values = grouped.cumcount()
            else:
                raise ValueError(f"That grouping function {grouping_function} is not currently supported.  Use nunique and/or cumcount.")
            # Assign by position so duplicate or unsorted index labels cannot cause misalignment
            featured[new_col_name] = new_values.to_numpy()
    return featured

In [61]:
def log_bin_column(df: pd.DataFrame, collist: List[str]) -> pd.DataFrame:
    """ Replaces each listed column with the integer part of log2(1 + value).

    Args:
        df (pd.DataFrame): Input dataframe.  A copy is made; the input is left untouched.
        collist (List[str]): Names of the columns to log-bin.

    Returns:
        pd.DataFrame: Copy of the input dataframe in which every column named in collist
            has been overwritten with its log-binned values.  Other columns are unchanged.
    """
    binned = df.copy()
    for column_name in collist:
        raw_values = binned[column_name].values
        # The +1 keeps a raw value of 0 in bin 0 (log2(1) == 0)
        binned[column_name] = np.log2(raw_values + 1).astype(int)
    return binned

In [62]:
def add_next_click(df: pd.DataFrame) -> pd.DataFrame:
    """ Adds the 'next_click' feature to a dataframe

    A 'user' is the combination of the ip, app, device and os columns.  For every row the
    feature is the number of seconds until the same user's next click, log-binned as
    int(log2(1 + seconds)).  The last click of each user has no next click; it is measured
    against a far-future sentinel time (3000000000 epoch seconds), so it lands in a high bin.

    Rows do not need to be sorted by time: the next click is found on a time-sorted view and
    the result is written back in the original row order.  Rows with identical times keep
    their original relative order.

    Two helper columns are added as well:
      * 'user_hash'  - bucket id of the user in [0, 2**26), computed with a hash that gives
                       the same value in every Python process
      * 'epoch_time' - click_time as whole seconds since 1970-01-01

    Args:
        df (pd.DataFrame): Input dataframe with 'ip', 'app', 'device', 'os' and 'click_time'
            columns.  click_time may hold datetime strings or datetimes.  Copied - not changed.

    Returns:
        pd.DataFrame: Copy of the input dataframe with the 'user_hash', 'epoch_time' and
            'next_click' columns added (in that order).
    """

    max_num_categories = 2**26 # max number of categories in our hash
    sentinel_epoch = 3000000000 # stand-in 'next click' time for the last click of each user
    user_cols = ['ip', 'app', 'device', 'os']

    df2 = df.copy()

    # Python's built-in hash() of a string changes between interpreter runs, so use pandas' fixed-key hash
    raw_hash = pd.util.hash_pandas_object(df2[user_cols], index=False).to_numpy()
    df2['user_hash'] = (raw_hash % np.uint64(max_num_categories)).astype(np.int64)

    # Get epoch time of each click; click_time is text when it comes straight from read_csv, so parse it first
    click_times = pd.to_datetime(df2['click_time'])
    if click_times.dt.tz is not None:
        # Epoch seconds are defined in UTC; drop the timezone after converting so the subtraction below works
        click_times = click_times.dt.tz_convert('UTC').dt.tz_localize(None)
    epoch_seconds = (click_times - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    df2['epoch_time'] = epoch_seconds.to_numpy().astype(np.int64)

    # Work on a positional (0..n-1) index so the result can be put back in the original row order
    work = pd.DataFrame({col: df2[col].to_numpy() for col in user_cols})
    work['epoch_time'] = df2['epoch_time'].to_numpy()
    work = work.sort_values('epoch_time', kind='mergesort') # mergesort is stable: ties keep row order

    # Within each user, the following row in time order is that user's next click
    next_epoch = work.groupby(user_cols, sort=False)['epoch_time'].shift(-1)
    next_epoch = next_epoch.fillna(sentinel_epoch).sort_index()
    seconds_to_next = next_epoch.to_numpy().astype(np.int64) - df2['epoch_time'].to_numpy()

    # Last clicks of each user have huge values (sentinel - click time), so log-bin the feature.
    # This is the same transformation that log_bin_column applies.
    df2['next_click'] = np.log2(1 + seconds_to_next).astype(int)

    return df2