In [7]:
from collections import Counter
import pandas as pd
import numpy as np
import time
import gc

import pprint
pp = pprint.PrettyPrinter(indent=4)

# Pipelining
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split

In [2]:
RANDOM_STATE=42

# Objective

The challenge is to predict the CTR (click through rate). 
The original Kaggle challenge although focus on Fraud but we can still proceed with similar fashion of approach.

This notebook covers the core logic behind the code. Majority of the time was spent on structuring the ETL 
(outside of this notebook).

# Data Extraction

There are 7 features in total, where 5 features are categorical and the other 2 as date features.

In [3]:
categorical_features = ['ip', 'app', 'device', 'os', 'channel']
datetime_features = ['click_time']

`attributed_time` is defined as :
```
if user download the app for after clicking an ad, this is the time of the app download
```

This would therefore make more sense to exclude `attributed_time` as training feature, we could use it for sanity
check instead.

### Type Specification

In [4]:
# Variable types, will be a schema for actual external database
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32'
}

### Specify Data Source

In [5]:
data_path = "../../data/"

### Data Loading

Due to the limiting size of machine used, total data points used for both train and test are a sub sample of the original
dataset.  

In the use case of actual large data consumption scenarios, distributed framework such as Spark will serve as a 
better tool for parallel batch processing.

In [6]:
data_full = pd.read_csv(data_path + '/train/train.csv', 
                           dtype=dtypes)

### Imbalanced data

Imbalanced dataset can be countered using two general approaches:
    
1. Oversampling  
2. Undersampling  

(1) Oversampling strategies:  
    - Use SMOTE-NC to synthesize more data points using the attributes of nearest-neighbours  
    - Tune scale_pos_weight that increases the weight for minority class
    
(2) Undersampling strategies:  
    - Reduce the negative sample population to get closer of that of the positive ones

### Current sampling strategy

In this problem, the amount of data points are huge - 184,903,891 (about 184 million) and positive sample only
consists of 0.2% of the total population.

In order to let the model pick up more patterns of the positive sample, all positive samples are kept while we 
undersampled the negative samples randomly.

Total number of data points is predefined, minus all positive (minority)sample population which will then be the 
negative (majority) sample population.

In [None]:
num_target_data_points = 1500000

In [8]:
y_label = 'is_attributed'

In [11]:
data_full[y_label].value_counts(0), data_full[y_label].value_counts(1)

(0    184447044
 1       456846
 Name: is_attributed, dtype: int64,
 0    0.997529
 1    0.002471
 Name: is_attributed, dtype: float64)

In [15]:
# Get all positive sample
data_pos_only = data_full[data_full[y_label] == 1]

In [18]:
num_pos_points = data_pos_only.shape[0]

In [20]:
data_neg_sampled = data_full[data_full[y_label] == 0].sample(n=(num_target_data_points - num_pos_points)) 

### Form sampled new dataset

In [21]:
data_sampled = pd.concat([data_neg_sampled, data_pos_only], axis=0).reset_index(drop=True)

In [23]:
labels = data_sampled[y_label]
features = data_sampled.drop(columns=[y_label, 'attributed_time'])

In [24]:
num_data_points, num_features = data_sampled.shape
print(f"{num_data_points} data points with {num_features} features.")

1500000 data points with 8 features.


In [25]:
labels.value_counts(0), labels.value_counts(1)

(0    1043154
 1     456846
 Name: is_attributed, dtype: int64,
 0    0.695436
 1    0.304564
 Name: is_attributed, dtype: float64)

In [26]:
data_sampled.head(n=5)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,127088,12,1,17,178,2017-11-09 13:47:04,,0
1,84671,12,1,16,328,2017-11-07 03:59:23,,0
2,123994,9,1,15,134,2017-11-07 00:09:19,,0
3,106824,9,1,22,448,2017-11-09 11:00:24,,0
4,94729,3,1,53,173,2017-11-08 14:21:01,,0


# Train Test Split

In [27]:
# Split the dataset into train and test (label name is pre-defined)
features, labels = data_sampled.drop(columns=[y_label]), data_sampled[y_label]

train_X, test_X, train_y, test_y = train_test_split(features, labels,
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE,
                                                    stratify=labels)

# Basic Stats

### Check Nulls

In [28]:
for col in train_X.columns:
    nan_abs_val = train_X[col].isna().sum()
    nan_percent_val = nan_abs_val / num_data_points * 100
    print(f"{col} have {nan_abs_val} nulls, {nan_percent_val} %.")

ip have 0 nulls, 0.0 %.
app have 0 nulls, 0.0 %.
device have 0 nulls, 0.0 %.
os have 0 nulls, 0.0 %.
channel have 0 nulls, 0.0 %.
click_time have 0 nulls, 0.0 %.
attributed_time have 834523 nulls, 55.634866666666674 %.


### Check Unique (only for categorical)

In [29]:
for col in categorical_features:
    num_cat_uniq = train_X[col].nunique()
    percent_cat_uniq = num_cat_uniq / num_data_points * 100
    print(f"{col} have unique number of categories: {num_cat_uniq}, {percent_cat_uniq:.3g}% .")

ip have unique number of categories: 222624, 14.8% .
app have unique number of categories: 346, 0.0231% .
device have unique number of categories: 1755, 0.117% .
os have unique number of categories: 201, 0.0134% .
channel have unique number of categories: 179, 0.0119% .


### Check categorical encoded range and common elements

In [30]:
for col in categorical_features:
    print('='*30)
    print(f"For {col}, min is {train_X[col].min()}, max is {train_X[col].max()}")
    c = Counter(train_X[col]).most_common(10)
    top_10_common_elements = sorted(c, key=lambda x:x[1], reverse=True)
    
    print(f"Top 10 most common {col} categories with frequency:") 
    pp.pprint(top_10_common_elements)

For ip, min is 1, max is 364777
Top 10 most common ip categories with frequency:
[   (5348, 7472),
    (5314, 6872),
    (73487, 4381),
    (73516, 4255),
    (53454, 2377),
    (26995, 2086),
    (95766, 2048),
    (114276, 2041),
    (105475, 1634),
    (17149, 1555)]
For app, min is 0, max is 768
Top 10 most common app categories with frequency:
[   (3, 161335),
    (12, 111104),
    (19, 109574),
    (2, 103113),
    (9, 89327),
    (18, 77639),
    (15, 74852),
    (35, 51379),
    (14, 47007),
    (29, 34801)]
For device, min is 0, max is 4223
Top 10 most common device categories with frequency:
[   (1, 1032639),
    (0, 85722),
    (2, 38316),
    (3032, 3112),
    (6, 2387),
    (40, 2169),
    (16, 1766),
    (3543, 1240),
    (18, 1178),
    (21, 948)]
For os, min is 0, max is 866
Top 10 most common os categories with frequency:
[   (19, 263336),
    (13, 228949),
    (17, 52940),
    (18, 49882),
    (22, 43947),
    (24, 39512),
    (0, 31873),
    (10, 28849),
    (8, 2850

### Check label distribution

In [31]:
train_y.value_counts(1)

0    0.695436
1    0.304564
Name: is_attributed, dtype: float64

# Feature Generation Strategy

Features would be generated via sklearn transformer to retain any states and for clearer code structural consturction.

In [32]:
class FeatureExtractorTransformer(TransformerMixin):

    def __init__(self, feature_list):
        self.feature_list = feature_list

    def transform(self, input_df):
        # Return selected features from dataframe
        return input_df[self.feature_list]

    def fit(self, *_):
        return self

## 1. Categorical

Most common and simplest way to deal with categorical features is one-hot encoding. 
However, the dimensions to deal with here are quite high.

We can reduce the cardinality by picking only top n most frequent cateogries and group the rest into a single category.

In [33]:
class CategoricalReduceTransformer(TransformerMixin):

    def __init__(self, config={}):

        self.top_n = config.get('top_n', None)
        self.feature_list = config.get('feature_list', None)
        self.default_column_names = config.get('default_column_names', None)

        self.one_hot_columns = None
        self.all_columns = None
        self.top_n_cats = {}

    def transform(self, input_df, **transform_params):

        # Check if the input is dataframe, if not convert to dataframe with set columns
        if not isinstance(input_df, pd.DataFrame):
            input_df = pd.DataFrame(input_df, columns=self.default_column_names)

        if self.feature_list is not None:
            feat_list = self.feature_list
        else:
            feat_list = input_df.columns

        one_hot_cols = []
        # Apply one_hot coding for all features in feature list
        for col in feat_list:
            input_df[col] = input_df[col].map(str)

            if self.top_n_cats:
                # New / unknown value will be treated as minority
                # Top n most frequent categories and None values retained respective encoding
                input_df.loc[~input_df[col].isin(self.top_n_cats[col]), col] = '-1'

            # Get the one_hot coding
            one_hot_df = pd.get_dummies(input_df[col], prefix=col)

            # Drop original feature from dataset
            input_df = input_df.drop(columns=[col])

            # Add one hot coding instead of original feature
            input_df = pd.concat([input_df, one_hot_df], axis=1)

            # Keep track of one_hot columns in train set
            one_hot_cols.extend(one_hot_df.columns)
            
        # If transformer has stored state (fit was used), assign unknown / new category with 0
        missing_columns = list(set(self.one_hot_columns) - set(one_hot_cols))

        # Assign any missing columns as all zeros
        input_df[missing_columns] = 0

        # Check the order of columns are the same
        input_df = input_df[list(self.all_columns) + list(self.one_hot_columns)]

        return input_df

    def fit(self, input_df, *_):

        # Check if the input is dataframe
        if not isinstance(input_df, pd.DataFrame):
            input_df = pd.DataFrame(input_df, columns=self.default_column_names)

        if self.feature_list is not None:
            feat_list = self.feature_list
        else:
            feat_list = input_df.columns

        one_hot_cols = []

        # Apply one hot coding for all features in feature list
        for col in feat_list:
            input_df[col] = input_df[col].map(str)

            if self.top_n is not None:
                # Get top n most frequent categories, replace the minority as single class
                cat_counter = Counter(input_df[col]).most_common(self.top_n)
                self.top_n_cats[col] = set([c[0] for c in cat_counter] + ['nan'])
                input_df.loc[~input_df[col].isin(self.top_n_cats[col]), col] = '-1'

            # Get the one_hot coding
            one_hot_df = pd.get_dummies(input_df[col], prefix=col)

            # Drop original feature from dataset
            input_df = input_df.drop(columns=[col])

            # Keep track of one_hot columns in train set
            one_hot_cols.extend(one_hot_df.columns)

        self.one_hot_columns = one_hot_cols
        self.all_columns = input_df.columns

        return self

    def get_feature_list(self):
        return list(self.all_columns) + list(self.one_hot_columns)

## 2. Datetime

Datetime feature can be generated by breaking down to individual temporal dimension i.e hour, minutes etc.

These can then be further break down into combination of sin and cos - cyclic representation of features.

In [34]:
dt_mappings = {
    'month': 12,
    'day': 31,
    'weekday': 7,
    'hour': 24,
    'minute': 60,
    'second': 60
}

In [35]:
class DateTimeTransformer(TransformerMixin):

    def __init__(self, feature_list=None):
        self.feature_list = feature_list

    def transform(self, df):
        # Only collect transformed feature columns
        dt_feats = pd.DataFrame()

        for col in df.columns:
            # Extract datetime series
            dt_series = df[col]

            if isinstance(dt_series, pd.Series):
                try:
                    dt_series = pd.to_datetime(dt_series)

                    for dt_scale, dt_mval in dt_mappings.items():
                        # Since sin-cos is represented in a circular fashion, (0,0) is never reached
                        # This is hence used to indicate None
                        conv_dt_series = 2*np.pi*getattr(dt_series.dt, dt_scale) / dt_mval
                        dt_feats[f'{col}_sin_{dt_scale}'] = np.sin(conv_dt_series).fillna(0)
                        dt_feats[f'{col}_cos_{dt_scale}'] = np.cos(conv_dt_series).fillna(0)

                except Exception:
                    raise ValueError("Invalid datetime object or string.")
            else:
                raise ValueError("Input must be Pandas Series type.")

        return dt_feats

    def fit(self, *_):
        # This should be a stateless transformer
        return self


# Pipeline Construction

In [36]:
from sklearn.pipeline import Pipeline, FeatureUnion

In [37]:
features_config = {
    'category': {
        'top_n': 5,
        'feature_list': ['ip', 'app', 'device', 'os', 'channel'],
    },
    'datetime': ['click_time']
}

In [38]:
categorical_config = features_config['category']
datetime_feats = features_config['datetime']

## 1. Feature Block

In [39]:
preprocessor_pipeline = FeatureUnion([
        ('category', Pipeline([
            ('extract', FeatureExtractorTransformer(categorical_config['feature_list'])),
            ('one_hot', CategoricalReduceTransformer(categorical_config))
        ])),
        ('datetime', Pipeline([
            ('extract', FeatureExtractorTransformer(datetime_feats)),
            ('datetime', DateTimeTransformer())
        ]))
])

## 2. Model Block

In [40]:
from xgboost import XGBClassifier

class XGBoost:
    def __init__(self, extra_grid_params={}):
        self.name = 'xgboost'
        self.model = XGBClassifier
        self.grid_params = {
            'clf__n_estimators': np.arange(100, 300, 100),  # number of trees
            'clf__learning_rate': [0.1],
            'clf__max_depth': np.arange(2, 8, 3),  # max number of levels in each decision tree,
        }
        self.grid_params.update(extra_grid_params)

    def gen_model_grid_params(self):
        model_content = {
            'model': self.model,
            'params': self.grid_params
        }
        return model_content

In [41]:
ml_pipeline = XGBoost()

## ML Flow Pipeline

In [42]:
ml_flow_pipeline = Pipeline([
    ('features', preprocessor_pipeline),
    ('clf', ml_pipeline.model())
])

# Model Training

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [44]:
score_metric = 'roc_auc'
n_jobs = 4
n_inner_fold = 3

In [45]:
model_train_start = time.time()

# Stratified K Fold validation as to maintain the imbalanced data distribution within folds
k_fold = StratifiedKFold(n_splits=n_inner_fold, random_state=RANDOM_STATE)

# Perform GridSearch Cross Validation    
model = GridSearchCV(estimator=ml_flow_pipeline, 
                     param_grid=ml_pipeline.grid_params, 
                     scoring=score_metric, 
                     cv=k_fold, 
                     n_jobs=n_jobs)

# Train the model
model.fit(train_X, train_y)

model_train_end = time.time()

print(f"Model training took {model_train_end-model_train_start:.5} seconds.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Model training took 559.76 seconds.


# Model Evaluation

Considering we might be interested to use multiple metrics to measure the performance, we can create a class
to accept the trained model and targeted performance metric. This can then be used to generate or save any 
visualisations (if needed). 

In [46]:
from sklearn.metrics import roc_auc_score, log_loss, recall_score

available_metrics = {
    'roc_auc_score': roc_auc_score,
    'log_loss': log_loss,
    'recall_score': recall_score
}

def get_feval(eval_metric):
    if eval_metric not in available_metrics:
        raise ValueError(f"{eval_metric} is not available. Available metrics are {list(available_metrics.keys())}. ")

    return available_metrics[eval_metric]

In [47]:
class ModelPerformance:
    def __init__(self, model, metrics=None):
        self.model = model
        if metrics is None:
            self.metrics = set()
        else:
            self.metrics = metrics

    def set_metrics(self, metrics):
        if isinstance(metrics, list):
            metrics = set(metrics)

        self.metrics = metrics

    def evaluate(self, data, y_true):
        # Performance
        performance_cache = {}

        # Get prediction
        y_pred = self.model.predict(data)

        for metric in self.metrics:
            performance_cache[metric] = get_feval(metric)(y_true, y_pred)

        return performance_cache

### Performance Measurement

In [48]:
ml_metrics = ['recall_score', 'roc_auc_score']
ml_eval = ModelPerformance(model, ml_metrics)

In [49]:
performance_metrics = ml_eval.evaluate(test_X, test_y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [50]:
for metric_name, metric_val in performance_metrics.items():
    print(f"Trained model achieved score {metric_val} of {metric_name}.")

Trained model achieved score 0.5279252262802482 of recall_score.
Trained model achieved score 0.7577171366768948 of roc_auc_score.
