In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

They handle 3 billion clicks per day, of which 90% are potentially fraudulent. Their current approach to preventing click fraud for app developers is to measure the journey of a user’s click across their portfolio and flag IP addresses that produce lots of clicks but never end up installing apps. With this information, they've built an IP blacklist and a device blacklist.

While successful, they want to always be one step ahead of fraudsters and have turned to the Kaggle community for help in further developing their solution. In their 2nd competition with Kaggle, you’re challenged to build an algorithm that predicts whether a user will download an app after clicking a mobile app ad. To support your modeling, they have provided a generous dataset covering approximately 200 million clicks over 4 days!

In [4]:
# Load the sample click log; the path is relative to the notebook's working directory.
df_train_sample=pd.read_csv("./kaggle/train_sample.csv")

In [5]:
# (rows, columns) of the loaded sample.
df_train_sample.shape

(100000, 8)

In [6]:
# Preview the first rows to see the raw columns before feature engineering.
df_train_sample.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [63]:
from sklearn.preprocessing import OneHotEncoder

## Training data creation

In [102]:
class training_creation(object):
    """Feature-engineering pipeline for a click log held in a pandas DataFrame.

    The pipeline expects at least the columns ``ip``, ``app``, ``device``,
    ``os``, ``channel`` and ``click_time``. Calling :meth:`transform` adds
    time-part columns, per-IP hourly click rates and per-IP summary
    statistics to ``self.df``, one-hot encodes the categorical columns and
    stores the assembled matrix in ``self.X``.

    Attributes:
        df: working DataFrame. NOTE: this is the object passed to the
            constructor (no copy is made), so the ``_add_*`` time-part
            methods add columns to the caller's frame as well.
        start_time: minimum of the ``click_time`` column at construction.
        end_time: maximum of the ``click_time`` column at construction.
        categorical_col: columns that are one-hot encoded by ``transform``.
        enc: fitted ``OneHotEncoder`` (set by ``transform``).
        X: assembled feature matrix as a numpy array (set by ``transform``).
    """

    # Columns that _add_ip_counts_per_hour insists on before it runs.
    _HOURLY_REQUIRED = ['ip', 'day', 'hour', 'minute', 'second']
    # Names of the 24 per-hour rate columns, h0_count .. h23_count.
    _HOURLY_COLS = ['h' + str(h) + '_count' for h in range(24)]
    # Names of the per-IP summary columns added by _add_IP_feature.
    _IP_FEATURE_COLS = ['hour_mean', 'hour_std', 'os_num', 'device_num',
                        'app_num', 'click_num']

    def __init__(self, df):
        """Store ``df`` and record the min/max of its ``click_time`` column."""
        self.df = df
        self.start_time = df.click_time.min()
        self.end_time = df.click_time.max()
        self.categorical_col = ['app', 'device', 'os', 'channel']

    def _add_click_datetime(self):
        """Add a parsed ``click_datetime`` column derived from ``click_time``."""
        self.df['click_datetime'] = pd.to_datetime(self.df['click_time'])

    def _add_datetime_part(self, part):
        """Add column ``part`` (e.g. 'hour') taken from ``click_datetime.dt``.

        Prints a message and leaves the frame untouched when the
        ``click_datetime`` column has not been created yet.
        """
        if 'click_datetime' in self.df.columns:
            self.df[part] = getattr(self.df['click_datetime'].dt, part)
        else:
            print('no click datetime column')

    def _add_day(self):
        """Add day-of-month col to dataframe"""
        self._add_datetime_part('day')

    @property
    def N_days(self):
        """Number of calendar days spanned by the dataframe (inclusive).

        Computed from ``click_datetime`` dates when that column exists, so a
        log that crosses a month boundary (e.g. Jan 31 -> Feb 1) yields 2
        rather than a value derived from day-of-month subtraction. Falls
        back to the ``day`` column when ``click_datetime`` is absent.
        """
        if 'click_datetime' in self.df.columns:
            dates = self.df['click_datetime'].dt.normalize()
            return (dates.max() - dates.min()).days + 1
        days = self.df['day']
        return int(days.max() - days.min() + 1)

    def _add_hour(self):
        """Add hour col to dataframe"""
        self._add_datetime_part('hour')

    def _add_minute(self):
        """Add minute col to dataframe"""
        self._add_datetime_part('minute')

    def _add_second(self):
        """Add second col to dataframe"""
        self._add_datetime_part('second')

    def _add_ip_counts_per_hour(self):
        """Add 24 per-IP hourly click-rate columns, ``h0_count``..``h23_count``.

        For each IP, ``hX_count`` is the number of its clicks whose hour is X
        divided by ``N_days`` (an average per observed day). Every row of the
        same IP carries the same values; hours in which an IP never clicked
        are 0. Row order of ``self.df`` is preserved.

        Prints the missing column names and returns without changes when any
        of ip/day/hour/minute/second is absent.
        """
        df = self.df
        required = self._HOURLY_REQUIRED
        if not set(required).issubset(df.columns):
            print('no {} column'.format(set(required).difference(set(df.columns))))
            return

        ndays = self.N_days
        # One pass: (ip, hour) click counts pivoted to an ip x 24 table.
        per_hour = (
            df.groupby(['ip', 'hour']).size()
              .unstack('hour', fill_value=0)
              .reindex(columns=range(24), fill_value=0)
        ) / ndays
        per_hour.columns = self._HOURLY_COLS
        per_hour = per_hour.reset_index()

        # Drop stale copies first so a re-run does not create _x/_y duplicates.
        base = df.drop(self._HOURLY_COLS, axis=1, errors='ignore')
        # Left join keeps one output row per input row, in the input order.
        self.df = pd.merge(base, per_hour, on='ip', how='left')

    def _add_IP_feature(self):
        """Add per-IP summary columns to the dataframe.

        Adds ``hour_mean``, ``hour_std`` (mean / std of click hour),
        ``os_num``, ``device_num``, ``app_num`` (distinct counts) and
        ``click_num`` (non-null ``app`` count per IP). NaN aggregates, such as
        the std of a single click, are replaced with 0.
        """
        # Drop stale copies first so a re-run does not create _x/_y duplicates.
        df = self.df.drop(self._IP_FEATURE_COLS, axis=1, errors='ignore')
        ip_feature = (
            df.groupby('ip')
              .agg({'hour': ['mean', 'std'], 'os': 'nunique',
                    'device': 'nunique', 'app': ['nunique', 'count']})
              .fillna(0)
              .reset_index()
        )
        ip_feature.columns = ['ip'] + self._IP_FEATURE_COLS
        self.df = pd.merge(df, ip_feature, on='ip', how='left')

    def _convert_categorical(self):
        """One-hot encode ``categorical_col``.

        Returns:
            (encoder, array): the fitted ``OneHotEncoder`` and the dense
            encoded array for the current dataframe.
        """
        enc = OneHotEncoder()
        encoded = enc.fit_transform(self.df[self.categorical_col])
        return enc, encoded.toarray()

    def transform(self, exclude_cols=None):
        """Run the full pipeline and store the result in ``self.X``.

        Args:
            exclude_cols: optional iterable of additional column names to
                leave out of ``X``. Defaults to None (nothing extra dropped),
                which reproduces the original matrix layout.

        NOTE(review): only ``categorical_col`` is dropped by default, so every
        other column of ``self.df`` ends up in ``X``. For the sample data that
        includes the label ``is_attributed``, ``attributed_time``, the raw
        ``click_time`` / ``click_datetime`` values and the row id column,
        which leaks the target and makes ``X`` an object array. Pass those
        names via ``exclude_cols`` before fitting a model on ``X``.
        """
        self._add_click_datetime()
        self._add_day()
        self._add_hour()
        self._add_minute()
        self._add_second()
        self._add_ip_counts_per_hour()
        self._add_IP_feature()
        self.enc, cat_array = self._convert_categorical()

        dropped = list(self.categorical_col) + list(exclude_cols or [])
        X = self.df.drop(dropped, axis=1)
        # Remaining columns first, one-hot columns appended on the right.
        self.X = np.concatenate((X.values, cat_array), axis=1)

In [103]:
# Wrap the sample in the pipeline; the constructor keeps a reference to the frame (no copy).
test = training_creation(df_train_sample)

In [104]:
# Run every feature step; results are stored on the instance as test.df, test.enc and test.X.
test.transform()

In [105]:
# Feature matrix assembled by transform(): non-categorical columns followed by the one-hot columns.
X=test.X

In [110]:
# Shape of the enriched DataFrame after the engineered columns were added.
test.df.shape

(100000, 43)

In [107]:
# Shape of the final matrix; its width is the non-categorical columns plus the one-hot columns.
X.shape

(100000, 591)