# Preprocessing 1

In this notebook, we process the data using preprocessing method 1, which consists of the following steps:
 - Impute missing values in numerical features with 0 or 1;
 - Impute missing values in categorical features with a new class;
 - Encode the categorical features as integers.

Import the libraries:

In [1]:
import numpy as np 
import pandas as pd 
import json 
import matplotlib.pyplot as plt 
import seaborn as sns 
import random
import os
import gc
from ast import literal_eval
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder

## 1. Read Input

In [2]:
# Make sure automatic garbage collection is switched on before loading the data.
gc.enable()

# Read the visitor id and the AdWords page column as strings rather than
# letting pandas infer a dtype for them; the same options apply to both files.
read_opts = {
    'dtype': {'fullVisitorId': 'str', 'trafficSource.adwordsClickInfo.page': 'str'},
    'low_memory': False,
}
df_train = pd.read_csv('rawData/train_init.csv', **read_opts)
df_test = pd.read_csv('rawData/test_init.csv', **read_opts)

## 2. Data Cleaning

Drop the columns that contain only a single unique value:

In [3]:
# Columns whose non-null training values are all identical carry no signal.
# nunique() ignores NaN by default, so "one value plus NaN" also counts.
n_distinct = df_train.nunique()
unique_col = n_distinct[n_distinct == 1].index.tolist()

# Remove the same columns from both splits, then show the resulting train shape.
df_train = df_train.drop(columns=unique_col)
df_test = df_test.drop(columns=unique_col)
df_train.shape

(1365253, 35)

Convert "date" to year/month/day/weekday:

In [4]:
def date_process(df):
    """Expand the ``date`` column into calendar features.

    The ``date`` column is expected to hold ``YYYYMMDD`` values (integers or
    strings). It is parsed once, four integer columns are appended in the
    order ``weekday``, ``year``, ``day``, ``month``, and ``date`` itself is
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Raises
    ------
    ValueError
        If a value in ``date`` does not match the ``YYYYMMDD`` format.
    """
    # Parse with an explicit format in a single vectorized call: this avoids a
    # per-row Python lambda that rebuilds the string, and rejects malformed
    # dates instead of letting the parser guess.
    parsed = pd.to_datetime(df['date'].astype(str), format='%Y%m%d')

    df['weekday'] = parsed.dt.weekday  # Monday == 0 ... Sunday == 6
    df['year'] = parsed.dt.year
    df['day'] = parsed.dt.day
    df['month'] = parsed.dt.month

    # The raw date is fully represented by the derived columns.
    df.drop(columns=['date'], inplace=True)

    return df

# Apply the same date expansion to the training and the test split.
df_train, df_test = date_process(df_train), date_process(df_test)

Convert object features that should be numerical:

In [5]:
def obj2num(df, col_obj):
    """Cast the listed columns of ``df`` to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted; it is modified in place.
    col_obj : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with every listed column stored as float.
    """
    for name in col_obj:
        as_float = df[name].astype(float)
        df[name] = as_float

    return df

# Session-level counters that were read as objects but should be numerical.
col_names = [
    'totals.hits',
    'totals.pageviews',
    'totals.sessionQualityDim',
    'totals.timeOnSite',
    'totals.transactions',
]

# The revenue target is additionally converted, for the training set only.
df_train = obj2num(df_train, [*col_names, 'totals.transactionRevenue'])
df_test = obj2num(df_test, col_names)

Drop the useless features:

In [6]:
# Columns this notebook treats as uninformative for modelling.
col_names = [
    'trafficSource.adwordsClickInfo.gclId',
    'visitId',
    'visitStartTime',
]

df_train = df_train.drop(columns=col_names)
df_test = df_test.drop(columns=col_names)

Drop the leakage features:

In [7]:
# Columns flagged by this notebook as leaking the target; removed from both splits.
col_names = [
    'totals.transactions',
    'totals.totalTransactionRevenue',
]

df_train = df_train.drop(columns=col_names)
df_test = df_test.drop(columns=col_names)

## 3. Preprocessing

### Perform log1p transformation on target:

In [8]:
# Replace the raw revenue target with log(1 + revenue); NaN entries stay NaN here.
target_col = 'totals.transactionRevenue'
df_train[target_col] = np.log1p(df_train[target_col])

  """Entry point for launching an IPython kernel.


Drop two features in which no single value forms a majority:

In [9]:
# The two features removed in this step, dropped from both splits.
feature_drop = [
    'geoNetwork.networkDomain',
    'trafficSource.keyword',
]

df_train = df_train.drop(columns=feature_drop)
df_test = df_test.drop(columns=feature_drop)

### Imputation:

In [10]:
# NOTE: every fill below assigns its result back to the column.
# ``df[col].fillna(..., inplace=True)`` is a chained in-place call: it acts on
# a temporary Series, is deprecated in pandas 2.x, and leaves the DataFrame
# unchanged once Copy-on-Write is active.

# Target: a missing revenue is imputed with 0 (training set only).
df_train['totals.transactionRevenue'] = df_train['totals.transactionRevenue'].fillna(0.0)

# Numerical features whose missing values are imputed with 1.
num_fill_value = 1
num_impute_cols = [
    'totals.pageviews',
    'totals.sessionQualityDim',
    'totals.timeOnSite',
]

# Categorical features whose missing values become their own 'NAN' category.
cat_fill_value = 'NAN'
cat_impute_cols = [
    'customDimensions.value',
    'trafficSource.adContent',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.referralPath',
]

# Apply identical imputation to both splits.
for frame in (df_train, df_test):
    for col in num_impute_cols:
        frame[col] = frame[col].fillna(num_fill_value)
    for col in cat_impute_cols:
        frame[col] = frame[col].fillna(cat_fill_value)

# Compare the slot levels present in each split.
print(df_train['trafficSource.adwordsClickInfo.slot'].unique())
print(df_test['trafficSource.adwordsClickInfo.slot'].unique())

# 'Google Display Network' is folded into the 'NAN' category in the test set.
df_test['trafficSource.adwordsClickInfo.slot'] = (
    df_test['trafficSource.adwordsClickInfo.slot'].replace(['Google Display Network'], 'NAN')
)


['NAN' 'Top' 'RHS']
['NAN' 'Top' 'RHS' 'Google Display Network']


### Encode labels in categorical features with values in [0, n_classes - 1]:

In [11]:
def process_TestOnlyCategory(df_train, df_test, col):
    """Relabel levels of ``col`` that occur only in the test set as ``'Other'``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; only read, to collect the known levels of ``col``.
    df_test : pandas.DataFrame
        Test frame; its ``col`` column is overwritten in place.
    col : str
        Name of the categorical column to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df_test`` object.
    """
    level_train = df_train[col].unique()
    level_test = df_test[col].unique()
    level_testOnly = list(set(level_test) - set(level_train))

    if level_testOnly:
        # Assign the result back to the column: a chained
        # ``df_test[col].replace(..., inplace=True)`` only modifies a temporary
        # Series under pandas Copy-on-Write and would leave df_test unchanged.
        df_test[col] = df_test[col].replace(level_testOnly, 'Other')

    return df_test

def mergeCategory(df_train, df_test, col, threshold):
    """Merge rare levels of ``col`` into a single ``'Other'`` level.

    A level is rare when its count in the training set is strictly below
    ``threshold * number_of_training_rows``. The levels judged rare on the
    training set are relabelled in both frames.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; defines which levels are rare and is modified in place.
    df_test : pandas.DataFrame
        Test frame; the same levels are relabelled in place.
    col : str
        Name of the categorical column to process.
    threshold : float
        Minimum share of training rows a level needs in order to be kept.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``, the same objects that were passed in.
    """
    min_count = threshold * df_train.shape[0]

    level_counts = df_train[col].value_counts()
    to_merge = level_counts[level_counts < min_count].index.tolist()

    if to_merge:
        # Assign back instead of using a chained ``replace(..., inplace=True)``,
        # which would only touch a temporary Series under pandas Copy-on-Write.
        df_train[col] = df_train[col].replace(to_merge, 'Other')
        df_test[col] = df_test[col].replace(to_merge, 'Other')

    return df_train, df_test

def encode_Category(df_train, df_test, col, col_y='totals.transactionRevenue'):
    """Integer-encode the categorical column ``col`` in both frames.

    The encoder is fitted on the levels found in either frame, so a level that
    exists only in the test set (for example an ``'Other'`` bucket created
    during preprocessing when the training set has no rare level) is encoded
    instead of making ``transform`` raise ``ValueError``. When every test level
    also occurs in the training set, the resulting codes are identical to
    fitting on the training set alone.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; ``col`` is overwritten with integer codes in place.
    df_test : pandas.DataFrame
        Test frame; ``col`` is overwritten with integer codes in place.
    col : str
        Name of the categorical column to encode.
    col_y : str, optional
        Unused; kept so existing callers remain valid.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``, the same objects that were passed in.
    """
    le = LabelEncoder()
    # Fit on the union of levels so that both transforms below are guaranteed
    # to know every label they encounter.
    le.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))

    df_train[col] = le.transform(df_train[col])
    df_test[col] = le.transform(df_test[col])

    return df_train, df_test
    

In [12]:
# Categorical features to clean up and integer-encode.
col_category = [
    'channelGrouping',
    'customDimensions.value',
    'device.browser',
    'device.deviceCategory',
    'device.isMobile',
    'device.operatingSystem',
    'geoNetwork.city',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.metro',
    'geoNetwork.region',
    'geoNetwork.subContinent',
    'trafficSource.adContent',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.campaign',
    'trafficSource.medium',
    'trafficSource.referralPath',
    'trafficSource.source',
]

# Levels covering less than this share of the training rows are merged into 'Other'.
rare_level_share = 0.01

for cat_col in col_category:
    print('processing:', cat_col)
    # 1) test-only levels -> 'Other', 2) rare levels -> 'Other', 3) integer codes.
    df_test = process_TestOnlyCategory(df_train, df_test, cat_col)
    df_train, df_test = mergeCategory(df_train, df_test, cat_col, rare_level_share)
    df_train, df_test = encode_Category(df_train, df_test, cat_col)

print('Done')

processing: channelGrouping
processing: customDimensions.value
processing: device.browser
processing: device.deviceCategory
processing: device.isMobile
processing: device.operatingSystem
processing: geoNetwork.city
processing: geoNetwork.continent
processing: geoNetwork.country
processing: geoNetwork.metro
processing: geoNetwork.region
processing: geoNetwork.subContinent
processing: trafficSource.adContent
processing: trafficSource.adwordsClickInfo.adNetworkType
processing: trafficSource.adwordsClickInfo.page
processing: trafficSource.adwordsClickInfo.slot
processing: trafficSource.campaign
processing: trafficSource.medium
processing: trafficSource.referralPath
processing: trafficSource.source
Done


Show the overview of training set:

In [13]:
# Preview the first five rows of the encoded training set.
df_train.head(n=5)

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,customDimensions.value,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,...,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.medium,trafficSource.referralPath,trafficSource.source,weekday,year,day,month
0,1,6849420071276447389,1,4,6,0,0,3,5,1,...,1,0,0,0,5,0,0,2016,1,8
1,6,1231590494443171080,2,4,1,0,0,1,3,1,...,1,0,0,0,0,0,0,2016,1,8
2,1,6751290259214798595,1,4,1,0,0,3,9,1,...,1,0,0,0,5,0,0,2016,1,8
3,6,3213840074316400693,2,4,1,0,0,3,9,1,...,1,0,0,0,0,0,0,2016,1,8
4,6,2987126973357946088,1,4,1,0,0,3,3,1,...,1,0,0,0,0,0,0,2016,1,8


In [14]:
# Per-column summary of the training set: dtype, number of distinct values,
# and the absolute / relative amount of missing values.
nan_per_column = df_train.isnull().sum()
info = pd.DataFrame({
    'dtype': df_train.dtypes,
    'count of unique': df_train.nunique(),
    'num of nan': nan_per_column,
    '% of nan': nan_per_column / df_train.shape[0] * 100.0,
})
info

Unnamed: 0,dtype,count of unique,num of nan,% of nan
channelGrouping,int64,8,0,0.0
fullVisitorId,object,1065112,0,0.0
visitNumber,int64,419,0,0.0
customDimensions.value,int64,6,0,0.0
device.browser,int64,7,0,0.0
device.deviceCategory,int64,3,0,0.0
device.isMobile,int64,2,0,0.0
device.operatingSystem,int64,7,0,0.0
geoNetwork.city,int64,10,0,0.0
geoNetwork.continent,int64,6,0,0.0


Show the overview of test set:

In [15]:
# Preview the first five rows of the encoded test set.
df_test.head(n=5)

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,customDimensions.value,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,...,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.medium,trafficSource.referralPath,trafficSource.source,weekday,year,day,month
0,3,2117155181258233683,1,3,1,0,0,5,9,3,...,1,0,0,4,5,4,0,2018,1,1
1,3,1921388733739279432,1,0,1,0,0,5,9,2,...,1,0,0,4,5,4,0,2018,1,1
2,3,2496114236550828875,1,2,3,0,0,5,9,3,...,1,0,0,4,5,4,0,2018,1,1
3,3,7591683313761032252,1,2,1,1,1,0,9,3,...,1,0,0,4,5,4,0,2018,1,1
4,3,4942061901422712831,1,4,6,1,1,6,9,1,...,1,0,0,4,5,4,0,2018,1,1


In [16]:
# Per-column summary of the test set: dtype, number of distinct values,
# and the absolute / relative amount of missing values.
nan_per_column = df_test.isnull().sum()
info = pd.DataFrame({
    'dtype': df_test.dtypes,
    'count of unique': df_test.nunique(),
    'num of nan': nan_per_column,
    '% of nan': nan_per_column / df_test.shape[0] * 100.0,
})
info

Unnamed: 0,dtype,count of unique,num of nan,% of nan
channelGrouping,int64,8,0,0.0
fullVisitorId,object,266422,0,0.0
visitNumber,int64,362,0,0.0
customDimensions.value,int64,6,0,0.0
device.browser,int64,7,0,0.0
device.deviceCategory,int64,3,0,0.0
device.isMobile,int64,2,0,0.0
device.operatingSystem,int64,7,0,0.0
geoNetwork.city,int64,10,0,0.0
geoNetwork.continent,int64,6,0,0.0


## 4. Output Preprocessed Data

In [18]:
# Write both preprocessed splits to CSV (without the index) and report progress.
outputs = [
    (df_train, 'train_P1.csv', 'Output preprocessed training set done.'),
    (df_test, 'test_P1.csv', 'Output preprocessed test set done.'),
]
for frame, path, message in outputs:
    frame.to_csv(path, index=False)
    print(message)

Output preprocessed training set done.
Output preprocessed test set done.
