In [35]:
import numpy as np
import pandas as pd
import random

# from sklearn.metrics import log_loss
# train_file = "../data/CTR/train.csv"
# train = pd.read_csv(train_file, nrows=1000000)

# Note that the sampling steps take a while. At least a couple minutes

import os

root_path = os.path.dirname(os.getcwd())

train_file = os.path.join(root_path, "data/CTR/train.csv")
train_less_file = os.path.join(root_path, "data/CTR/train_less.csv")
    
def create_sample_and_save(sample_size):
    
    print ("Counting lines...")
    with open(train_file) as f:
        file_length = sum(1 for line in f) - 1
    
    print ("Generating sampling indices...")
    skip = sorted(random.sample(range(1,file_length+1),file_length-sample_size)) 
    
    print ("Generating pandas dataframe...")
    train = pd.read_csv(train_file, skiprows=skip)
    
    print ("Saving to file...")
    train.to_csv(train_less_file, index=False)

    
sample_size = int(5*10**6)

if not os.path.exists(train_less_file):
    print (train_less_file)
    create_sample_and_save(sample_size)

train = pd.read_csv(train_less_file)

/home/michael_cmx/repos/kaggle-exploration/data/CTR/train_less.csv
Counting lines...
Generating sampling indices...
Generating pandas dataframe...
Saving to file...


In [36]:
# Reduce it even further because I don't know why my memory is so damn low
train = train[:int(10**6)]

In [37]:
train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10001264480619467364,0,14102100,1002,0,84c7ba46,c4e18dd6,50e219e0,ecad2386,7801e8d9,...,0,0,21689,320,50,2496,3,167,100191,23
1,10005541670676403131,0,14102100,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,...,1,0,20984,320,50,2371,0,551,100217,46
2,10005951398749600249,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,-1,79
3,10006789981076459409,0,14102100,1005,0,030440fe,08ba7db9,76b2941d,ecad2386,7801e8d9,...,1,0,20596,320,50,2161,0,35,-1,157
4,10009699694430474960,1,14102100,1005,0,4dd0a958,79cf0c8d,f028772b,ecad2386,7801e8d9,...,1,0,20366,320,50,2333,0,39,-1,157


## Baseline score

In [38]:
from sklearn.metrics import log_loss

In [39]:
msk = np.random.rand(len(train)) < 0.8
features = [3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23]
X_train = train[msk].iloc[:,features]
X_test = train[~msk].iloc[:,features]
y_train = train[msk].iloc[:,1]
y_test = train[~msk].iloc[:,1]

In [43]:
print(log_loss(y_test.values,np.ones(len(y_test))*y_train.mean()))

0.4480827875044258


## Models
We're going to use two simple models for now: the default Logistic Regression and Random Forest models in sklearn

In [44]:
import sklearn.ensemble as sk_ensemble
random_forest = sk_ensemble.RandomForestClassifier(n_estimators=25, max_depth=10)

import sklearn.linear_model as sk_linear_model
logistic_regression = sk_linear_model.LogisticRegression(solver="liblinear")

# Pre-processing
Really, the whole point of this notebook is to explore the various options provided by sklearn's preprocessing library

In [45]:
from sklearn import preprocessing

## Method 1: Encoding to ordinal variables

There are two major classes of categorical data, nominal and ordinal.

In any nominal categorical data attribute, there is no concept of ordering amongst the values of that attribute. e.g. movie genres, type of weather (sunny, rainy, cloudy), etc.

Ordinal categorical attributes have some sense or notion of order amongst its values. e.g. shirt sizes, education level etc.

From the blog:
`Our first method is changing every categorical feature to an ordinal one. The order will be selected randomly (for example, like the order in the dataset or in an alphabetical order). This method does not make much sense because if we are encoding New York as 1, Tehran as 2 and New Jersey as 3, our algorithm will assume that Tehran is more similar to New York than New Jersey.`

My thoughts:

Based on the dataset values, it does seem like the categorical features tend to be more nominal than ordinal, so I can't disagree with this statement.

In [46]:
train.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [47]:
X_train_ordinal = X_train.values
X_test_ordinal = X_test.values

We can encode each feature with `sklearn.preprocessing.LabelEncoder` - `Encode labels with value between 0 and n_classes-1.`

In [48]:
label_encodings = []
for i in range(X_train_ordinal.shape[1]):
    label_encoding = preprocessing.LabelEncoder()
    # We fit the label_encoding with our column of the categorical feature
    # we combine the ordinals so that we can "see" every feature otherwise labelencoder will be missing
    # some features (this should not lead to data leakage)
    combined_ordinal = np.concatenate([X_train_ordinal[:, i], X_test_ordinal[:, i]])
    label_encoding.fit(combined_ordinal.astype(str)) 
    label_encodings.append(label_encoding)
    
    # Then we also want to convert the features to their encodings
    X_train_ordinal[:, i] = label_encoding.transform(X_train_ordinal[:, i].astype(str))
    X_test_ordinal[:, i] = label_encoding.transform(X_test_ordinal[:, i].astype(str))
    
    # So we know the progress
    print (i/X_train_ordinal.shape[1], end=", ")

0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 

In [49]:
logistic_regression.fit(X_train_ordinal, y_train)
y_pred = logistic_regression.predict_proba(X_test_ordinal)
print(log_loss(y_test, y_pred))

0.4401909774421925


In [50]:
random_forest.fit(X_train_ordinal, y_train)
y_pred = random_forest.predict_proba(X_test_ordinal)
print(log_loss(y_test, y_pred))

0.4005714379710452


In [51]:
one_hot_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')

combined_ordinal = np.concatenate([X_train_ordinal, X_test_ordinal])

one_hot_encoder.fit(X_train_ordinal)
X_train_one_hot = one_hot_encoder.transform(X_train_ordinal)
X_test_one_hot = one_hot_encoder.transform(X_test_ordinal)

In [52]:
logistic_regression.fit(X_train_one_hot,y_train)
y_pred = logistic_regression.predict_proba(X_test_one_hot)
print(log_loss(y_test,y_pred))

0.3867051622724999


In [53]:
random_forest.fit(X_train_one_hot,y_train)
y_pred = random_forest.predict_proba(X_test_one_hot)
print(log_loss(y_test,y_pred))
print(X_train_one_hot.shape)

0.43581061217782285
(799839, 123110)


# Encoding rare values

In [60]:
X_train["C1"]

0         1002
1         1005
2         1005
3         1005
5         1005
          ... 
999994    1005
999995    1005
999997    1005
999998    1005
999999    1005
Name: C1, Length: 799839, dtype: int64

In [65]:
X_train["C1"].value_counts()[X_train["C1"]].values < 20

array([False, False, False, ..., False, False, False])

In [None]:
import copy
X_train_rare = copy.copy(X_train)
X_test_rare = copy.copy(X_test)
X_train_rare["test"] = 0
X_test_rare["test"] = 1


In [67]:
import copy
X_train_rare = copy.copy(X_train)
X_test_rare = copy.copy(X_test)
X_train_rare["test"]=0
X_test_rare["test"]=1
temp_df = pd.concat([X_train_rare,X_test_rare],axis=0)
names = list(X_train_rare.columns.values)
temp_df = pd.concat([X_train_rare,X_test_rare],axis=0)
for i in names:
    temp_df.loc[temp_df[i].value_counts()[temp_df[i]].values < 20, i] = "RARE_VALUE"
for i in range(temp_df.shape[1]):
    temp_df.iloc[:,i]=temp_df.iloc[:,i].astype('str')
X_train_rare = temp_df[temp_df["test"]=="0"].iloc[:,:-1].values
X_test_rare = temp_df[temp_df["test"]=="1"].iloc[:,:-1].values

label_encoders = []
for i in range(X_train_rare.shape[1]):
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(temp_df.iloc[:,:-1].iloc[:, i])
    label_encoders.append(label_encoder)
    X_train_rare[:, i] = label_encoder.transform(X_train_rare[:, i])
    X_test_rare[:, i] = label_encoder.transform(X_test_rare[:, i])
    
one_hot_encoder.fit(X_train_rare)
X_train_rare = one_hot_encoder.transform(X_train_rare)
X_test_rare = one_hot_encoder.transform(X_test_rare)

In [68]:
logistic_regression.fit(X_train_rare,y_train)
y_pred = logistic_regression.predict_proba(X_test_rare)
print(log_loss(y_test,y_pred))
random_forest.fit(X_train_rare,y_train)

y_pred = random_forest.predict_proba(X_test_rare)
print(log_loss(y_test,y_pred))
print(X_train_rare.shape)

0.38783155508577577
0.4230776412195197
(799839, 5264)
