Predict CTR with Avazu data using feature engineering and Random Forest
import pandas as pd
import numpy as np
The evaluation metric is log loss,

$$\mathrm{LogLoss} = -\frac{1}{N}\sum_{i=1}^{N}\left[y_i\log(p_i) + (1-y_i)\log(1-p_i)\right],$$

where $p_i$ is the model's predicted probability that observation $i$ is of class 1.
Compute AUC score by hand with the formula explained in class for the following dataset.
d = pd.DataFrame({
'prediction': [0.1, 0.5, 0.95, 0.99, 0.8, 0.4, 0.03, 0.44, 0.2],
'y': [1, 0, 1, 1, 1, 1, 0, 0, 0]})
d
| | prediction | y |
|---|---|---|
| 0 | 0.10 | 1 |
| 1 | 0.50 | 0 |
| 2 | 0.95 | 1 |
| 3 | 0.99 | 1 |
| 4 | 0.80 | 1 |
| 5 | 0.40 | 1 |
| 6 | 0.03 | 0 |
| 7 | 0.44 | 0 |
| 8 | 0.20 | 0 |
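The pairwise formula (presumably the one from class) says that AUC is the fraction of (positive, negative) pairs in which the positive example gets the higher score:

$$\mathrm{AUC} = \frac{1}{n_1\,n_0}\sum_{i:\,y_i=1}\;\sum_{j:\,y_j=0}\mathbb{1}\!\left[p_i > p_j\right],$$

where $n_1$ and $n_0$ are the numbers of positive and negative examples. The code below counts these concordant pairs by sorting on the predictions.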
# sort by predicted score, highest first
d.sort_values(by=["prediction"], ascending=False, inplace=True)
# total number of (positive, negative) pairs: n_1 * n_0
pairs_num = d.groupby('y').count()['prediction'].product()
# running count of positives ranked at or above each row
d['1_b4_0_count'] = np.cumsum(d['y'])
# summing that count over the negatives gives the concordant pairs
print("AUC =", sum(d[d["y"]==0]["1_b4_0_count"])/pairs_num)
d
AUC = 0.75
| | prediction | y | 1_b4_0_count |
|---|---|---|---|
| 3 | 0.99 | 1 | 1 |
| 2 | 0.95 | 1 | 2 |
| 4 | 0.80 | 1 | 3 |
| 1 | 0.50 | 0 | 3 |
| 7 | 0.44 | 0 | 3 |
| 5 | 0.40 | 1 | 4 |
| 8 | 0.20 | 0 | 4 |
| 0 | 0.10 | 1 | 5 |
| 6 | 0.03 | 0 | 5 |
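Hand check: reading 1_b4_0_count at the four rows with y = 0 gives 3 + 3 + 4 + 5 = 15 concordant pairs out of 5 × 4 = 20 possible pairs, so AUC = 15/20 = 0.75, matching the printed value.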
Implement a version of regularized mean encoding. We will be using the data from this Kaggle competition.
Process:
- Split the training data into training and validation sets. Take the last week (7 days) of the training set as validation.
- Implement regularized mean encoding for the training set using pandas.
- Implement mean encoding for the validation set.
## Split train and validation
# get sample data first
path = "avazu/"
!head -100000 $path/train > $path/train_sample.csv
!head -100000 $path/test > $path/test_sample.csv
data = pd.read_csv(path + "train_sample.csv")
test = pd.read_csv(path + "test_sample.csv")
def split_based_hour(data):
    """ Split data based on the column "hour".
    Use the last 20% of the data (by hour) for validation.
    Inputs:
        data: dataframe from avazu
    Returns:
        train: first 80% of the rows after sorting by "hour"
        val: last 20% of the rows, i.e. the largest values of "hour"
    """
    N = int(0.8*len(data))
    data = data.sort_values(by="hour")
    train = data[:N].copy()
    val = data[N:].copy()
    return train.reset_index(), val.reset_index()
train, val = split_based_hour(data)
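The 20% split above stands in for the last-week split described in the process. A literal last-7-days split could look like the sketch below; it assumes hour is encoded as YYMMDDHH (as in the Avazu files) and that the held-out window stays within one calendar month:

# hedged sketch: hold out the last 7 calendar days as validation
day = data['hour'] // 100            # YYMMDD part of the YYMMDDHH encoding
val_mask = day > day.max() - 7       # simple day arithmetic; only valid within one month
train_7d = data[~val_mask].copy()
val_7d = data[val_mask].copy()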
Here is how you do mean encoding without regularization.
# Calculate a mapping: {device_type: click_mean}
mean_device_type = train.groupby('device_type').click.mean()
mean_device_type
device_type
0 0.224277
1 0.176116
4 0.069777
5 0.083333
Name: click, dtype: float64
# This is the global click mean
global_mean = train.click.mean()
global_mean
0.17477718471480894
train["device_type_mean_enc"] = train["device_type"].map(mean_device_type)
val["device_type_mean_enc"] = val["device_type"].map(mean_device_type)
train["device_type_mean_enc"].fillna(global_mean, inplace=True)
val["device_type_mean_enc"].fillna(global_mean, inplace=True)
# Print correlation
encoded_feature = val["device_type_mean_enc"].values
print(np.corrcoef(val["click"].values, encoded_feature)[0][1])
0.0530389229998
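Computing each category's mean click on the same rows it is then applied to leaks the target into the feature; the K-fold scheme below is one way to regularize against that.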
To do mean encoding with K-fold regularization you do the following:
- Run a 5-fold split on the train data, where mean_device_type is computed on 4/5 of the data and the encoding is applied to the remaining 1/5.
- To compute the mean encoding on the validation data, use code similar to encoding without regularization: compute the means on all of the training data and apply them to the validation set.
from sklearn.model_selection import KFold

def reg_target_encoding(train, col="device_type", splits=5):
    """ Computes regularized mean encoding of col against click.
    Inputs:
        train: training dataframe (modified in place)
        col: column to encode
        splits: number of K-fold splits
    For each fold, the per-category click means are computed on the
    other folds and applied to the held-out fold.
    """
    train[col+"_mean_enc"] = np.nan
    kf = KFold(n_splits=splits)
    for trn_ind, val_ind in kf.split(train):
        mean_by_col = train.loc[trn_ind].groupby(col).click.mean()
        train.loc[val_ind, col+"_mean_enc"] = train.loc[val_ind, col].map(mean_by_col)
reg_target_encoding(train)
encoded_feature = train["device_type_mean_enc"].values
corr = np.corrcoef(train["click"].values, encoded_feature)[0][1]
assert(np.around(corr, decimals=4) == 0.0551)
def mean_encoding_test(test, train, col="device_type", splits=5):
    """ Computes mean encoding of col for test data.
    This mirrors how validation is handled: the training set gets the
    regularized (K-fold) encoding, while the test set gets means
    computed on all of the training data.
    """
    reg_target_encoding(train, col=col, splits=splits)
    mean_by_col = train.groupby(col).click.mean()
    test[col+"_mean_enc"] = test[col].map(mean_by_col)
mean_encoding_test(test, train)
encoded_feature_mean = test["device_type_mean_enc"].values.mean()
assert(np.around(encoded_feature_mean, decimals=4) == 0.177)
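Another common way to regularize mean encoding, not required by this exercise, is to smooth each category mean toward the global mean. A minimal sketch, where alpha is a hypothetical smoothing weight (larger alpha shrinks rare categories harder):

def smoothed_mean_encoding(train, col="device_type", alpha=10):
    # per-category mean and count of clicks
    agg = train.groupby(col).click.agg(['mean', 'count'])
    global_mean = train.click.mean()
    # shrink each category mean toward the global mean by weight alpha
    smoothed = (agg['mean'] * agg['count'] + global_mean * alpha) / (agg['count'] + alpha)
    return train[col].map(smoothed)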
- Implement a few more features, including:
  - day of the week and hour
  - mean encoding of some other features (at least two)
  - plots and value_counts() to understand the data
- Fit a random forest (to the whole dataset):
  - Do hyperparameter tuning using your validation set (see the sketch after the random forest fit below)
  - Report test and train log loss
def load_data():
path = "avazu/"
data = pd.read_csv(path + "train")
test = pd.read_csv(path + "test")
return data, test
types = {'id': np.uint32, 'click': np.uint8, 'hour': np.uint32, 'C1': np.uint32, 'banner_pos': np.uint32,
'site_id': 'category', 'site_domain': 'category', 'site_category': 'category', 'app_id': 'category',
'app_domain': 'category', 'app_category': 'category', 'device_id': 'category',
'device_ip': 'category', 'device_model': 'category', 'device_type': np.uint8, 'device_conn_type': np.uint8,
'C14': np.uint16, 'C15': np.uint16, 'C16': np.uint16, 'C17': np.uint16, 'C18': np.uint16, 'C19': np.uint16,
'C20': np.uint16, 'C21': np.uint16}
#data = pd.read_csv(path + "train", usecols=types.keys(), dtype=types)
data = pd.read_feather("data")
print(data.info(memory_usage='deep'))
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40428967 entries, 0 to 40428966
Data columns (total 24 columns):
id uint32
click uint8
hour uint32
C1 uint32
banner_pos uint32
site_id category
site_domain category
site_category category
app_id category
app_domain category
app_category category
device_id category
device_ip category
device_model category
device_type uint8
device_conn_type uint8
C14 uint16
C15 uint16
C16 uint16
C17 uint16
C18 uint16
C19 uint16
C20 uint16
C21 uint16
dtypes: category(9), uint16(8), uint32(4), uint8(3)
memory usage: 3.0 GB
None
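Loading with explicit unsigned-integer and category dtypes is what keeps this 40-million-row frame at roughly 3 GB; the default int64/object dtypes would need far more memory.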
# source: fastai
import re

def add_datepart(df, fldname, drop=True):
    """Add columns derived from a date column (Year, Month, Week, ...)."""
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt, n.lower())
    df[targ_pre+'Elapsed'] = fld.astype(np.int64) // 10**9
    if drop: df.drop(fldname, axis=1, inplace=True)
# "hour" is encoded as YYMMDDHH; prepend the century to get a full timestamp
data['date'] = 2000000000 + data['hour']
data['date'] = pd.to_datetime(data['date'], format='%Y%m%d%H')
add_datepart(data, 'date')
data['hour'] = data['hour'] % 100   # keep only the hour of day
drop_list = ['id', 'Year', 'Month', 'Week', 'Day',
             'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end',
             'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed']
data.drop(drop_list, axis=1, inplace=True)
hash_list = ['site_id','site_domain','site_category',
             'app_id','app_domain','app_category',
             'device_id','device_ip','device_model']
# per-column bucket counts for the hashing trick
mod_list = [10007, 10007, 97,
            10007, 997, 97,
            10000019, 10000019, 10007]
for i, c in enumerate(hash_list):
    # hash, take the modulo while still an integer, then downcast
    data[c] = (data[c].map(hash) % mod_list[i]).astype(np.float32)
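The hashing trick caps the cardinality of the very high-cardinality ID columns (device_ip, device_id, site_id, ...) so they can be fed to the random forest as plain numeric features; each entry of mod_list is the bucket count used for the corresponding column.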
def split_based_hour(data):
    """ Split data based on the column "hour".
    Use the last 20% of the data for validation.
    Inputs:
        data: dataframe from avazu (assumed to already be ordered by hour)
    Returns:
        train: first 80% of the rows
        val: last 20% of the rows, i.e. the largest values of "hour"
    """
    N = int(0.8*len(data))
    train = data[:N].copy()
    val = data[N:].copy()
    return train.reset_index(), val.reset_index()
train, val = split_based_hour(data)
mean_encoding_test(val, train, col = "device_type")
mean_encoding_test(val, train, col = "Dayofweek")
mean_encoding_test(val, train, col = "banner_pos")
mean_encoding_test(val, train, col = "site_category")
mean_encoding_test(val, train, col = "hour")
global_mean = train.click.mean()
train.fillna(global_mean, inplace=True)
val.fillna(global_mean, inplace=True)
train.to_feather('train')
val.to_feather('val')
%matplotlib inline
import matplotlib.pyplot as plt
# mean click rate by day of week
df = pd.DataFrame({'mean_click_dow': data["click"].groupby(data['Dayofweek']).mean()}).reset_index()
plt.plot(df.Dayofweek, df.mean_click_dow)
plt.show()
[plot: mean click rate by Dayofweek]
# mean click rate by hour of day
df = pd.DataFrame({'mean_click_hour': data["click"].groupby(data['hour']).mean()}).reset_index()
plt.plot(df.hour, df.mean_click_hour)
plt.show()
[plot: mean click rate by hour of day]
data.Dayofweek.value_counts()
1 9410217
2 9169734
3 8089690
6 3835892
5 3363122
4 3335302
0 3225010
Name: Dayofweek, dtype: int64
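Days 1-3 (Tuesday through Thursday) appear roughly twice as often as the other days, most likely because the 10-day training window covers those weekdays twice.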
data.banner_pos.value_counts()
0 29109590
1 11247282
7 43577
2 13001
4 7704
5 5778
3 2035
Name: banner_pos, dtype: int64
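Banner positions 0 and 1 account for the overwhelming majority of impressions, so the rarer positions carry little standalone signal; this may be why banner_pos is mean-encoded and the raw column is dropped from the feature matrix below.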
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, auc, roc_curve
rf = RandomForestClassifier(n_jobs=-1, min_samples_leaf=10)
X_train = train.drop(['click', 'index', 'device_type', 'site_category', 'app_category', 'banner_pos'], axis=1)
y_train = train['click']
X_val = val.drop(['click', 'index', 'device_type', 'site_category', 'app_category', 'banner_pos'], axis=1)
y_val = val['click']
rf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=10, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
pred_trn = rf.predict_proba(X_train)
pred = rf.predict_proba(X_val)
print('Training loss:',log_loss(y_train, pred_trn[:,1]) )
print('Testing loss:',log_loss(y_val, pred[:,1]) )
Training loss: 0.350652430816
Testing loss: 0.409515728913
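The assignment also calls for hyperparameter tuning on the validation set. A minimal sketch of a small grid over n_estimators and min_samples_leaf, reusing X_train, y_train, X_val, y_val from above (the grid values are illustrative, not prescribed by the assignment):

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

best_loss, best_params = np.inf, None
for n_estimators in [10, 40, 80]:              # illustrative grid
    for min_samples_leaf in [10, 50, 100]:
        m = RandomForestClassifier(n_estimators=n_estimators,
                                   min_samples_leaf=min_samples_leaf,
                                   n_jobs=-1)
        m.fit(X_train, y_train)
        loss = log_loss(y_val, m.predict_proba(X_val)[:, 1])
        if loss < best_loss:
            best_loss, best_params = loss, (n_estimators, min_samples_leaf)
print("Best validation log loss:", best_loss, "with (n_estimators, min_samples_leaf) =", best_params)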