In [10]:
%cd /notebooks
datadir = "data/kaggle_catch-me-if-you-can/"

/notebooks


In [30]:
from __future__ import division, print_function
import warnings
warnings.filterwarnings('ignore')
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn import feature_extraction
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack


In [11]:
train = pd.read_csv(datadir + 'train_sessions.csv', index_col='session_id')
test = pd.read_csv(os.path.join(datadir, 'test_sessions.csv'), index_col='session_id')

In [12]:
# combine the two
sum_data = pd.concat([train, test])

In [13]:
# misc pre-processing
site_cols = ['site%d' % i for i in range(1, 11)]
time_cols = ['time%d' % i for i in range(1, 11)]

sum_data[site_cols] = sum_data[site_cols].fillna(0).astype(np.int).astype(np.str)
sum_data[time_cols] = sum_data[time_cols].apply(pd.to_datetime)

In [14]:
sum_data

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time1,time10,time2,time3,time4,time5,time6,time7,time8,time9
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,2014-02-20 10:02:45,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:50,2014-02-22 11:20:16,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15
3,14769,14768,39,14768,14769,37,39,14768,14768,14768,...,2013-12-16 16:40:17,2013-12-16 16:40:24,2013-12-16 16:40:18,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:20,2013-12-16 16:40:21,2013-12-16 16:40:22
4,782,782,782,782,782,782,782,782,782,782,...,2014-03-28 10:52:12,2014-03-28 10:56:42,2014-03-28 10:52:42,2014-03-28 10:53:12,2014-03-28 10:53:42,2014-03-28 10:54:12,2014-03-28 10:54:42,2014-03-28 10:55:12,2014-03-28 10:55:42,2014-03-28 10:56:12
5,22,178,177,175,178,177,178,175,177,177,...,2014-02-28 10:53:05,2014-02-28 10:57:11,2014-02-28 10:55:22,2014-02-28 10:55:22,2014-02-28 10:55:23,2014-02-28 10:55:23,2014-02-28 10:55:59,2014-02-28 10:55:59,2014-02-28 10:55:59,2014-02-28 10:57:06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82793,812,0,1039,676,0,0,0,0,0,0,...,2014-10-02 18:20:09,NaT,2014-10-02 18:20:09,2014-10-02 18:20:09,NaT,NaT,NaT,NaT,NaT,NaT
82794,300,1216,302,302,300,300,1222,302,1218,1221,...,2014-05-26 14:16:40,2014-05-26 14:17:19,2014-05-26 14:16:41,2014-05-26 14:16:44,2014-05-26 14:16:44,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19
82795,29,6780,33,35,22,37,6779,30,21,23,...,2014-05-02 11:21:56,2014-05-02 11:22:04,2014-05-02 11:21:56,2014-05-02 11:21:56,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:04,2014-05-02 11:22:04
82796,5828,961,23,21,804,21,3350,23,894,21,...,2014-05-03 10:05:25,2014-05-03 10:05:38,2014-05-03 10:05:27,2014-05-03 10:05:27,2014-05-03 10:05:27,2014-05-03 10:05:36,2014-05-03 10:05:37,2014-05-03 10:05:37,2014-05-03 10:05:38,2014-05-03 10:05:38


In [16]:
# create a discharged matrix in the form of a "word bag" on the sites
def join_str(row):
    return ' '.join(row)

site_text_data = sum_data[site_cols].apply(join_str, axis=1)
print('Number of sessions: {}'.format(site_text_data.shape[0]))

Number of sessions: 336358


In [18]:
vectorizer = feature_extraction.text.TfidfVectorizer()
sum_data_site_sparse = vectorizer.fit_transform(site_text_data)

In [19]:
print('Sparse matrix dimensions: {}'.format(sum_data_site_sparse.shape))

Matrix dimensions: (336358, 48362)


### Create new properties
#### session_timespan
#### unique_sites
#### day_of_week
#### start_hour

In [20]:
def calc_session_timespan(row):
    timestamps = row[time_cols].values
    session_timespan = timestamps.max() - timestamps.min()
    
    return session_timespan.total_seconds()

def calc_unique_sites(row):
    sites_vals = row[site_cols].values
    
    return len(np.unique([a for a in sites_vals if int(a) > 0]))
    
def calc_day_of_week(row):
    timestamps = row[time_cols].values    
    return timestamps.min().weekday()

def calc_start_hour(row):
    timestamps = row[time_cols].values    
    return timestamps.min().hour

def calc_end_hour(row):
    timestamps = row[time_cols].values    
    return timestamps.max().hour

def calc_day_of_month(row):
    timestamps = row[time_cols].values    
    return timestamps.min().day

def calc_month(row):
    timestamps = row[time_cols].values    
    return timestamps.min().month

def calc_is_weekend(row):
    day_of_week = row['day_of_week']
    if day_of_week == 6 or day_of_week == 5:
        return 1
    
    return 0

#### next two run long!

In [21]:
%%time
sum_data['unique_sites'] = sum_data.apply(calc_unique_sites, axis=1)
sum_data['session_timespan'] = sum_data.apply(calc_session_timespan, axis=1)
sum_data['day_of_week'] = sum_data.apply(calc_day_of_week, axis=1)
sum_data['start_hour'] = sum_data.apply(calc_start_hour, axis=1)

CPU times: user 29min 26s, sys: 11.4 s, total: 29min 38s
Wall time: 29min 44s


In [22]:
%%time
sum_data['end_hour'] = sum_data.apply(calc_end_hour, axis=1)
sum_data['month'] = sum_data.apply(calc_month, axis=1)
sum_data['day_of_month'] = sum_data.apply(calc_day_of_month, axis=1)
sum_data['is_weekend'] = sum_data.apply(calc_is_weekend, axis=1)

CPU times: user 15min 40s, sys: 3.29 s, total: 15min 43s
Wall time: 15min 48s


In [23]:
# identify bad data
def print_empty_cell(collection, name):
    total_row = collection.shape[0]
    data_count = collection.count().sort_values(ascending=True)

    i = 0
    str_val = []
    for item, value in data_count.items():
        if value < total_row:
            str_val.append("{}:{}".format(item, total_row - value))
            i += 1

    if i > 0:
        print("--> invalid features in {}:".format(name))
        for s in str_val:
            print(s)
    else:
        print("--> success data in {}:".format(name))
        
print_empty_cell(sum_data, 'sum_data')

--> invalid features in sum_data:
target:82797
month:30500
end_hour:30500
start_hour:30500
day_of_week:30500
session_timespan:30500
day_of_month:30500
time10:30500
time9:27790
time8:24983
time7:22221
time6:19297
time5:16188
time4:12855
time3:9364
time2:4952


In [24]:
# clean it and take a look
sum_data['day_of_week'] = sum_data['day_of_week'].fillna(round(sum_data['day_of_week'].mean())).astype(np.int)
sum_data['start_hour'] = sum_data['start_hour'].fillna(round(sum_data['start_hour'].mean())).astype(np.int)
sum_data['end_hour'] = sum_data['end_hour'].fillna(round(sum_data['end_hour'].mean())).astype(np.int)
sum_data['month'] = sum_data['month'].fillna(round(sum_data['month'].mean())).astype(np.int)
sum_data['day_of_month'] = sum_data['day_of_month'].fillna(round(sum_data['day_of_month'].mean())).astype(np.int)
sum_data['session_timespan'] = sum_data['session_timespan'].fillna(round(sum_data['session_timespan'].mean())).astype(np.int)
sum_data['start_site'] = sum_data['site1'].astype(np.int)
sum_data['is_weekend'] = sum_data['is_weekend'].fillna(round(sum_data['is_weekend'].mean())).astype(np.int)
sum_data

#### Create categoricals

In [25]:
sum_data.shape

(336358, 30)

In [26]:
sum_data = pd.get_dummies(sum_data, columns=['day_of_week', 'start_hour', 'end_hour', 'month', 'day_of_month'])
sum_data.shape

(336358, 103)

In [27]:
day_of_week_cols = sum_data.filter(like='day_of_week').columns
start_hour_cols = sum_data.filter(like='start_hour').columns
end_hour_cols = sum_data.filter(like='end_hour').columns
day_of_month_cols = sum_data.filter(like='day_of_month').columns
month_cols = ['month_1','month_2','month_3','month_4','month_5','month_6',
              'month_7','month_8','month_9','month_10','month_11','month_12']
print("day_of_week_cols\n", day_of_week_cols)
print("start_hour_cols\n", start_hour_cols)
print("end_hour_cols\n", end_hour_cols)
print("day_of_month_cols\n", day_of_month_cols)
print("month_cols\n", month_cols)

Index(['day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
       'day_of_week_4', 'day_of_week_5', 'day_of_week_6'],
      dtype='object')
Index(['start_hour_7', 'start_hour_8', 'start_hour_9', 'start_hour_10',
       'start_hour_11', 'start_hour_12', 'start_hour_13', 'start_hour_14',
       'start_hour_15', 'start_hour_16', 'start_hour_17', 'start_hour_18',
       'start_hour_19', 'start_hour_20', 'start_hour_21', 'start_hour_22',
       'start_hour_23'],
      dtype='object')
Index(['end_hour_7', 'end_hour_8', 'end_hour_9', 'end_hour_10', 'end_hour_11',
       'end_hour_12', 'end_hour_13', 'end_hour_14', 'end_hour_15',
       'end_hour_16', 'end_hour_17', 'end_hour_18', 'end_hour_19',
       'end_hour_20', 'end_hour_21', 'end_hour_22', 'end_hour_23'],
      dtype='object')
Index(['day_of_month_1', 'day_of_month_2', 'day_of_month_3', 'day_of_month_4',
       'day_of_month_5', 'day_of_month_12', 'day_of_month_13',
       'day_of_month_14', 'day_of_month_15', 'day_of_mo

In [28]:
# more features
additional_cols = np.hstack((['unique_sites', 'start_site', 'session_timespan', 'is_weekend'],
                             day_of_week_cols,
                             start_hour_cols,
                             end_hour_cols,
                             day_of_month_cols,
                             month_cols
                            ))

In [29]:
# write temp
# sum_data.to_csv("sum_data.csv")

In [31]:
standard_scaler = StandardScaler()
scaler_sum_data_2 = standard_scaler.fit_transform(sum_data[additional_cols])

CPU times: user 1.19 s, sys: 839 ms, total: 2.03 s
Wall time: 2.04 s


In [33]:
%%time
additional_data = csr_matrix(scaler_sum_data_2)
print('additional_data shape: {}'.format(additional_data.shape))
print('sum_data_site_sparse shape: {}'.format(sum_data_site_sparse.shape))
print('sum_data shape: {}'.format(sum_data.shape))

additional_data shape: (336358, 82)
sum_data_site_sparse shape: (336358, 48362)
sum_data shape: (336358, 103)
CPU times: user 1.9 s, sys: 2.1 s, total: 4 s
Wall time: 5.01 s


## blows kernel

In [None]:
# Combine additional_data, sum_data_site_sparse and select the training and test sets
temp = hstack((sum_data_site_sparse, additional_data))
print('combined shape: {}'.format(temp.shape))

In [None]:
X_train = temp.tocsc()[:train.shape[0]]
y_train = train['target']
X_test = temp.tocsc()[train.shape[0]:]

print("X_train.shape =", X_train.shape, "y_train.shape =", y_train.shape,
      "X_test.shape =", X_test.shape)

X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(X_train, y_train, test_size=0.3, random_state=17)
print("X_train_tmp.shape =", X_train_tmp.shape, "y_train_tmp.shape =", y_train_tmp.shape,
      "X_test_tmp.shape =", X_test_tmp.shape, "y_test_tmp.shape =", y_test_tmp.shape)

sgd_logit = SGDClassifier(loss='log', random_state=17, n_jobs=-1)

sgd_logit.fit(X_train_tmp, y_train_tmp)

y_pred = sgd_logit.predict_proba(X_test_tmp)[:, 1]

roc = roc_auc_score(y_test_tmp, y_pred)
print('SGDClassifier ROC AUC: {}'.format(round(roc, 4))) # ROC AUC: 0.9705

sgd_logit.fit(X_train, y_train)
y_pred = sgd_logit.predict_proba(X_test)[:, 1]

def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

write_to_submission_file(y_pred, catchmedir + 'SGDClassifier_y_pred.CSV') # 0.85692 on kaggle

reg_logit = LogisticRegression(random_state=17, n_jobs=-1, max_iter=200)
reg_logit.fit(X_train_tmp, y_train_tmp)

y_pred = reg_logit.predict_proba(X_test_tmp)[:, 1]

roc = roc_auc_score(y_test_tmp, y_pred)

print('LogisticRegression ROC AUC: {}'.format(round(roc, 4))) # ROC AUC: 0.9882

reg_logit.fit(X_train, y_train)
y_pred = reg_logit.predict_proba(X_test)[:, 1]

write_to_submission_file(y_pred, catchmedir + 'LogisticRegression_y_pred.CSV') # 0.88027 on kaggle
