See https://www.kaggle.com/kashnitsky/correct-time-aware-cross-validation-scheme/notebook

In [124]:
# Import libraries and set desired options
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler, Normalizer

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

Read training and test sets, sort train set by session start time.

In [3]:
times = ['time%s' % i for i in range(1, 11)]
train_df = pd.read_csv('../../../data/train_sessions.csv',
                       index_col='session_id', parse_dates=times)
test_df = pd.read_csv('../../../data/test_sessions.csv',
                      index_col='session_id', parse_dates=times)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Randomize column order
#train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [4]:
#times = ['time%s' % i for i in range(1, 11)]
#train_df = pd.read_csv('../../../data/train_sessions.csv',
#                       index_col='session_id', parse_dates=['time1'])
#test_df = pd.read_csv('../../../data/test_sessions.csv',
#                      index_col='session_id', parse_dates=['time1'])

# Sort the data by time
#train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
#train_df.head()

#### Transform data into format which can be fed into CountVectorizer

In [5]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)

In [6]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


#### Fit CountVectorizer and transform data with it.

In [7]:
%%time
#cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
cv = TfidfVectorizer(ngram_range=(1, 3), max_features=25000)

with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

CPU times: user 10.2 s, sys: 218 ms, total: 10.4 s
Wall time: 8.69 s


#### Save train targets into a separate vector.

In [8]:
y_train = train_df['target'].astype('int').values

#### We'll be performing time series cross-validation

We will be performing time series cross-validation, see sklearn TimeSeriesSplit and this dicussion on StackOverflow.

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html

https://stats.stackexchange.com/questions/14099/using-k-fold-cross-validation-for-time-series-model-selection

In [9]:
time_split = TimeSeriesSplit(n_splits=10)

In [10]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

#### Perform time series cross-validation with logistic regression.

In [11]:
logit2 = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [12]:
#logit2 = LogisticRegression(C=1, random_state=17, solver='lbfgs')

In [13]:
%%time
cv_scores = cross_val_score(logit2, X_train, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 56 ms, sys: 76.3 ms, total: 132 ms
Wall time: 2.91 s


In [14]:
cv_scores, cv_scores.mean() # 0.86759612238438

(array([0.83056662, 0.65513859, 0.87918137, 0.934713  , 0.85085091,
        0.88727046, 0.92170489, 0.87437891, 0.92875908, 0.92213521]),
 0.8684699030562978)

#### Now we'll add some time features: indicators of morning, day, evening and night.

In [15]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse,
                morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1),
                evening.values.reshape(-1, 1),
                night.values.reshape(-1, 1)
                ])
    return X

In [16]:
%%time
X_train_new = add_time_features(train_df, X_train)
X_test_new = add_time_features(test_df, X_test)

CPU times: user 2.32 s, sys: 151 ms, total: 2.47 s
Wall time: 1.2 s


In [17]:
X_train_new.shape, X_test_new.shape

((253561, 25004), (82797, 25004))

In [18]:
%%time
cv_scores = cross_val_score(logit2, X_train_new, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 741 ms, sys: 54.6 ms, total: 796 ms
Wall time: 2.83 s


In [19]:
cv_scores, cv_scores.mean() # 0.9242653781514694

(array([0.88175786, 0.8055756 , 0.93038788, 0.96538687, 0.91704813,
        0.95117894, 0.94760138, 0.93890404, 0.95189639, 0.95038456]),
 0.924012164292004)

#### One hot encode the year-month

Not using, seems to lower the cv score

In [20]:
def add_month_categories_feature(df, X_sparse):
    foo = pd.DataFrame(index=df.index)
    foo['start_month_category'] = train_df['time1'].apply(lambda ts: 
                                                      100 * ts.year + ts.month).astype('category')
    
    onehotencoder = OneHotEncoder(categories='auto')
    foo = onehotencoder.fit_transform(foo).toarray()

    X = hstack([X_sparse, foo])
    return X
    

In [21]:
%%time
X_train_newa = add_month_categories_feature(train_df, X_train_new)
X_test_newa = add_month_categories_feature(test_df, X_test_new)

CPU times: user 2.73 s, sys: 93.8 ms, total: 2.82 s
Wall time: 1.75 s


In [22]:
X_train_newa.shape, X_test_newa.shape

((253561, 25020), (82797, 25020))

In [23]:
%%time
cv_scores = cross_val_score(logit2, X_train_newa, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 924 ms, sys: 57.6 ms, total: 982 ms
Wall time: 3.73 s


In [24]:
cv_scores, cv_scores.mean() # DOWN from 0.9242653781514694

(array([0.86728402, 0.82995865, 0.91167075, 0.96715622, 0.91780679,
        0.94957089, 0.95071005, 0.9365755 , 0.94620939, 0.95139473]),
 0.9228337004673763)

#### Add start month feature

In [25]:
def add_start_month_feature(df, X_sparse):
    foo = pd.DataFrame(index=df.index)
    bar = df['time1'].apply(lambda ts: 100 * ts.year + ts.month).astype('float64')
    foo['scaled_month'] = StandardScaler().fit_transform(bar.values.reshape(-1, 1))
    
    X = hstack([X_sparse, foo])
    return X

In [26]:
%%time
X_train_new2 = add_start_month_feature(train_df, X_train_new)
X_test_new2 = add_start_month_feature(test_df, X_test_new)

CPU times: user 2.28 s, sys: 38.3 ms, total: 2.31 s
Wall time: 1.17 s


In [27]:
X_train_new2.shape, X_test_new2.shape

((253561, 25005), (82797, 25005))

In [28]:
%%time
cv_scores = cross_val_score(logit2, X_train_new2, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 811 ms, sys: 55.4 ms, total: 867 ms
Wall time: 3.37 s


In [29]:
cv_scores, cv_scores.mean() # 0.9258065597138293

(array([0.88256176, 0.8090879 , 0.95396728, 0.96740806, 0.91783659,
        0.9509222 , 0.95009877, 0.93827155, 0.95146144, 0.95042744]),
 0.9272042993049274)

#### Add session length feature

In [118]:
def add_session_length_feature(df, X_sparse):    
    foo = pd.DataFrame(index=df.index)
    foo['min_seconds'] = df[times].min(axis=1)
    foo['max_seconds'] = df[times].max(axis=1)
    foo['seconds'] = (foo['max_seconds'] - foo['min_seconds']) / np.timedelta64(1, 's')

    #foo['scaled_session_duration_seconds'] = Normalizer().fit_transform(foo['seconds'].values.reshape(-1, 1))
    foo['scaled_session_duration_seconds'] = StandardScaler().fit_transform(foo['seconds'].values.reshape(-1, 1))
    #foo['scaled_session_duration_seconds'] = RobustScaler().fit_transform(foo['seconds'].values.reshape(-1, 1))
    #print(foo['scaled_session_duration_seconds'].mean())

    #foo['lt_40_secs'] = (foo['seconds'] < 40)
    #foo['lt_40_secs'] = foo['lt_40_secs'].astype('float64')
    
    foo = foo.drop(columns=['min_seconds', 'max_seconds', 'seconds'])
    
    X = hstack([X_sparse, foo])
    return X
    #return foo

In [119]:
%%time
X_train_new3 = add_session_length_feature(train_df, X_train_new2)
X_test_new3 = add_session_length_feature(test_df, X_test_new2)

CPU times: user 535 ms, sys: 55.4 ms, total: 591 ms
Wall time: 166 ms


In [120]:
X_train_new3.shape, X_test_new3.shape

((253561, 25006), (82797, 25006))

In [121]:
%%time
cv_scores = cross_val_score(logit2, X_train_new3, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 153 ms, sys: 49.5 ms, total: 203 ms
Wall time: 3.72 s


In [122]:
cv_scores, cv_scores.mean() # 0.927079412339743

(array([0.87020336, 0.81228317, 0.95418922, 0.96781729, 0.91913952,
        0.95233405, 0.95190445, 0.9392787 , 0.95284822, 0.95079614]),
 0.927079412339743)

#### Add day of week feature

Does not seem to help. Also tried limiting it to day 2, but hurt more than helped.

In [83]:
def add_day_of_week_feature(df, X_sparse):
    foo = pd.DataFrame(index=df.index)
    foo['day_of_week'] = train_df['time1'].apply(lambda ts: ts.dayofweek).astype('int')

    #foo['day_of_week'] = (foo['day_of_week'] == 6)
    
    onehotencoder = OneHotEncoder(categories='auto')
    foo = onehotencoder.fit_transform(foo).toarray()

    X = hstack([X_sparse, foo])
    return X
    

In [84]:
%%time
X_train_new4 = add_day_of_week_feature(train_df, X_train_new3)
X_test_new4 = add_day_of_week_feature(test_df, X_test_new3)

CPU times: user 2.81 s, sys: 118 ms, total: 2.93 s
Wall time: 1.85 s


In [85]:
X_train_new4.shape, X_test_new4.shape

((253561, 25013), (82797, 25013))

In [86]:
%%time
cv_scores = cross_val_score(logit2, X_train_new4, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 164 ms, sys: 55.1 ms, total: 219 ms
Wall time: 4.66 s


In [87]:
cv_scores, cv_scores.mean() # 0.9089179161064864

(array([0.80252005, 0.86690179, 0.8867947 , 0.97702662, 0.93897972,
        0.97181253, 0.9365308 , 0.95907612, 0.78338988, 0.96614697]),
 0.9089179161064864)

#### Add top Alice site feature

In [125]:
# Load websites dictionary
with open(r"../../../data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Create dataframe for the dictionary
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])
print(u'Websites total:', sites_dict.shape[0])
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [313]:
# Can probably delete this
#fr_sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])
#print(u'Fr Websites total:', fr_sites_dict.shape[0])
#fr_sites_dict.head()

is_fr_sites_dict = pd.DataFrame(index=sites_dict.index)
is_fr_sites_dict['site'] = sites_dict['site'].str.endswith('.fr').astype('float64')

is_fr_sites_dict.head()

Unnamed: 0,site
25075,0.0
13997,0.0
42436,0.0
30911,0.0
8104,0.0


In [262]:
sites_dict[sites_dict.index == 25075]['site'].values[0].endswith('.fr')

False

In [241]:
alice_sites = train_df[train_df['target'] == 1]
top_alice_sites = pd.Series(alice_sites.values.flatten()).value_counts().sort_values(ascending=False).head(50)
top_alice_sites_set = set(top_alice_sites)
len(top_alice_sites_set)

45

In [346]:
def get_foo(x):
    #print(sites_dict[sites_dict.index == x]['site'].values)
    z = is_fr_sites_dict[is_fr_sites_dict.index == x]['site'].values
    if (len(z) == 0):
        return 0.0
    return z[0]
    

def add_is_top_site_feature(df, X_sparse):
    foo = pd.DataFrame(index=df.index)
    
    foo[sites] = df[sites].fillna(0).astype(np.uint16)
    foo[sites] = foo[sites].applymap(get_foo)
    #foo['site1'] = df['site1'].fillna(0).astype(np.uint16)
    #foo['site1'] = foo['site1'].apply(get_foo)

    X = hstack([X_sparse, foo])
    return X

In [347]:
%%time
X_train_new4 = add_is_top_site_feature(train_df, X_train_new3)
X_test_new4 = add_is_top_site_feature(test_df, X_test_new3)

CPU times: user 14min 39s, sys: 1 s, total: 14min 40s
Wall time: 18min 44s


In [348]:
X_train_new4.shape, X_test_new4.shape

((253561, 25016), (82797, 25016))

In [349]:
%%time
cv_scores = cross_val_score(logit2, X_train_new4, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 966 ms, sys: 95.4 ms, total: 1.06 s
Wall time: 6.24 s


In [350]:
cv_scores, cv_scores.mean()

(array([0.84342366, 0.82640846, 0.95151795, 0.96595675, 0.92411766,
        0.94878433, 0.95101408, 0.936427  , 0.95086991, 0.95262085]),
 0.9251140646629707)

In [304]:
#def get_foo(x):
#    return sites_dict[sites_dict.index == x]['site'].values[0].endswith('.fr')

def get_foo(x):
    #print(sites_dict[sites_dict.index == x]['site'].values)
    z = sites_dict[sites_dict.index == x]['site'].values
    if (len(z) == 0):
        return 0
    
    return z[0].endswith('fr')
    #vfunc = np.vectorisites_dict[sites_dict.index == x]['site'].valuesze(lambda y: y.endswith('fr'))
    #print (vfunc())
    
    #z = sites_dict[sites_dict.index == x]['site'].values.vectorize(lambda y: y.endswith('fr'))
    #return z
    #return sites_dict[sites_dict.index == x]['site'].values[0].endswith('.fr')

def add_is_top_site_feature(df, X_sparse):
    foo = pd.DataFrame(index=df.index)
    
    foo[sites] = df[sites].fillna(0).astype(np.uint16)
    
    #foo[sites] = foo[sites].isin(top_alice_sites_set).astype('float64')

    #sites_dict[sites_dict.index == 25075]['site'].values[0].endswith('.fr')
    #foo[sites] = foo[sites].applymap(lambda x: sites_dict[sites_dict.index == x]['site'].values[0].endswith('.fr'))
    #foo[sites] = foo[sites].applymap(lambda x: sites_dict[sites_dict.index == x]['site'].values[0].endswith('.fr'))
    
    #foo[sites] = foo[sites].apply(lambda x: sites_dict[sites_dict.index == x]['site'].values[0].endswith('.fr'))
    
    foo[sites] = foo[sites].applymap(get_foo)
    print(foo)

    
    #print(foo)
    #return foo
    X = hstack([X_sparse, foo])
    return X

In [305]:
%%time
X_train_new4 = add_is_top_site_feature(train_df, X_train_new3)
X_test_new4 = add_is_top_site_feature(test_df, X_test_new3)

KeyboardInterrupt: 

In [244]:
X_train_new4.shape, X_test_new4.shape

((253561, 25016), (82797, 25016))

In [245]:
%%time
cv_scores = cross_val_score(logit2, X_train_new4, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 160 ms, sys: 62.2 ms, total: 222 ms
Wall time: 4.93 s


In [246]:
cv_scores, cv_scores.mean()

(array([0.86647533, 0.81369224, 0.95455752, 0.96761322, 0.91887048,
        0.95220944, 0.95167442, 0.9399459 , 0.95232555, 0.95038938]),
 0.9267753481198181)

#### Now we tune regularization parameter C.

In [35]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit2, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1)

In [36]:
%%time
logit_grid_searcher.fit(X_train_new3, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.9min finished


CPU times: user 11min 13s, sys: 3.94 s, total: 11min 17s
Wall time: 1min 56s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [37]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_ # 0.9277596037183831

(0.9279511791368529, {'C': 1.6681005372000592})

In [None]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new3)[:, 1]
write_to_submission_file(logit_test_pred3, 'submissions/09-subm1.csv') # aaa0.94386