In [None]:
import warnings
# Suppress every FutureWarning for the rest of the notebook session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import eli5
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from IPython.display import display_html
from sklearn.metrics import confusion_matrix

In [None]:
# Directory from which train.csv, test.csv and site_dic.pkl are read below.
PATH_TO_DATA = r'/kaggle/input/open-ml-course-linear-models-spring22/'
# Random seed; passed as random_state to the final LogisticRegression.
SEED = 241

In [None]:
def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    """Load the session data and turn the visited sites into TF-IDF features.

    Parameters
    ----------
    path_to_train, path_to_test : paths of the CSV files; both are read with
        ``session_id`` as index and ``time1``..``time10`` parsed as dates.
    path_to_site_dict : path of the pickled site -> id dictionary.
    vectorizer_params : keyword arguments forwarded to ``TfidfVectorizer``.
        Site names contain dots, so the caller is expected to supply a
        tokenizer that splits on whitespace only.

    Returns
    -------
    X_train, X_test : TF-IDF matrices (vectorizer fitted on train only).
    y_train : integer array taken from the train ``target`` column.
    vectorizer : the fitted ``TfidfVectorizer``.
    train_times, test_times : the ``time1``..``time10`` columns of each frame.

    Only the train frame is sorted by ``time1``; the test frame keeps file order.
    """
    time_columns = ['time%s' % k for k in range(1, 11)]
    site_columns = ['site%s' % k for k in range(1, 11)]

    def read_sessions(csv_path):
        # Same loading options for train and test.
        return pd.read_csv(csv_path, index_col='session_id', parse_dates=time_columns)

    train_df = read_sessions(path_to_train).sort_values(by='time1')
    test_df = read_sessions(path_to_test)

    # NOTE(security): pickle.load can execute arbitrary code; only use a trusted file here.
    with open(path_to_site_dict, 'rb') as dict_file:
        site2id = pickle.load(dict_file)
    # Invert to id -> site; id 0 (used for missing site slots) is named "unknown".
    id2site = {site_id: site_name for site_name, site_id in site2id.items()}
    id2site[0] = 'unknown'

    def sessions_as_text(frame):
        # One whitespace-separated string of site names per session. Names rather than
        # ids are used so that model weights are easier to interpret later.
        site_ids = frame[site_columns].fillna(0).astype('int')
        return site_ids.apply(
            lambda visited: ' '.join([id2site[site_id] for site_id in visited]),
            axis=1,
        ).tolist()

    train_sessions = sessions_as_text(train_df)
    test_sessions = sessions_as_text(test_df)

    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values

    # Visit timestamps are handed back for later feature engineering.
    train_times = train_df[time_columns]
    test_times = test_df[time_columns]

    return X_train, X_test, y_train, vectorizer, train_times, test_times

### TF-IDF

In [None]:
%%time
# Build the TF-IDF site features for train and test, together with the labels,
# the fitted vectorizer and the per-session visit times.
X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    # Uni- to tri-grams of sites, vocabulary capped at 30000 entries; the tokenizer
    # splits on whitespace only, so dotted site names stay intact.
    vectorizer_params={'ngram_range': (1, 3),
                       'max_features': 30000, 
                       'tokenizer': lambda s: s.split()}
)

In [None]:
# Site -> id dictionary shipped with the data, and its id -> site inverse.
sites_dict = pd.read_pickle(os.path.join(PATH_TO_DATA, 'site_dic.pkl'))
sites_dict_inv = dict(zip(sites_dict.values(), sites_dict.keys()))

# Column selections: the ten site slots (test), and the same plus the label (train).
sites_test = ['site%s' % i for i in range(1, 11)]
sites_train = sites_test + ['target']

# Train sessions are ordered by their first timestamp before the columns are narrowed;
# missing site slots become 0 and every kept column is cast to int.
train_sites = (
    pd.read_csv(os.path.join(PATH_TO_DATA, 'train.csv'),
                index_col='session_id',
                parse_dates=['time%s' % i for i in range(1, 11)])
    .sort_values(by='time1')[sites_train]
    .fillna(0)
    .astype('int')
)
# Test sessions keep their file order.
test_sites = (
    pd.read_csv(os.path.join(PATH_TO_DATA, 'test.csv'),
                index_col='session_id',
                parse_dates=['time%s' % i for i in range(1, 11)])[sites_test]
    .fillna(0)
    .astype('int')
)

In [None]:
# Names of the engineered (non-site) features; extended after each feature group is added.
features_name = []
# Collects the per-feature DataFrames that the add_* functions append when add_feat is true.
corr_df = []

In [None]:
# 10-split time-series cross-validation; used as the default `cv` of train_and_predict.
time_split = TimeSeriesSplit(n_splits=10)

In [None]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Save predictions as a CSV file with a 1-based id column.

    Parameters
    ----------
    predicted_labels : array with a ``shape`` attribute; one prediction per row.
    out_file : destination path (or buffer) handed to ``DataFrame.to_csv``.
    target : header of the prediction column.
    index_label : header of the id column; ids run from 1 to the number of rows.
    """
    row_count = predicted_labels.shape[0]
    row_ids = np.arange(1, row_count + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [None]:
def train_and_predict(model, X_train, y_train, X_test, site_feature_names=None,
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    """Cross-validate and fit ``model``, show its weights and write a submission file.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict``, ``predict_proba`` and ``coef_``.
    X_train, y_train : training features and labels.
    X_test : test features; column 1 of ``predict_proba`` on them is written out.
    site_feature_names : names of the site (TF-IDF) columns. When ``None`` they are
        read from the notebook-level ``vectorizer`` at call time, so the names always
        match the currently fitted vectorizer.
    new_feature_names : names of the extra columns appended after the site columns.
    cv, scoring : forwarded to ``cross_val_score``.
    top_n_features_to_show : number of weights displayed by eli5.
    submission_file_name : path of the CSV file to write.

    Returns
    -------
    (proba, y_train, predicted, table_confusion, cv_scores) where ``proba`` and
    ``predicted`` are computed on ``X_train`` (in-sample) and ``table_confusion`` is
    the confusion matrix of ``y_train`` against ``predicted``.
    """
    # Imported under a private alias: notebook cells may rebind the global name
    # `confusion_matrix` to a result array, which would break later calls.
    from sklearn.metrics import confusion_matrix as _compute_confusion_matrix

    if site_feature_names is None:
        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer get_feature_names_out() and fall back for old versions.
        names_getter = (getattr(vectorizer, 'get_feature_names_out', None)
                        or vectorizer.get_feature_names)
        site_feature_names = names_getter()

    has_new_features = new_feature_names is not None and len(new_feature_names) > 0

    # Normalise to plain lists so that array-valued names concatenate correctly.
    all_feature_names = list(site_feature_names)
    if has_new_features:
        all_feature_names += list(new_feature_names)

    cv_scores = cross_val_score(model, X_train, y_train, cv=cv,
                                scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)

    display_html(eli5.show_weights(estimator=model,
                  feature_names=all_feature_names, top=top_n_features_to_show))

    if has_new_features:
        print('New feature weights:')
        # The extra features occupy the last columns, hence the trailing coefficients.
        print(pd.DataFrame({'feature': list(new_feature_names),
                            'coef': model.coef_.flatten()[-len(new_feature_names):]}))

    proba = model.predict_proba(X_train)
    predicted = model.predict(X_train)
    table_confusion = _compute_confusion_matrix(y_train, predicted)
    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name)

    return (proba, y_train, predicted, table_confusion, cv_scores)

# Time Features

In [None]:
# Encode each session start as hour * 100 + tens of minutes, e.g. 09:47 -> 904.
session_start_hour = train_times['time1'].apply(
    lambda stamp: stamp.hour * 100 + stamp.minute // 10
).values

In [None]:
plt.subplots(nrows=1, figsize=(20, 7))

# Count of encoded session start times, restricted to sessions with label 1.
sns.countplot(
    data=pd.DataFrame(session_start_hour[y_train == 1], columns=['time1']),
    x='time1',
)
plt.title("Alice")
plt.xlabel('Session start hour')

In [None]:
def add_time_features(times, X_sparse, add_feat = True):
    """Stack hand-picked time-of-day bucket columns onto a feature matrix.

    The session start ``times['time1']`` is encoded as ``(100 * hour + minute) / 1000``
    (09:05 -> 0.905). Every bucket column is zero outside its interval(s); inside, it
    holds the encoded start time, except ``evening_2`` which is a plain boolean flag.
    All interval bounds are inclusive.

    Parameters
    ----------
    times : DataFrame with a ``time1`` column of timestamps.
    X_sparse : matrix that the new columns are appended to via ``hstack``.
    add_feat : when true, one single-column DataFrame per bucket is appended to the
        module-level ``corr_df`` list.

    Returns
    -------
    (X, feature_names) : the stacked matrix and the bucket names in column order.
    """
    start_code = times['time1'].apply(lambda stamp: stamp.hour * 100 + stamp.minute) / 1000

    def in_range(low, high):
        # Inclusive on both ends.
        return (start_code >= low) & (start_code <= high)

    def as_column(series):
        # (n,) Series -> (n, 1) array so it can be stacked as one column.
        return series.values.reshape(-1, 1)

    # Insertion order of this dict is the column order in the output.
    buckets = {
        'morning_1': as_column((in_range(0.901, 0.904) | in_range(0.922, 1.209)).astype('int') * start_code),
        'morning_2': as_column(in_range(0.905, 0.921).astype('int') * start_code),
        'day_1': as_column(in_range(1.210, 1.239).astype('int') * start_code),
        'day_2': as_column(in_range(1.240, 1.335).astype('int') * start_code),
        'day_3': as_column(in_range(1.336, 1.358).astype('int') * start_code),
        'day_4': as_column(in_range(1.359, 1.517).astype('int') * start_code),
        'day_5': as_column(in_range(1.518, 1.553).astype('int') * start_code),
        'evening_1': as_column((in_range(1.554, 1.629) | in_range(1.705, 1.755)) * start_code),
        # Boolean indicator only: this one is not multiplied by the encoded time.
        'evening_2': as_column(in_range(1.653, 1.704)),
        'evening_3': as_column((in_range(1.756, 1.828) | in_range(1.626, 1.656)) * start_code),
        'night': as_column((in_range(1.829, 2.359) | in_range(0, 0.900)) * start_code),
    }
    feature_names = list(buckets)

    if add_feat:
        for bucket_name in feature_names:
            corr_df.append(pd.DataFrame(buckets[bucket_name], columns=[bucket_name]))

    X = hstack([X_sparse] + [buckets[bucket_name] for bucket_name in feature_names])
    return X, feature_names

In [None]:
# Append the time-of-day columns to the TF-IDF site matrices. Only the train call
# (add_feat left at its default True) records the new columns in corr_df.
X_train_final, new_feat_names = add_time_features(train_times, X_train_sites)
X_test_final, _ = add_time_features(test_times, X_test_sites, add_feat = False)

In [None]:
# NOTE: in-place extension; re-running this cell appends the same names again.
features_name += new_feat_names

In [None]:
# Weekday number of each train session start, as returned by Timestamp.weekday().
dow = train_times['time1'].apply(lambda x: x.weekday())

In [None]:
plt.subplots(1, figsize = (16, 8)) 

# Count of sessions per weekday number, restricted to sessions with label 1.
sns.countplot(pd.DataFrame(dow[y_train == 1]), x='time1')
plt.title("Alice")
# The x axis shows weekday numbers (from `dow`), not start hours.
plt.xlabel('Day of week')

In [None]:
def add_day_month(times, X_sparse, add_feat = True):
    """Stack weekday indicator columns onto a feature matrix.

    Despite its name, this function only uses the day of the week of
    ``times['time1']``. Weekdays 2 and 3 are merged into one indicator and weekdays
    5 and 6 into another; weekdays 0, 1 and 4 keep their own indicator.

    Parameters
    ----------
    times : DataFrame with a ``time1`` column of timestamps.
    X_sparse : matrix that the new columns are appended to via ``hstack``.
    add_feat : when true, the indicator DataFrame (index reset) is appended to the
        module-level ``corr_df`` list.

    Returns
    -------
    (X, feature_names) : the stacked matrix and the indicator names in column order:
        ``weekday_0, weekday_1, weekday_4, weekday_5_6, weekday_2_3``.
    """
    day_of_week = times['time1'].apply(lambda t: t.weekday())

    # get_dummies only creates columns for weekdays that occur in this data, so force
    # all seven: otherwise a split without e.g. weekday 6 raises KeyError below, and
    # train/test could end up with a different number of columns.
    day_of_week_df = (
        pd.get_dummies(day_of_week)
        .reindex(columns=range(7), fill_value=0)
        .astype('int')
    )
    day_of_week_df['5_6'] = day_of_week_df[5] + day_of_week_df[6]
    day_of_week_df['2_3'] = day_of_week_df[2] + day_of_week_df[3]
    day_of_week_df = day_of_week_df.drop(columns=[2, 3, 5, 6])

    day_of_week_df = day_of_week_df.rename(columns=lambda col: 'weekday_' + str(col))

    # Columns already carry the 'weekday_' prefix; adding it again would produce
    # names such as 'weekday_weekday_0'.
    feature_names = list(day_of_week_df.columns)

    if add_feat:
        corr_df.append(day_of_week_df.reset_index(drop=True))

    X = hstack([X_sparse, day_of_week_df.values])
    return X, feature_names

In [None]:
# Extend the matrices built so far with the weekday indicator columns.
# NOTE: X_train_final / X_test_final are rebound from their own previous values, so
# re-running this cell on its own stacks the weekday columns a second time.
X_train_final, more_feat_names = add_day_month(train_times, X_train_final)
X_test_final, _ = add_day_month(test_times, X_test_final, add_feat = False)

In [None]:
# NOTE: in-place extension; re-running this cell appends the same names again.
features_name += more_feat_names

In [None]:
# Day of the month of each train session start.
dom = train_times['time1'].apply(lambda ts: ts.day)

In [None]:
plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# Left panel: day-of-month counts for sessions with label 1.
plt.subplot(1, 2, 1)
sns.countplot(data=pd.DataFrame(dom[y_train == 1]), x='time1')
plt.title("Alice")
plt.xlabel('Day of month')

# Right panel: the same counts for sessions with label 0.
plt.subplot(1, 2, 2)
sns.countplot(data=pd.DataFrame(dom[y_train == 0]), x='time1')
plt.title('Intruder')
plt.xlabel('Day of month');

In [None]:
def add_dom(times, X_sparse, add_feat = True):
    """Stack three day-of-month indicator columns onto a feature matrix.

    Each indicator is true when the day of month of ``times['time1']`` belongs to a
    hand-picked set of days. The sets overlap: day 21 is in ``dom_1`` and ``dom_3``,
    day 24 is in ``dom_2`` and ``dom_3``.

    Parameters
    ----------
    times : DataFrame with a ``time1`` column of timestamps.
    X_sparse : matrix that the new columns are appended to via ``hstack``.
    add_feat : when true, one single-column DataFrame per indicator is appended to
        the module-level ``corr_df`` list.

    Returns
    -------
    (X, feature_names) : the stacked matrix and ``['dom_1', 'dom_2', 'dom_3']``.
    """
    day_of_month = times['time1'].apply(lambda stamp: stamp.day)

    # Insertion order of this dict is the column order in the output.
    day_groups = {
        'dom_1': [3, 5, 6, 7, 8, 10, 11, 12, 21, 23, 27, 28, 30],
        'dom_2': [9, 24],
        'dom_3': [17, 18, 19, 20, 21, 22, 24, 25, 26, 31],
    }
    indicators = {
        group_name: day_of_month.isin(days).values.reshape(-1, 1)
        for group_name, days in day_groups.items()
    }
    feature_names = list(day_groups)

    if add_feat:
        for group_name in feature_names:
            corr_df.append(pd.DataFrame(indicators[group_name], columns=[group_name]))

    X = hstack([X_sparse] + [indicators[group_name] for group_name in feature_names])
    return X, feature_names

In [None]:
# Extend the matrices built so far with the day-of-month indicator columns.
# NOTE: the matrices are rebound from their own previous values and features_name is
# extended in place, so re-running this cell on its own adds the columns and names again.
X_train_final, dom_features = add_dom(train_times, X_train_final)
X_test_final, _ = add_dom(test_times, X_test_final, add_feat = False)
features_name += dom_features

In [None]:
# Logistic regression with C=20, the liblinear solver and the notebook-wide seed.
final_model = LogisticRegression(C=20, random_state=SEED, solver='liblinear')

In [None]:
# NOTE: the fourth result is stored under the name `confusion_matrix`, which rebinds
# the function imported from sklearn.metrics at the top of the notebook.
proba, ideal, predicted, confusion_matrix, cv_scores = train_and_predict(model=final_model, X_train=X_train_final, y_train=y_train, 
                               X_test=X_test_final, 
                               # get_feature_names() was removed in scikit-learn 1.2; the *_out
                               # variant returns an array, so convert it to a list because the
                               # names are concatenated with the new feature names.
                               site_feature_names=list(vectorizer.get_feature_names_out()),
                               new_feature_names=features_name,
                               cv=time_split, submission_file_name='submission.csv')

The model gives a result of 0.96878