In [2]:
#main
import pandas as pd
import numpy as np

#plot
from matplotlib import pyplot as plt
import seaborn as sns

#dop
import re
import pickle
from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display
from datetime import datetime, timedelta, date, time

#prepprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#ML
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

#metrics 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# option
pd.set_option("display.max_columns", 300)
plt.rcParams['figure.figsize'] = (15, 10)

PATH_TRAIN = 'data/train.csv'
PATH_TASK = 'data/test.csv'
TARGET_COL = 'target'

In [3]:
time_col = ['time' + str(i) for i in range(1, 11)]
sites = ['site' + str(i) for i in range(1,11)]

train = pd.read_csv(PATH_TRAIN, parse_dates=time_col, index_col='session_id')
task = pd.read_csv(PATH_TASK, parse_dates=time_col, index_col='session_id')
y_train = train[[TARGET_COL]]

print(train.shape, task.shape)
print('-'*50)
train.info()
train.head()

(253561, 21) (82797, 20)
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 1 to 253561
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   site1   253561 non-null  int64         
 1   time1   253561 non-null  datetime64[ns]
 2   site2   250098 non-null  float64       
 3   time2   250098 non-null  datetime64[ns]
 4   site3   246919 non-null  float64       
 5   time3   246919 non-null  datetime64[ns]
 6   site4   244321 non-null  float64       
 7   time4   244321 non-null  datetime64[ns]
 8   site5   241829 non-null  float64       
 9   time5   241829 non-null  datetime64[ns]
 10  site6   239495 non-null  float64       
 11  time6   239495 non-null  datetime64[ns]
 12  site7   237297 non-null  float64       
 13  time7   237297 non-null  datetime64[ns]
 14  site8   235224 non-null  float64       
 15  time8   235224 non-null  datetime64[ns]
 16

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,3846.0,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,39.0,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,782.0,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,178.0,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [4]:
def start_in_seconds(x):
    if type(x) != pd._libs.tslibs.nattype.NaTType:
        return pd.to_timedelta(str(x.time())) // np.timedelta64(1, 's')
    else:
        return -1


def feature_engineering_time(df,
                             time_col,
                             target=True,
                             time_max=66054,
                             time_min=27444
                            ):
    df_time = df[time_col].copy()
    
    print('Обработка: mounth')
    df_time['month'] = df_time[time_col[0]].progress_apply(lambda x: x.date().month)
    
    print('Обработка: weekday')
    df_time['weekday'] = df_time[time_col[0]].progress_apply(lambda x: x.weekday()) 
    
    print('Обработка: time1')
    df_time[time_col[0] + '_sec'] = df_time[time_col[0]].progress_apply(start_in_seconds)
    
    print('Обработка: Разницы')
    col_diff = []
    for i in tqdm(range(len(time_col) - 1)):
        df_time[time_col[i + 1]+'-'+time_col[i]] = (df[time_col[i + 1]] - df[time_col[i]]) // np.timedelta64(1, 's')
        col_diff.append(time_col[i + 1]+'-'+time_col[i])
    df_time[col_diff] = df_time[col_diff].fillna(0)
    df_time['session_len'] = df_time[col_diff].sum(axis=1)
    df_time['mean_session_len'] = df_time[col_diff].mean(axis=1)
    
    print('Обработка: смещения')
    df_time.loc[:,'time1_sec'] = df_time['time1_sec'] + 86399 - time_max
    df_time.loc[:,'time1_sec'] = df_time['time1_sec'].progress_apply(lambda x: x - 86399 + time_min if x > 86400 else x)
    
    df_time['month'] = df_time['month'].astype('object')
    df_time['weekday'] = df_time['weekday'].astype('object')
    
    df_time = pd.get_dummies(df_time)
    if not target:
        df_time['month_11'] = 0
        df_time['month_12'] = 0
        
    if target:
        df_time['target'] = df['target']
    
    df_time.drop(columns=col_diff, axis=1, inplace=True)
    df_time.drop(columns=time_col, axis=1, inplace=True)
    
    print('-'*50)
    print(' '*22 + 'READY')
    print('-'*50)
    
    return df_time

In [9]:
train_change = feature_engineering_time(train, time_col)
task_change = feature_engineering_time(task, time_col, target=False)

Обработка: mounth


  0%|          | 0/253561 [00:00<?, ?it/s]

Обработка: weekday


  0%|          | 0/253561 [00:00<?, ?it/s]

Обработка: time1


  0%|          | 0/253561 [00:00<?, ?it/s]

Обработка: Разницы


  0%|          | 0/9 [00:00<?, ?it/s]

Обработка: смещения


  0%|          | 0/253561 [00:00<?, ?it/s]

--------------------------------------------------
                      READY
--------------------------------------------------
Обработка: mounth


  uniques = Index(uniques)


  0%|          | 0/82797 [00:00<?, ?it/s]

Обработка: weekday


  0%|          | 0/82797 [00:00<?, ?it/s]

Обработка: time1


  0%|          | 0/82797 [00:00<?, ?it/s]

Обработка: Разницы


  0%|          | 0/9 [00:00<?, ?it/s]

Обработка: смещения


  0%|          | 0/82797 [00:00<?, ?it/s]

--------------------------------------------------
                      READY
--------------------------------------------------


  uniques = Index(uniques)


In [7]:
train_change.head()

Unnamed: 0_level_0,time1_sec,session_len,mean_session_len,month_1,month_2,month_3,month_4,month_5,month_11,month_12,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,56510,0.0,0.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,61135,26.0,2.888889,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,80362,7.0,0.777778,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
4,59477,270.0,30.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
5,59530,246.0,27.333333,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0


In [8]:
task_change.head()

Unnamed: 0_level_0,time1_sec,session_len,mean_session_len,month_1,month_2,month_3,month_4,month_5,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,month_11,month_12
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,61138,7.0,0.777778,0,0,0,1,0,0,0,0,1,0,0,0,0,0
2,59973,85.0,9.444444,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,77657,84.0,9.333333,0,0,0,0,1,1,0,0,0,0,0,0,0,0
4,56544,4.0,0.444444,0,0,0,1,0,0,0,0,0,1,0,0,0,0
5,74676,13.0,1.444444,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [10]:
def get_auc_lr_valid_my(x, y, C=1.0, seed=42, ratio = 0.9, scaler=True):
    
    if scaler:
        ss = StandardScaler()
        x = ss.fit_transform(x)
    # Split the data into the training and validation sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=(1 - ratio),  random_state=seed)
    
    # Classifier training
    lr = LogisticRegression(C=C, random_state=seed, solver='lbfgs', max_iter=500).fit(x_train, y_train)
    
    # Prediction for validation set
    y_pred = lr.predict(x_test)
    y_pred_proba = lr.predict_proba(x_test)
    
    # Calculate the quality
    print('\n'+'*'*55)
    print(f'Roc_auc proba0: {roc_auc_score(y_test, y_pred_proba[:,0])}')
    print(f'Roc_auc proba1: {roc_auc_score(y_test, y_pred_proba[:,1])}')
    print('*'*55)
    print()
    print(classification_report(y_test, y_pred))
    
    return y_pred, y_pred_proba
    

In [11]:
tmp, tmp_proba = get_auc_lr_valid_my(train_change.drop('target', axis=1), train_change['target'])
pd.Series(tmp).value_counts()


*******************************************************
Roc_auc proba0: 0.102054516167777
Roc_auc proba1: 0.897945483832223
*******************************************************

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     25115
           1       0.00      0.00      0.00       242

    accuracy                           0.99     25357
   macro avg       0.50      0.50      0.50     25357
weighted avg       0.98      0.99      0.99     25357



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0    25357
dtype: int64

---

sites

In [12]:
train[sites].fillna(0).astype('int').to_csv('src/train_sessions_text.txt', 
                                sep=' ', index=None, header=None)
task[sites].fillna(0).astype('int').to_csv('src/test_sessions_text.txt', 
                                sep=' ', index=None, header=None)

In [13]:
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('src/train_sessions_text.txt') as inp_train_file:
    train_vector = cv.fit_transform(inp_train_file)
with open('src/test_sessions_text.txt') as inp_test_file:
    task_vector = cv.transform(inp_test_file)

In [15]:
tmp_cat = CatBoostClassifier(n_estimators=500, silent=True).fit(train_vector, y_train['target'])
proba_vector = tmp_cat.predict_proba(task_vector)[:, 1]

In [12]:
# lr = LogisticRegression(solver='lbfgs', max_iter=500).fit(train_vector, y_train['target'])
# proba_vector = lr.predict_proba(train_vector)

In [17]:
train_change['proba_vec'] = tmp_cat.predict_proba(train_vector)[:,1]
task_change['proba_vec'] = tmp_cat.predict_proba(task_vector)[:,1]

In [18]:
def get_auc_lr_valid_my(x, y, ratio = 0.9, scaler=True):
    
    if scaler:
        ss = StandardScaler()
        x = ss.fit_transform(x)
    
    # Split the data into the training and validation sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=(1 - ratio),  random_state=17)
    
    # Classifier training
    lr = LogisticRegression(solver='lbfgs', max_iter=500).fit(x_train, y_train)
    
    # Prediction for validation set
    y_pred = lr.predict(x_test)
    y_pred_proba = lr.predict_proba(x_test)
    
    # Calculate the quality
    print('\n'+'*'*55)
    print(f'Roc_auc proba0: {roc_auc_score(y_test, y_pred_proba[:,0])}')
    print(f'Roc_auc proba1: {roc_auc_score(y_test, y_pred_proba[:,1])}')
    print('*'*55)
    print()
    print(classification_report(y_test, y_pred))
    
    return y_pred, y_pred_proba
    

In [19]:
tmp, tmp_proba = get_auc_lr_valid_my(train_change.drop('target', axis=1), train_change['target'])


*******************************************************
Roc_auc proba0: 0.02285204041796858
Roc_auc proba1: 0.9771479595820314
*******************************************************

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25141
           1       0.93      0.70      0.80       216

    accuracy                           1.00     25357
   macro avg       0.96      0.85      0.90     25357
weighted avg       1.00      1.00      1.00     25357



In [20]:
train_change.head()

Unnamed: 0_level_0,time1_sec,session_len,mean_session_len,month_1,month_2,month_3,month_4,month_5,month_11,month_12,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,target,proba_vec
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,56510,0.0,0.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0.002425
2,61135,26.0,2.888889,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0.012271
3,80362,7.0,0.777778,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0.002406
4,59477,270.0,30.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0.002425
5,59530,246.0,27.333333,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0.003126


In [16]:
# def pred_task(df_train, y, df_task):
#     sc = StandardScaler()
#     train = sc.fit_transform(df_train)
#     tasc = sc.transform(df_task)
#     lr = LogisticRegression(solver='lbfgs', max_iter=500).fit(df_train, y)
#     y_pred = lr.predict_proba(df_task)[:, 1]
    
#     return y_pred

In [17]:
# y_pred = pred_task(train_change.drop('target', axis=1), y_train['target'], task_change)
# predicted_df = pd.DataFrame({'target':y_pred}, index=task_change.index)
# predicted_df

Unnamed: 0_level_0,target
session_id,Unnamed: 1_level_1
1,0.015815
2,0.011482
3,0.003559
4,0.021717
5,0.006252
...,...
82793,0.003068
82794,0.006650
82795,0.015608
82796,0.020582


In [18]:
# predicted_df.to_csv('answers/baseline_4.csv')

In [21]:
def pred_task_cat(df_train, y, df_task):
    sc = StandardScaler()
    train = sc.fit_transform(df_train)
    tasc = sc.transform(df_task)
    Cat = CatBoostClassifier(l2_leaf_reg=5, max_depth=5, n_estimators=800, silent=True).fit(df_train, y)
    y_pred = Cat.predict_proba(df_task)[:, 1]
    
    return y_pred

In [22]:
y_pred = pred_task_cat(train_change.drop('target', axis=1), y_train['target'], task_change)
predicted_df = pd.DataFrame({'target':y_pred}, index=task_change.index)
predicted_df.to_csv('answers/baseline_8.csv')
predicted_df

Feature names must be in the same order as they were in fit.



Unnamed: 0_level_0,target
session_id,Unnamed: 1_level_1
1,1.534393e-05
2,2.325541e-06
3,1.700217e-04
4,8.507396e-07
5,1.052971e-05
...,...
82793,1.051024e-01
82794,1.820356e-05
82795,9.173678e-05
82796,8.483413e-06


In [23]:
def get_auc_lr_valid_cat(x, y, ratio = 0.9, scaler=True):
    
    if scaler:
        ss = StandardScaler()
        x = ss.fit_transform(x)
    
    # Split the data into the training and validation sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=(1 - ratio),  random_state=17)
    
    # Classifier training
    Cat = CatBoostClassifier(l2_leaf_reg=5, max_depth=5, n_estimators=800, silent=True).fit(x_train, y_train)
    
    # Prediction for validation set
    y_pred = Cat.predict(x_test)
    y_pred_proba = Cat.predict_proba(x_test)
    
    # Calculate the quality
    print('\n'+'*'*55)
    print(f'Roc_auc proba0: {roc_auc_score(y_test, y_pred_proba[:,0])}')
    print(f'Roc_auc proba1: {roc_auc_score(y_test, y_pred_proba[:,1])}')
    print('*'*55)
    print()
    print(classification_report(y_test, y_pred))
    

In [24]:
get_auc_lr_valid_cat(train_change.drop('target', axis=1), y_train['target'])


*******************************************************
Roc_auc proba0: 0.0051126461571551225
Roc_auc proba1: 0.9948873538428449
*******************************************************

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25141
           1       0.94      0.84      0.89       216

    accuracy                           1.00     25357
   macro avg       0.97      0.92      0.94     25357
weighted avg       1.00      1.00      1.00     25357



In [None]:
# *******************************************************
# Roc_auc proba0: 0.0012768725131001926
# Roc_auc proba1: 0.9987231274868998
# *******************************************************

#               precision    recall  f1-score   support

#            0       1.00      1.00      1.00     25141
#            1       0.90      0.94      0.92       216

#     accuracy                           1.00     25357
#    macro avg       0.95      0.97      0.96     25357
# weighted avg       1.00      1.00      1.00     25357

In [37]:
# params = {'learning_rate': [0.00001, 0.0001, 0.001, 0.01, 0.1],
#           'depth': [6, 7, 8, 10, 12, 14, 16, 18, 20, 22, 24],
#           'l2_leaf_reg': [1, 3, 5, 7, 9],
#           'rsm' : [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
#           'silent' : [True]
#           }

params = {'n_estimators' : [300, 500, 800],
          'max_depth': [2, 3, 5],
          'l2_leaf_reg': [3, 5, 8],
          'silent' : [True]
         }

Cat = CatBoostClassifier()
grid = GridSearchCV(Cat, params, cv=5)
grid.fit(train_change.drop('target', axis=1), y_train['target'])
grid.best_params_

{'l2_leaf_reg': 5, 'max_depth': 5, 'n_estimators': 800, 'silent': True}

---