In [28]:
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression

# CONFIGURE

In [22]:
pd.set_option('display.max_columns', 50)

In [14]:
DATA_PATH = '../data/raw_data/phase-2/prob-1'

In [15]:
DATA_FILE = 'raw_train.parquet'

In [29]:
class LogisticRegression(LogisticRegression):

    def score(self, X, y):
        return roc_auc_score(y, self.predict(X))

# PREPARE DATA

In [23]:
df_train = pd.read_parquet(f'{DATA_PATH}/{DATA_FILE}', engine='fastparquet')
df_train.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,feature22,feature23,feature24,feature25,feature26,feature27,feature28,feature29,feature30,feature31,feature32,feature33,feature34,feature35,feature36,feature37,feature38,feature39,feature40,feature41,label
0,0.041847,tcp,-,FIN,38.0,40.0,2438.0,19266.0,31.0,29.0,453843.8,3591177.0,7.0,13.0,1.153722,1.05841,68.764188,66.421092,255.0,3898436000.0,1827204000.0,255.0,0.000707,0.000566,0.000141,64.0,482.0,0.0,0.0,6.0,0.0,5.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,11.0,0.0,0
1,1.089133,tcp,http,FIN,14.0,18.0,1684.0,10168.0,31.0,29.0,11488.04,70544.18,3.0,5.0,83.751772,64.035706,9346.43482,8182.385202,255.0,3051186000.0,906785200.0,255.0,0.000665,0.000523,0.000142,120.0,565.0,1.0,3924.0,1.0,0.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,2.0,1.0,0.0,0
2,2e-06,udp,dns,INT,2.0,0.0,114.0,0.0,254.0,0.0,228000000.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,0.0,0.0,0.0,25.0,2.0,18.0,17.0,17.0,25.0,0.0,0.0,0.0,17.0,25.0,0.0,1
3,1.467246,tcp,ftp,FIN,12.0,12.0,2618.0,682.0,254.0,252.0,13085.74,3413.197,3.0,4.0,133.386003,124.152453,7744.976658,198.329344,255.0,2477915000.0,1653923000.0,255.0,0.173821,0.101319,0.072502,218.0,57.0,0.0,0.0,3.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0,2.0,3.0,0.0,1
4,0.000927,udp,dns,CON,2.0,2.0,130.0,162.0,31.0,29.0,560949.3,699029.1,0.0,0.0,0.002,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,81.0,0.0,0.0,1.0,0.0,3.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,4.0,0.0,0


In [18]:
print('Missing values in train data:', df_train.isnull().sum().sum())

Missing values in train data: 0


In [21]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61841 entries, 0 to 61840
Data columns (total 42 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   feature1   61841 non-null  float64
 1   feature2   61841 non-null  object 
 2   feature3   61841 non-null  object 
 3   feature4   61841 non-null  object 
 4   feature5   61841 non-null  float64
 5   feature6   61841 non-null  float64
 6   feature7   61841 non-null  float64
 7   feature8   61841 non-null  float64
 8   feature9   61841 non-null  float64
 9   feature10  61841 non-null  float64
 10  feature11  61841 non-null  float64
 11  feature12  61841 non-null  float64
 12  feature13  61841 non-null  float64
 13  feature14  61841 non-null  float64
 14  feature15  61841 non-null  float64
 15  feature16  61841 non-null  float64
 16  feature17  61841 non-null  float64
 17  feature18  61841 non-null  float64
 18  feature19  61841 non-null  float64
 19  feature20  61841 non-null  float64
 20  featur

# TRAIN TEST SPLIT

In [24]:
X = df_train.copy()
y = X.pop('label')

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.2, random_state=42)

# Baseline model

In [50]:
kf = KFold(n_splits=5)
oof_preds = pd.Series(index=X_train.index, dtype='float64')

for n_fold, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    X_train_kf, X_val_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    # Select only numerical features
    X_train_kf = X_train_kf.select_dtypes(include='number')
    X_val_kf = X_val_kf.select_dtypes(include='number')

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_kf, y_train_kf)

    oof_preds.iloc[test_index] = model.predict(X_val_kf)

print('Baseline score:', roc_auc_score(y_train, oof_preds))

Baseline score: 0.7455900300638921
