In [2]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

import pickle
import time
from tqdm import tqdm

# CONFIGURE

In [3]:
pd.set_option('display.max_columns', 50)

In [4]:
DATA_PATH = '../data/raw_data/phase-2/prob-2'

In [5]:
DATA_FILE = 'raw_train.parquet'

# PREPARE DATA

In [6]:
df_train = pd.read_parquet(f'{DATA_PATH}/{DATA_FILE}', engine='fastparquet')
df_train.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,feature22,feature23,feature24,feature25,feature26,feature27,feature28,feature29,feature30,feature31,feature32,feature33,feature34,feature35,feature36,feature37,feature38,feature39,feature40,feature41,label
0,0.041847,tcp,-,FIN,38.0,40.0,2438.0,19266.0,31.0,29.0,453843.8,3591177.0,7.0,13.0,1.153722,1.05841,68.764188,66.421092,255.0,3898436000.0,1827204000.0,255.0,0.000707,0.000566,0.000141,64.0,482.0,0.0,0.0,6.0,0.0,5.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,11.0,0.0,Normal
1,1.089133,tcp,http,FIN,14.0,18.0,1684.0,10168.0,31.0,29.0,11488.04,70544.18,3.0,5.0,83.751772,64.035706,9346.43482,8182.385202,255.0,3051186000.0,906785200.0,255.0,0.000665,0.000523,0.000142,120.0,565.0,1.0,3924.0,1.0,0.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,2.0,1.0,0.0,Normal
2,2e-06,udp,dns,INT,2.0,0.0,114.0,0.0,254.0,0.0,228000000.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,0.0,0.0,0.0,25.0,2.0,18.0,17.0,17.0,25.0,0.0,0.0,0.0,17.0,25.0,0.0,Other
3,1.467246,tcp,ftp,FIN,12.0,12.0,2618.0,682.0,254.0,252.0,13085.74,3413.197,3.0,4.0,133.386003,124.152453,7744.976658,198.329344,255.0,2477915000.0,1653923000.0,255.0,0.173821,0.101319,0.072502,218.0,57.0,0.0,0.0,3.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0,2.0,3.0,0.0,Denial of Service
4,0.000927,udp,dns,CON,2.0,2.0,130.0,162.0,31.0,29.0,560949.3,699029.1,0.0,0.0,0.002,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,81.0,0.0,0.0,1.0,0.0,3.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,4.0,0.0,Normal


In [7]:
print('Missing values in train data:', df_train.isnull().sum().sum())

Missing values in train data: 0


In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61841 entries, 0 to 61840
Data columns (total 42 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   feature1   61841 non-null  float64
 1   feature2   61841 non-null  object 
 2   feature3   61841 non-null  object 
 3   feature4   61841 non-null  object 
 4   feature5   61841 non-null  float64
 5   feature6   61841 non-null  float64
 6   feature7   61841 non-null  float64
 7   feature8   61841 non-null  float64
 8   feature9   61841 non-null  float64
 9   feature10  61841 non-null  float64
 10  feature11  61841 non-null  float64
 11  feature12  61841 non-null  float64
 12  feature13  61841 non-null  float64
 13  feature14  61841 non-null  float64
 14  feature15  61841 non-null  float64
 15  feature16  61841 non-null  float64
 16  feature17  61841 non-null  float64
 17  feature18  61841 non-null  float64
 18  feature19  61841 non-null  float64
 19  feature20  61841 non-null  float64
 20  featur

In [9]:
df_train['label'].value_counts()

Normal                   22390
Other                    13963
Exploits                 10864
Denial of Service         9585
Information Gathering     4081
Malware                    958
Name: label, dtype: int64

# TRAIN TEST SPLIT

In [10]:
X = df_train.copy()
y = X.pop('label')

In [11]:
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(y))

In [12]:
label_encoder.inverse_transform([4, 4])

array(['Normal', 'Normal'], dtype=object)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y,
                                                  test_size=0.2, random_state=42)

In [40]:
kf = KFold(n_splits=5)
oof_preds = pd.Series(index=X_train.index, dtype='float64')

for n_fold, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    X_train_kf, X_val_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    # Select only numerical features
    X_train_kf = X_train_kf.select_dtypes(include='number')
    X_val_kf = X_val_kf.select_dtypes(include='number')

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_kf, y_train_kf)

    oof_preds.iloc[test_index] = model.predict(X_val_kf)

print('Baseline score:', accuracy_score(y_train, oof_preds))

Baseline score: 0.5572445019404916


In [41]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train.select_dtypes(include='number'), y_train)

print('Test score:', accuracy_score(y_val, model.predict(X_val.select_dtypes(include='number'))))

Test score: 0.5547740318538281


***Test score***: 0.56<br>
***Response time***: 1.40

# Standard Scale

In [42]:
scaler = StandardScaler()

In [43]:
kf = KFold(n_splits=5)
oof_preds = pd.Series(index=X_train.index, dtype='float64')

for n_fold, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    X_train_kf, X_oof_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_oof_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    # Select only numerical features
    X_train_kf = X_train_kf.select_dtypes(include='number')
    X_oof_kf = X_oof_kf.select_dtypes(include='number')

    X_train_kf = scaler.fit_transform(X_train_kf)
    X_oof_kf = scaler.transform(X_oof_kf)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_kf, y_train_kf)

    oof_preds.iloc[test_index] = model.predict(X_oof_kf)

print('Baseline score:', accuracy_score(y_train, oof_preds))

Baseline score: 0.7405198900388098


In [44]:
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include='number'))
X_val_scaled = scaler.transform(X_val.select_dtypes(include='number'))

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

print('Validation score:', accuracy_score(y_val, model.predict(X_val_scaled)))

Validation score: 0.7403185382811869


# Ordinal Encode

In [53]:
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
scaler = StandardScaler()

In [14]:
categorical_columns = ["feature2", "feature3", "feature4"]
numerical_columns = ["feature1", "feature5", "feature6", "feature7", "feature8", "feature9",
                    "feature10", "feature11", "feature12", "feature13", "feature14", "feature15",
                    "feature16", "feature17", "feature18", "feature19", "feature20", "feature21",
                    "feature22", "feature23", "feature24", "feature25", "feature26", "feature27",
                    "feature28", "feature29", "feature30", "feature31", "feature32", "feature33",
                    "feature34", "feature35", "feature36", "feature37", "feature38", "feature39",
                    "feature40", "feature41"]

In [50]:
kf = KFold(n_splits=5)
oof_preds = pd.Series(index=X_train.index, dtype='float64')

for n_fold, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    X_train_kf, X_oof_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_oof_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    X_train_kf_num = scaler.fit_transform(X_train_kf[numerical_columns])
    X_oof_kf_num = scaler.transform(X_oof_kf[numerical_columns])

    X_train_kf_cat = encoder.fit_transform(X_train_kf[categorical_columns])
    X_oof_kf_cat = encoder.transform(X_oof_kf[categorical_columns])

    model = LogisticRegression(max_iter=10000)
    model.fit(np.concatenate((X_train_kf_num, X_train_kf_cat), axis=1), y_train_kf)

    oof_preds.iloc[test_index] = model.predict(np.concatenate((X_oof_kf_num, X_oof_kf_cat), axis=1))

print('Baseline score:', accuracy_score(y_train, oof_preds))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_kf.loc[:, categorical_columns] = encoder.fit_transform(X_train_kf[categorical_columns])
  X_train_kf.loc[:, categorical_columns] = encoder.fit_transform(X_train_kf[categorical_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_oof_kf.loc[:, categorical_columns] = encoder.transform(X_oof_kf[categorical_columns])
  X_oof_kf.loc[:, categorical_columns] = encoder.transform(X_oof_kf[categorical_columns])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the document

Baseline score: 0.7565895860284605


In [59]:
X_train_num = scaler.fit_transform(X_train[numerical_columns])
X_val_num = scaler.transform(X_val[numerical_columns])

X_train_cat = encoder.fit_transform(X_train[categorical_columns])
X_val_cat = encoder.transform(X_val[categorical_columns])

model = LogisticRegression(max_iter=1000)
model.fit(np.concatenate((X_train_num, X_train_cat), axis=1), y_train)

print('Validation score:', accuracy_score(y_val, model.predict(np.concatenate((X_val_num, X_val_cat), axis=1))))

Validation score: 0.7551135904276821


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Onehot

In [16]:
categorical_columns = ["feature2", "feature3", "feature4"]
numerical_columns = ["feature1", "feature5", "feature6", "feature7", "feature8", "feature9",
                    "feature10", "feature11", "feature12", "feature13", "feature14", "feature15",
                    "feature16", "feature17", "feature18", "feature19", "feature20", "feature21",
                    "feature22", "feature23", "feature24", "feature25", "feature26", "feature27",
                    "feature28", "feature29", "feature30", "feature31", "feature32", "feature33",
                    "feature34", "feature35", "feature36", "feature37", "feature38", "feature39",
                    "feature40", "feature41"]

In [16]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
scaler = StandardScaler()

In [18]:
kf = KFold(n_splits=5)
oof_preds = pd.Series(index=X_train.index, dtype='float64')

for n_fold, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    X_train_kf, X_oof_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_oof_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    X_train_cat_kf = encoder.fit_transform(X_train_kf[categorical_columns])
    X_oof_cat_kf = encoder.transform(X_oof_kf[categorical_columns])

    X_train_kf = scaler.fit_transform(np.concatenate([X_train_kf[numerical_columns], X_train_cat_kf], axis=1))
    X_oof_kf = scaler.transform(np.concatenate([X_oof_kf[numerical_columns], X_oof_cat_kf], axis=1))

    model = LogisticRegression(max_iter=10000)
    model.fit(X_train_kf, y_train_kf)

    oof_preds.iloc[test_index] = model.predict(X_oof_kf)

print('Baseline score:', accuracy_score(y_train, oof_preds))

Baseline score: 0.7709007115135834


In [21]:
X_train_cat = encoder.fit_transform(X_train[categorical_columns])
X_val_cat = encoder.transform(X_val[categorical_columns])

X_train_solution_3 = scaler.fit_transform(np.concatenate([X_train[numerical_columns], X_train_cat], axis=1))
X_val_solution_3 = scaler.transform(np.concatenate([X_val[numerical_columns], X_val_cat], axis=1))

model = LogisticRegression(max_iter=1000)
model.fit(X_train_solution_3, y_train)
t1 = time.time()
y_pred = model.predict(X_val_solution_3)
print('Predict time:' ,time.time() - t1)

print('Validation score:', accuracy_score(y_val, y_pred))

Predict time: 0.010011434555053711
Validation score: 0.7711213517665131


# Bla

In [27]:
models = {
    'linear model': LogisticRegression(max_iter=10000),
    'random forest': RandomForestClassifier(random_state=42),
    'xgboost': XGBClassifier(random_state=42, tree_method='gpu_hist'),
    'svm': SVC(random_state=42)
}

In [28]:
for model_name in models:
    print(f'Fitting {model_name}...')
    kf = KFold(n_splits=5)
    oof_preds = pd.Series(index=X_train.index, dtype='float64')
    for train_index, test_index in tqdm(kf.split(X_train, y_train), total=kf.get_n_splits()):
        X_train_kf, X_oof_kf = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_kf, y_oof_kf = y_train.iloc[train_index], y_train.iloc[test_index]

        X_train_cat_kf = encoder.fit_transform(X_train_kf[categorical_columns])
        X_oof_cat_kf = encoder.transform(X_oof_kf[categorical_columns])

        X_train_kf = scaler.fit_transform(np.concatenate([X_train_kf[numerical_columns], X_train_cat_kf], axis=1))
        X_oof_kf = scaler.transform(np.concatenate([X_oof_kf[numerical_columns], X_oof_cat_kf], axis=1))

        model = models[model_name]
        model.fit(X_train_kf, y_train_kf)

        oof_preds.iloc[test_index] = model.predict(X_oof_kf)
    print(f'Score for {model_name}:', accuracy_score(y_train, oof_preds))

Fitting linear model...


100%|██████████| 5/5 [01:49<00:00, 21.96s/it]


Score for linear model: 0.7709007115135834
Fitting random forest...


100%|██████████| 5/5 [00:48<00:00,  9.69s/it]


Score for random forest: 0.813106403622251
Fitting xgboost...


100%|██████████| 5/5 [00:29<00:00,  5.90s/it]


Score for xgboost: 0.8243450840879689
Fitting svm...


100%|██████████| 5/5 [17:16<00:00, 207.25s/it]

Score for svm: 0.7563268111254852





In [17]:
X_train_cat = encoder.fit_transform(X_train[categorical_columns])
X_val_cat = encoder.transform(X_val[categorical_columns])

X_train_num = scaler.fit_transform(X_train[numerical_columns])
X_val_num = scaler.transform(X_val[numerical_columns])

X_train_solution_3 = np.concatenate([X_train_num, X_train_cat], axis=1)
X_val_solution_3 = np.concatenate([X_val_num, X_val_cat], axis=1)

model = XGBClassifier(random_state=42)
model.fit(X_train_solution_3, y_train)

t1 = time.time()
y_pred = model.predict(X_val_solution_3)
print('Predict time:' ,time.time() - t1)

print('Validation score:', accuracy_score(y_val, y_pred))

Predict time: 0.029999732971191406
Validation score: 0.8270676691729323


In [18]:
pickle.dump(model, open('model2.sav', 'wb'))
pickle.dump(encoder, open('encoder2.sav', 'wb'))
pickle.dump(scaler, open('scaler2.sav', 'wb'))
pickle.dump(label_encoder, open('label_encoder2.sav', 'wb'))

In [20]:
label_encoder.classes_

array(['Denial of Service', 'Exploits', 'Information Gathering',
       'Malware', 'Normal', 'Other'], dtype=object)

In [25]:
import random

a= [random.randint(0, 5) for _ in range(200)]

In [27]:
lookup = {i: j for i, j in enumerate(label_encoder.classes_)}

In [29]:
lookup

{0: 'Denial of Service',
 1: 'Exploits',
 2: 'Information Gathering',
 3: 'Malware',
 4: 'Normal',
 5: 'Other'}

In [28]:
%%time
for i in a:
    lookup[i]

CPU times: total: 0 ns
Wall time: 0 ns


In [26]:
%%time
label_encoder.inverse_transform(a)

CPU times: total: 0 ns
Wall time: 997 µs


array(['Normal', 'Exploits', 'Exploits', 'Exploits', 'Denial of Service',
       'Other', 'Normal', 'Denial of Service', 'Denial of Service',
       'Malware', 'Malware', 'Malware', 'Normal', 'Other',
       'Denial of Service', 'Denial of Service', 'Malware', 'Normal',
       'Information Gathering', 'Denial of Service', 'Other', 'Normal',
       'Malware', 'Other', 'Other', 'Other', 'Malware', 'Normal',
       'Information Gathering', 'Exploits', 'Other', 'Exploits',
       'Denial of Service', 'Malware', 'Information Gathering', 'Malware',
       'Malware', 'Exploits', 'Information Gathering', 'Malware',
       'Denial of Service', 'Exploits', 'Other', 'Denial of Service',
       'Exploits', 'Exploits', 'Denial of Service',
       'Information Gathering', 'Denial of Service', 'Malware', 'Normal',
       'Information Gathering', 'Denial of Service', 'Exploits',
       'Malware', 'Malware', 'Malware', 'Denial of Service', 'Exploits',
       'Normal', 'Denial of Service', 'Information 