#### Read Data

In [18]:
import pandas as pd
import requests
import gzip
import shutil
import os

# Download the KDD Cup 99 archive and extract it to a CSV under ./data.
# Both steps are skipped when their output already exists, so re-running the
# notebook does not repeat the download or the extraction.
os.makedirs('data', exist_ok=True)
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz"
compressed_file = "data/kddcup.data.gz"
data_file = "data/kddcup.data.csv"

if not os.path.exists(compressed_file):
    # Write to a temporary name first so an interrupted download is never
    # mistaken for a complete archive on the next run.
    partial_download = compressed_file + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as .gz
        response.raise_for_status()
        with open(partial_download, 'wb') as f:
            # Stream in 1 MiB chunks; response.content would load the whole
            # archive into memory and defeat stream=True.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    os.replace(partial_download, compressed_file)

if not os.path.exists(data_file):
    # Same temp-file-then-rename approach for the extracted CSV
    partial_extract = data_file + ".part"
    with gzip.open(compressed_file, 'rb') as f_in, open(partial_extract, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_extract, data_file)

In [20]:
import pandas as pd

# The CSV is read with header=None, so the column names are supplied here.
column_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    # dst_host_* columns
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
]
data_file = 'data/kddcup.data.csv'

# Load the file and discard the 'num_outbound_cmds' column in one chain
df = (
    pd.read_csv(data_file, header=None, names=column_names)
    .drop(columns='num_outbound_cmds')
)
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


#### Cleaning/Preprocessing

(The cleaning steps below are based on insights from extensive visualization and data-wrangling work, which has been omitted from this notebook for brevity.)

In [21]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Group the columns by how they will be preprocessed
categorical_features = ['protocol_type', 'service', 'flag']
binary_features = ['land', 'logged_in', 'is_host_login', 'is_guest_login']

# Everything that is neither categorical, binary, nor the target is numeric;
# iterating df.columns keeps the original column order.
non_numeric_columns = {*categorical_features, *binary_features, 'label'}
numeric_features = [name for name in df.columns if name not in non_numeric_columns]

# Separate the target from the features
y = df['label']
X = df.drop(columns='label')

# 80/20 split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [22]:
# Define preprocessing pipelines, one per feature group.

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the mode, then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Binary columns: only impute; the values are passed through otherwise.
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Combine preprocessing steps. The output column order follows the order of
# the transformers below: numeric, one-hot categorical, binary.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', binary_transformer, binary_features)
    ],
    # Always return a dense array. With the default threshold (0.3) the result
    # switches to a scipy sparse matrix whenever the one-hot block makes the
    # stacked output sparse enough, which would break the later
    # pd.DataFrame(..., columns=...) construction.
    sparse_threshold=0
)

In [23]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Rebuild the column names in the same order the ColumnTransformer emits them:
# numeric columns, expanded one-hot columns, binary columns.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features)
all_feature_names = [*numeric_features, *cat_feature_names, *binary_features]

# Wrap the preprocessed arrays in DataFrames
df_train_cleaned = pd.DataFrame(X_train_preprocessed, columns=all_feature_names)
df_test_cleaned = pd.DataFrame(X_test_preprocessed, columns=all_feature_names)

# Re-attach the target; .values assigns positionally because the new frames
# have a fresh index rather than the original row index.
df_train_cleaned['label'] = y_train.values
df_test_cleaned['label'] = y_test.values

In [24]:
# Show summary statistics for the preprocessed training frame
df_train_cleaned.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,land,logged_in,is_host_login,is_guest_login
count,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,...,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0,3918744.0
mean,-9.793039e-18,4.487645e-20,-8.997955e-19,5.122262e-18,-4.1930019999999997e-19,-1.2940740000000001e-17,-1.38709e-19,7.819382e-20,1.095167e-18,2.137751e-18,...,0.1775684,0.0001046254,3.368426e-05,1.148327e-05,0.7643796,0.0002112922,5.35886e-06,0.1434807,5.103676e-07,0.0008359821
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.382149,0.01022812,0.005803717,0.003388679,0.424386,0.01453436,0.002314915,0.3505624,0.0007144001,0.02890127
min,-0.06681397,-0.001967301,-0.00374525,-0.01514353,-0.0009810075,-0.02651866,-0.004387833,-0.002015437,-0.008285479,-0.00459651,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.06681397,-0.001925493,-0.00374525,-0.01514353,-0.0009810075,-0.02651866,-0.004387833,-0.002015437,-0.008285479,-0.00459651,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,-0.06681397,-0.001473205,-0.00374525,-0.01514353,-0.0009810075,-0.02651866,-0.004387833,-0.002015437,-0.008285479,-0.00459651,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,-0.06681397,-0.0009867117,-0.00374525,-0.01514353,-0.0009810075,-0.02651866,-0.004387833,-0.002015437,-0.008285479,-0.00459651,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,59.34995,1311.217,1972.283,70.02059,1855.876,164.1068,661.3338,1772.033,120.6931,245.0641,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Experimenting with Models

In [1]:
import math

# For n = 1..9, print n! * (n^2 + 1) next to (n+1)! - n! + n * n!
for n in range(1, 10):
    n_factorial = math.factorial(n)
    left_side = n_factorial * (n**2 + 1)
    right_side = math.factorial(n + 1) - n_factorial + n * n_factorial
    print(left_side, ' === ', right_side)

2  ===  2
10  ===  8
60  ===  36
408  ===  192
3120  ===  1200
26640  ===  8640
252000  ===  70560
2620800  ===  645120
29756160  ===  6531840


In [4]:
# (1 + sqrt(5)) / 4: half the golden ratio, the closed form of cos(pi/5) (cos 36 deg)
(math.sqrt(5)+1)/4

0.8090169943749475

In [4]:
# sqrt(1/2 + sqrt(10 - 2*sqrt(5))/8): half-angle closed form of cos(3*pi/20) (cos 27 deg),
# using sin(pi/5) = sqrt(10 - 2*sqrt(5))/4
math.sqrt(0.5+math.sqrt(10-2*math.sqrt(5))/8)

0.8910065241883679

In [5]:
# sqrt(25 + 10*sqrt(5)) / 4: matches the closed-form area of a regular pentagon with unit side
# NOTE(review): the intended use of this value is not shown in the notebook
math.sqrt(25+10*math.sqrt(5))/4

1.720477400588967

In [9]:
# 1/(sqrt(2)/2) + cos(3*pi/20), i.e. sqrt(2) + cos(27 deg); only the first term is inverted
1/(math.sqrt(2)/2) + (math.cos(3*math.pi/20))

2.3052200865614627

In [12]:
# Reciprocal of the whole sum cos(3*pi/20) + sqrt(2)/2
1/(math.cos(3*math.pi/20)+math.sqrt(2)/2)

0.6257378601609235

In [None]:
# numpy is not imported anywhere earlier in the notebook, so bring it into
# scope here; otherwise this cell raises NameError on a fresh kernel.
import numpy as np

# Bare reference to numpy's matrix-product function (displays its repr)
np.matmul