# Tools

In [204]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# I- Data Preprocessing

## Step 1 : Loading Dataset

In [187]:
dataset = pd.DataFrame(pd.read_csv('kdd_train.csv'))
dataset

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target
0,5,tcp,smtp,SF,2429,475,0,0,0,0,...,57,0.11,0.02,0.01,0.04,0.00,0.0,0.87,0.0,normal
1,0,udp,domain_u,SF,45,134,0,0,0,0,...,155,0.80,0.02,0.01,0.00,0.00,0.0,0.00,0.0,normal
2,0,udp,domain_u,SF,45,80,0,0,0,0,...,255,1.00,0.00,0.01,0.00,0.00,0.0,0.00,0.0,normal
3,1979,udp,other,SF,145,105,0,0,0,0,...,2,0.01,0.84,1.00,0.00,0.00,0.0,0.00,0.0,normal
4,14462,tcp,other,RSTR,1,0,0,0,0,0,...,2,0.01,0.68,1.00,0.00,0.00,0.0,1.00,1.0,portsweep
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.00,1.00,0.00,0.00,0.09,0.0,0.91,1.0,satan
22540,0,tcp,http,SF,254,2277,0,0,0,0,...,255,1.00,0.00,0.04,0.04,0.00,0.0,0.00,0.0,normal
22541,0,tcp,private,S0,0,0,0,0,0,0,...,25,0.10,0.07,0.00,0.00,1.00,1.0,0.00,0.0,neptune
22542,0,tcp,private,RSTR,0,0,0,0,0,0,...,1,0.00,0.58,0.57,0.00,0.00,0.0,0.58,1.0,portsweep


## Step 2 : Creating Dependant & Independant variable vecrtors

In [188]:
# creating the Independant variable
x = dataset.iloc[:,:-1]

#creating the Dependant variable
y = dataset.iloc[:,-1]


## Step 3 : Handling missing values

In [189]:
# let's check if there are missing values in our KDD dataset 
for column in dataset.isnull() :
    if True in dataset.isnull()[column]:
        print('there is a missing value')
        break
print('Luckily! No missing values were found, we can proceed confidently without the need to handle missing values ^-^')

Luckily! No missing values were found, we can proceed confidently without the need to handle missing values ^-^


#### Since there are no missing values in the KDD dataset, we don't have to address any missing values problem

## Step 4 : Converting categorical data to numeric 

#### Features Transfromation 

In [190]:
# selecting the categorical columns
categorical_data = x.select_dtypes(include = ['object']).columns
print(categorical_data)

Index(['protocol_type', 'service', 'flag'], dtype='object')


In [191]:
# applying the OneHotEncoder
OHE = OneHotEncoder()
transformer = ColumnTransformer(transformers = [('OneHot',OHE, categorical_data)], remainder='passthrough')
x=pd.DataFrame(transformer.fit_transform(x))

#### Target variable transformation 

In [192]:
#  Transform the target variable to retain only binary value
y = np.array(y.apply(lambda x:1 if x =='normal' else 0))

## Step 5 : Handling Outliers

In [193]:
for column in x.columns:
    # Calculate z-scores for the current variable
    z_scores = zscore(x[column])
    
    # Flag potential outliers using a threshold (e.g., z-score > 3 or < -3)
    outliers = (z_scores > 3) | (z_scores < -3)
    
    # Optionally, correct outliers automatically
    x.loc[outliers, column] = x[column].median()

## Step 6 : Scaling Data

In [194]:
scaler = StandardScaler()
x = np.array(scaler.fit_transform(x))

## Step 7 : Dimentionnality Reduction 

In [195]:
# Apply PCA
pca = PCA()
x = pca.fit_transform(x)
#
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
num_components = np.argmax(cumulative_variance_ratio >= 0.9) + 1
# Use selected principal components in regression
x = x[:, :num_components]

## Step 8 : Spliting data into Train and Test Set

In [196]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)
print(x_train)

[[-4.9381376  -2.39605519 -0.33599303 ... -1.21490766  0.1948184
  -0.14070107]
 [ 0.6942674  -0.15552494  4.17006864 ... -1.28555347  0.16781503
  -0.25236066]
 [-2.68855258 -1.90406791  0.35254544 ... -1.82515647 -0.93400436
  -0.21547686]
 ...
 [ 1.87034757 -0.59628603  4.13561908 ...  0.5505337   0.86764801
  -0.01553987]
 [ 0.4573271   0.61885241  6.36198353 ... -3.00823474  1.04506022
  -1.30369468]
 [ 2.77110793 -0.62026949 -1.42536792 ...  0.03360618 -0.39910744
  -0.10995598]]


# II- Logistic Regression Model

## Step 1 : The significance of the model & the features 

In [197]:
# Fit logistic regression model
LogRegression = sm.Logit(y_train, x_train)

# Obtain the results summary
result = LogRegression.fit(maxiter=1000)

# Print the summary
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.250766
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                18035
Model:                          Logit   Df Residuals:                    18019
Method:                           MLE   Df Model:                           15
Date:                Sun, 21 Apr 2024   Pseudo R-squ.:                  0.6382
Time:                        19:00:23   Log-Likelihood:                -4522.6
converged:                       True   LL-Null:                       -12501.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             1.1097      0.020     55.898      0.000       1.071       1.149
x2            -0.1245      0.

* The significance of the Model : since the p-value = 0 is less than 0.05 we can assume that  the overall logistic regression is significant.
* The significance of the variables : we can't assume that all the variables are significant predictors because some variables have p-value greater than 0.05 ( x5, x13, x16 ...)


In [198]:
print(result.pvalues)

[0.00000000e+000 3.84420762e-015 2.92210019e-070 1.50083722e-108
 2.49313711e-001 2.52982655e-057 1.51584664e-070 3.30273864e-031
 1.56254120e-041 5.43717456e-009 3.71356083e-019 1.82322729e-037
 9.81729849e-003 9.79384658e-027 1.38394267e-048 5.26735947e-001]


## Step 2 :  Making a prediction for a new unseen data

In [206]:

# 1.  logistic regression model
model = LogisticRegression()
model.fit(x_train, y_train)

# 2. Making predictions on the test data
predictions = model.predict(x_test)

# 3. Evaluating the model with F1-SCORE
f1 = f1_score(y_test, predictions)
print("F1 Score:", f1)
accuracy = accuracy_score(y_test,predictions )

print("Accuracy:", accuracy)


F1 Score: 0.8800705467372134
Accuracy: 0.8793524062985141


Our F1-SCORE and Accuracy PROVE the PERFORMANCE of our logistic regression model

## Step 3 :  dropping the non-significant variables & reevaluating the Model

####  Removing non-significant variables from the feature set

In [200]:
X_train = pd.DataFrame(x_train)
X_test = pd.DataFrame(x_test)
# Getting the indices of non-significant variables
non_significant_indices = [i for i, p_value in enumerate(result.pvalues) if p_value > 0.05]

# Getting the names of non-significant variables
non_significant_variables = X_train.columns[non_significant_indices]

# Dropping the non-significant variables from x_train
x_train_reduced = X_train.drop(columns=non_significant_variables)
x_test_reduced = X_test.drop(columns=non_significant_variables)


### Rerunning & Reevaluating the Model 

In [207]:
# 1.  logistic regression model
model = LogisticRegression()
model.fit(x_train_reduced, y_train)

# 2. Making predictions on the test data
predictions = model.predict(x_test_reduced)

# 3. Evaluating the model with F1-SCORE
f1 = f1_score(y_test, predictions)
print("The new F1 score :", f1)
accuracy = accuracy_score(y_test, predictions)

print("The new Accuracy:", accuracy)

The new F1 score : 0.8794169611307421
The new Accuracy: 0.8789088489687292


### Conclusion 

After removing non-significant variables from the logistic regression model, the performance metrics showed little change. This indicates that the remaining variables already captured the essential information for prediction.