# Data Preprocessing

## Import Libraries

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

In [4]:
# add the column labels
columns = (['duration'
,'protocol_type'
,'service'
,'flag'
,'src_bytes'
,'dst_bytes'
,'land'
,'wrong_fragment'
,'urgent'
,'hot'
,'num_failed_logins'
,'logged_in'
,'num_compromised'
,'root_shell'
,'su_attempted'
,'num_root'
,'num_file_creations'
,'num_shells'
,'num_access_files'
,'num_outbound_cmds'
,'is_host_login'
,'is_guest_login'
,'count'
,'srv_count'
,'serror_rate'
,'srv_serror_rate'
,'rerror_rate'
,'srv_rerror_rate'
,'same_srv_rate'
,'diff_srv_rate'
,'srv_diff_host_rate'
,'dst_host_count'
,'dst_host_srv_count'
,'dst_host_same_srv_rate'
,'dst_host_diff_srv_rate'
,'dst_host_same_src_port_rate'
,'dst_host_srv_diff_host_rate'
,'dst_host_serror_rate'
,'dst_host_srv_serror_rate'
,'dst_host_rerror_rate'
,'dst_host_srv_rerror_rate'
,'Attack'])

## Read the dataset

In [5]:
df = pd.read_csv('/content/drive/MyDrive/MachineLearning/MTH-IDS/kddcup99/kddcup.csv',header=None,names=columns)

### First 5 rows of the dataset

In [6]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Attack
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [7]:
df.shape

(4898431, 42)

In [8]:
df.drop(['num_outbound_cmds'], axis=1, inplace=True)

### Number of columns and its data types

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 41 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   duration                     int64  
 1   protocol_type                object 
 2   service                      object 
 3   flag                         object 
 4   src_bytes                    int64  
 5   dst_bytes                    int64  
 6   land                         int64  
 7   wrong_fragment               int64  
 8   urgent                       int64  
 9   hot                          int64  
 10  num_failed_logins            int64  
 11  logged_in                    int64  
 12  num_compromised              int64  
 13  root_shell                   int64  
 14  su_attempted                 int64  
 15  num_root                     int64  
 16  num_file_creations           int64  
 17  num_shells                   int64  
 18  num_access_files             int64  
 19  

### Value Count of each attack types

In [10]:
df.Attack.value_counts()

smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: Attack, dtype: int64

## Label Encoding to the dataset

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

  df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])


### Checking the values now

In [13]:
df.Attack.value_counts()

18    2807886
9     1072017
11     972781
17      15892
5       12481
15      10413
10       2316
0        2203
21       1020
20        979
14        264
3          53
1          30
6          21
22         20
4          12
16         10
7           9
2           8
8           7
13          4
12          3
19          2
Name: Attack, dtype: int64

## 1. Removal of columns with object data type except the Attack Class
### i.e. Flow id, Src IP and Dest IP

#### Make a copy of attack column

In [14]:
attack_column = df['Attack']

#### Select all columns with data type other than 'object'

In [15]:
numeric_cols = df.select_dtypes(exclude='object').columns.tolist()

#### Drop all 'object' columns

In [16]:
df = df[numeric_cols]

#### Add the attack column again

In [17]:
df['Attack'] = attack_column

## 2. Checking and removal of NULL or INFINITY Values

#### check if there is any null value in any column

In [18]:
df.isnull().any().any()

False

#### replacing any infinity value in any column with null values

In [19]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)

#### removing that values

In [20]:
df = df.dropna()

## 3. Sampling

### Retain the minority class from sampling

In [21]:
df_minor = df[(df['Attack']==10)|(df['Attack']==0)|(df['Attack']==21)
|(df['Attack']==20)|(df['Attack']==14)|(df['Attack']==3)
|(df['Attack']==1)|(df['Attack']==6)|(df['Attack']==22)
|(df['Attack']==4)|(df['Attack']==16)|(df['Attack']==7)|(df['Attack']==2)|(df['Attack']==8)|(df['Attack']==13)|(df['Attack']==12)
|(df['Attack']==19)]
df_major = df.drop(df_minor.index)

### Mini Batch K-means for clustering

In [22]:
X = df_major.drop(['Attack'],axis=1)
y = df_major.iloc[:, -1].values.reshape(-1,1)
y=np.ravel(y)

In [23]:
X.shape

(4891470, 37)

In [24]:
from sklearn.cluster import MiniBatchKMeans
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress ConvergenceWarning
warnings.filterwarnings("ignore", category=FutureWarning)

n_clusters = 470  # Define the total number of clusters
batch_size = 1000  # Define the batch size

# Initialize an empty array to store all cluster labels
all_labels = []

# Split your data (X) into batches
num_samples = X.shape[0]
num_batches = int(np.ceil(num_samples / batch_size))

for batch_num in range(num_batches):
    start_idx = batch_num * batch_size
    end_idx = min((batch_num + 1) * batch_size, num_samples)

    # Extract the current batch
    batch = X[start_idx:end_idx]

    # Apply MiniBatchKMeans clustering to the current batch
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0).fit(batch)

    # Append the cluster labels to the list
    all_labels.append(kmeans.labels_)

    # Print progress
    # if batch_num % 10 == 0:  # Adjust the interval as needed
    print(f"Processed batch {batch_num}/{num_batches}")

# After processing all batches, concatenate the labels and assign them to the DataFrame
cluster_labels = np.concatenate(all_labels)

# Assign the cluster labels to the DataFrame
df_major['klabel'] = cluster_labels


Processed batch 0/4892
Processed batch 1/4892
Processed batch 2/4892
Processed batch 3/4892
Processed batch 4/4892
Processed batch 5/4892
Processed batch 6/4892
Processed batch 7/4892
Processed batch 8/4892
Processed batch 9/4892
Processed batch 10/4892
Processed batch 11/4892
Processed batch 12/4892
Processed batch 13/4892
Processed batch 14/4892
Processed batch 15/4892
Processed batch 16/4892
Processed batch 17/4892
Processed batch 18/4892
Processed batch 19/4892
Processed batch 20/4892
Processed batch 21/4892
Processed batch 22/4892
Processed batch 23/4892
Processed batch 24/4892
Processed batch 25/4892
Processed batch 26/4892
Processed batch 27/4892
Processed batch 28/4892
Processed batch 29/4892
Processed batch 30/4892
Processed batch 31/4892
Processed batch 32/4892
Processed batch 33/4892
Processed batch 34/4892
Processed batch 35/4892
Processed batch 36/4892
Processed batch 37/4892
Processed batch 38/4892
Processed batch 39/4892
Processed batch 40/4892
Processed batch 41/4892
Pr

In [25]:
df_major.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Attack,klabel
0,0,215,45076,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,203
1,0,162,4528,0,0,0,0,0,1,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11,423
2,0,236,1228,0,0,0,0,0,1,0,...,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,11,360
3,0,233,2032,0,0,0,0,0,1,0,...,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,11,298
4,0,239,486,0,0,0,0,0,1,0,...,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,11,382


In [26]:
cols = list(df_major)
cols.insert(41, cols.pop(cols.index('Attack')))
df_major = df_major.loc[:, cols]

In [27]:
df_major.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,klabel,Attack
0,0,215,45076,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,203,11
1,0,162,4528,0,0,0,0,0,1,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,423,11
2,0,236,1228,0,0,0,0,0,1,0,...,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,360,11
3,0,233,2032,0,0,0,0,0,1,0,...,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,298,11
4,0,239,486,0,0,0,0,0,1,0,...,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,382,11


#### Shifting Attack column to the rightmost

### Apply Sampling

In [28]:
# Define a function to apply the sampling within groups
def typicalSampling(group):
    name = group.name
    frac = 0.018
    return group.sample(frac=frac)

# Initialize an empty DataFrame to store the results
sampled_df = pd.DataFrame()

# Get unique 'klabel' values
unique_labels = df_major['klabel'].unique()

# Define the chunk size
chunk_size = 100

# Process data in chunks
for i in range(0, len(unique_labels), chunk_size):
    # Select a chunk of 'klabel' values
    labels_chunk = unique_labels[i:i + chunk_size]

    # Filter the DataFrame to include only the selected 'klabel' values
    filtered_data = df_major[df_major['klabel'].isin(labels_chunk)]

    # Apply typicalSampling to the filtered data
    sampled_chunk = filtered_data.groupby('klabel').apply(typicalSampling)

    # Append the sampled chunk to the result DataFrame
    sampled_df = pd.concat([sampled_df, sampled_chunk])

    # Print progress information
    print(f"Processed {i + chunk_size} out of {len(unique_labels)} labels")


Processed 100 out of 470 labels
Processed 200 out of 470 labels
Processed 300 out of 470 labels
Processed 400 out of 470 labels
Processed 500 out of 470 labels


In [29]:
sampled_df['Attack'].value_counts()

18    50360
9     19621
11    17386
17      284
5       219
15      174
Name: Attack, dtype: int64

In [30]:
result = sampled_df

In [31]:
result = result.drop(['klabel'], axis=1)
result = pd.concat([result, df_minor], ignore_index=True)

In [32]:
result['Attack'].value_counts()

18    50360
9     19621
11    17386
10     2316
0      2203
21     1020
20      979
17      284
14      264
5       219
15      174
3        53
1        30
6        21
22       20
4        12
16       10
7         9
2         8
8         7
13        4
12        3
19        2
Name: Attack, dtype: int64

In [33]:
result.shape

(95005, 38)

In [34]:
result.to_csv('/content/drive/MyDrive/MachineLearning/MTH-IDS/kddcup99/kddcup99_sampledataset.csv',index=0)