# Data Preprocessing

## Import Libraries

In [4]:
import pandas as pd
import numpy as np

## Read the dataset

In [5]:
df = pd.read_csv('kddcup.csv') 

### First 5 rows of the dataset

In [6]:
df.head()

Unnamed: 0,0,tcp,http,SF,215,45076,0.1,0.2,0.3,0.4,...,0.17,0.00.6,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,0.00.12,0.00.13,normal.
0,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,238,1282,0,0,0,0,...,5,1.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,normal.


In [8]:
df.shape

(4898430, 42)

### Number of columns and its data types

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898430 entries, 0 to 4898429
Data columns (total 42 columns):
 #   Column   Dtype  
---  ------   -----  
 0   0        int64  
 1   tcp      object 
 2   http     object 
 3   SF       object 
 4   215      int64  
 5   45076    int64  
 6   0.1      int64  
 7   0.2      int64  
 8   0.3      int64  
 9   0.4      int64  
 10  0.5      int64  
 11  1        int64  
 12  0.6      int64  
 13  0.7      int64  
 14  0.8      int64  
 15  0.9      int64  
 16  0.10     int64  
 17  0.11     int64  
 18  0.12     int64  
 19  0.13     int64  
 20  0.14     int64  
 21  0.15     int64  
 22  1.1      int64  
 23  1.2      int64  
 24  0.00     float64
 25  0.00.1   float64
 26  0.00.2   float64
 27  0.00.3   float64
 28  1.00     float64
 29  0.00.4   float64
 30  0.00.5   float64
 31  0.16     int64  
 32  0.17     int64  
 33  0.00.6   float64
 34  0.00.7   float64
 35  0.00.8   float64
 36  0.00.9   float64
 37  0.00.10  float64
 38  0.

### Value Count of each attack types

In [5]:
df.Attack.value_counts()

Attack
Benign        2515236
xss           2149308
password       340208
injection      277696
scanning        36205
backdoor        27145
ransomware       5098
mitm              517
ddos              202
dos               145
Name: count, dtype: int64

## Label Encoding to the dataset

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

### Checking the values now

In [8]:
df.Attack.value_counts()

Attack
0    2515236
9    2149308
6     340208
4     277696
8      36205
1      27145
7       5098
5        517
2        202
3        145
Name: count, dtype: int64

## 1. Removal of columns with object data type except the Attack Class
### i.e. Flow id, Src IP and Dest IP

#### Make a copy of attack column

In [9]:
attack_column = df['Attack']

#### Select all columns with data type other than 'object'

In [10]:
numeric_cols = df.select_dtypes(exclude='object').columns.tolist()

#### Drop all 'object' columns

In [11]:
df = df[numeric_cols]

#### Add the attack column again

In [12]:
df['Attack'] = attack_column

## 2. Checking and removal of NULL or INFINITY Values

#### check if there is any null value in any column

In [13]:
df.isnull().any().any()

False

#### replacing any infinity value in any column with null values

In [16]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)

#### removing that values

In [18]:
df = df.dropna()

## 3. Sampling

### Retain the minority class from sampling

In [20]:
df_minor = df[(df['Attack']==5)|(df['Attack']==2)|(df['Attack']==3)]
df_major = df.drop(df_minor.index)

### Mini Batch K-means for clustering

In [26]:
X = df_major.drop(['Attack'],axis=1) 
y = df_major.iloc[:, -1].values.reshape(-1,1)
y=np.ravel(y)

In [41]:
X.shape

(5349719, 80)

In [45]:
from sklearn.cluster import MiniBatchKMeans
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress ConvergenceWarning
warnings.filterwarnings("ignore", category=FutureWarning)

n_clusters = 719  # Define the total number of clusters
batch_size = 1000  # Define the batch size

# Initialize an empty array to store all cluster labels
all_labels = []

# Split your data (X) into batches
num_samples = X.shape[0]
num_batches = int(np.ceil(num_samples / batch_size))

for batch_num in range(num_batches):
    start_idx = batch_num * batch_size
    end_idx = min((batch_num + 1) * batch_size, num_samples)
    
    # Extract the current batch
    batch = X[start_idx:end_idx]
    
    # Apply MiniBatchKMeans clustering to the current batch
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0).fit(batch)
    
    # Append the cluster labels to the list
    all_labels.append(kmeans.labels_)

    # Print progress
    # if batch_num % 10 == 0:  # Adjust the interval as needed
    print(f"Processed batch {batch_num}/{num_batches}")

# After processing all batches, concatenate the labels and assign them to the DataFrame
cluster_labels = np.concatenate(all_labels)

# Assign the cluster labels to the DataFrame
df_major['klabel'] = cluster_labels


Processed batch 0/5350
Processed batch 1/5350
Processed batch 2/5350
Processed batch 3/5350
Processed batch 4/5350
Processed batch 5/5350
Processed batch 6/5350
Processed batch 7/5350
Processed batch 8/5350
Processed batch 9/5350
Processed batch 10/5350
Processed batch 11/5350
Processed batch 12/5350
Processed batch 13/5350
Processed batch 14/5350
Processed batch 15/5350
Processed batch 16/5350
Processed batch 17/5350
Processed batch 18/5350
Processed batch 19/5350
Processed batch 20/5350
Processed batch 21/5350
Processed batch 22/5350
Processed batch 23/5350
Processed batch 24/5350
Processed batch 25/5350
Processed batch 26/5350
Processed batch 27/5350
Processed batch 28/5350
Processed batch 29/5350
Processed batch 30/5350
Processed batch 31/5350
Processed batch 32/5350
Processed batch 33/5350
Processed batch 34/5350
Processed batch 35/5350
Processed batch 36/5350
Processed batch 37/5350
Processed batch 38/5350
Processed batch 39/5350
Processed batch 40/5350
Processed batch 41/5350
Pr

In [47]:
df_major.head()

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack,klabel
0,0,0,0,47814343,5,0,0.0,0.0,0.0,0.0,...,0.0,1038036.0,1038036.0,518725600000000.0,898459000000000.0,1556177000000000.0,16573240.0,0,0,322
1,0,0,0,2033142,2,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1556177000000000.0,0.0,1556177000000000.0,1556177000000000.0,0,0,359
2,0,0,0,82877133,14,0,0.0,0.0,0.0,0.0,...,1711593.0,3942470.0,226402.0,172908500000000.0,518725600000000.0,1556177000000000.0,6036493.0,0,0,8
3,0,0,0,24359,2,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1556177000000000.0,0.0,1556177000000000.0,1556177000000000.0,0,0,359
4,0,0,0,10239351,3,0,0.0,0.0,0.0,0.0,...,0.0,4053975.0,4053975.0,778088400000000.0,1100383000000000.0,1556177000000000.0,6185376.0,0,0,593


In [49]:
# cols = list(df_major)
# cols.insert(81, cols.pop(cols.index('Attack')))
# df_major = df_major.loc[:, cols]

#### Shifting Attack column to the rightmost

### Apply Sampling

In [55]:
# Define a function to apply the sampling within groups
def typicalSampling(group):
    name = group.name
    frac = 0.008
    return group.sample(frac=frac)

# Initialize an empty DataFrame to store the results
sampled_df = pd.DataFrame()

# Get unique 'klabel' values
unique_labels = df_major['klabel'].unique()

# Define the chunk size
chunk_size = 100

# Process data in chunks
for i in range(0, len(unique_labels), chunk_size):
    # Select a chunk of 'klabel' values
    labels_chunk = unique_labels[i:i + chunk_size]
    
    # Filter the DataFrame to include only the selected 'klabel' values
    filtered_data = df_major[df_major['klabel'].isin(labels_chunk)]
    
    # Apply typicalSampling to the filtered data
    sampled_chunk = filtered_data.groupby('klabel').apply(typicalSampling)
    
    # Append the sampled chunk to the result DataFrame
    sampled_df = pd.concat([sampled_df, sampled_chunk])

    # Print progress information
    print(f"Processed {i + chunk_size} out of {len(unique_labels)} labels")


Processed 100 out of 719 labels
Processed 200 out of 719 labels
Processed 300 out of 719 labels
Processed 400 out of 719 labels
Processed 500 out of 719 labels
Processed 600 out of 719 labels
Processed 700 out of 719 labels
Processed 800 out of 719 labels


In [57]:
sampled_df['Attack'].value_counts()

Attack
0    20051
9    17292
6     2681
4     2250
8      290
1      192
7       40
Name: count, dtype: int64

In [None]:
result = result.drop(['klabel'], axis=1)
result = pd.concat([result, df_minor], ignore_index=True)

In [None]:
result['Attack'].value_counts()

In [None]:
result.shape

In [None]:
result.to_csv('CIC-TON-IOT_sampledataset.csv',index=0)