### In this notebook we will go through every feature and the information it provides!

In [119]:
import pandas as pd 

In [120]:
# Column names for the KDDTrain+/KDDTest+ files, in file order. The last two
# entries ('attack', 'level') follow the 41 connection features.
columns = [
    'duration','protocol_type','service','flag','src_bytes','dst_bytes',
    'land','wrong_fragment','urgent','hot','num_failed_logins','logged_in',
    'num_compromised','root_shell','su_attempted','num_root',
    'num_file_creations','num_shells','num_access_files','num_outbound_cmds',
    'is_host_login','is_guest_login','count','srv_count','serror_rate',
    'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate',
    'diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count',
    'dst_host_same_srv_rate','dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
    'dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
    'dst_host_srv_rerror_rate','attack','level'
]

# The files are read with header=None, so the names above are supplied via
# `names`. The separator is passed explicitly so the read does not depend on
# the pandas default.
training_data = pd.read_csv('../Datasets/KDDTrain+.txt', sep=',', names=columns, header=None)
testing_data = pd.read_csv('../Datasets/KDDTest+.txt', sep=',', names=columns, header=None)

# Stack the training rows on top of the testing rows; ignore_index=True gives
# the combined frame a fresh 0..n-1 index.
# NOTE(review): after this step nothing records which file a row came from.
data = pd.concat([training_data, testing_data], ignore_index=True)
data.head(5)



Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [121]:
# List the column labels of the combined frame to confirm the names were applied.
print(data.columns)

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'attack', 'level'],
      dtype='object')


### Duration: the length of the connection, in seconds.

In [122]:
# Count the distinct values that occur in the duration column.
data['duration'].nunique()

### The combined data contains 3424 unique values of data['duration'] (shown in the cell output).


3424

### Protocol Types: 
### Protocol types are the types of rules used when transferring the data from one device to the other. 

### Protocol Types include : 
### 1. TCP: Transmission Control Protocol
### 2. UDP: User Datagram Protocol
### 3. ICMP: Internet Control Message Protocol 

### It is like the guideline for transferring the data! 

In [123]:
# Replace protocol_type with one 0/1 integer indicator column per protocol
# (protocol_type_icmp / protocol_type_tcp / protocol_type_udp).
data = pd.get_dummies(data=data, dtype=int, columns=['protocol_type'])
# Preview the first five rows (the default for head()).
data.head()

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
0,0,ftp_data,SF,491,0,0,0,0,0,0,...,0.0,0.0,0.0,0.05,0.0,normal,20,0,1,0
1,0,other,SF,146,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,normal,15,0,0,1
2,0,private,S0,0,0,0,0,0,0,0,...,0.0,1.0,1.0,0.0,0.0,neptune,19,0,1,0
3,0,http,SF,232,8153,0,0,0,0,0,...,0.04,0.03,0.01,0.0,0.01,normal,21,0,1,0
4,0,http,SF,199,420,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,normal,21,0,1,0


### Service: the application-level service that the connection is trying to access
### For example Port(21) -> FTP
### Port(80) -> HTTP

In [124]:
### Group the raw `service` values into a small number of categories, so the
### information is preserved while adding as few feature columns as possible.

## Web services
web_services = ['http', 'http_443', 'http_8001', 'http_2784']

## File services
file_services = ['ftp', 'ftp_data', 'tftp_u']

## Remote login services
remote_login_services = ['telnet', 'ssh', 'login', 'shell', 'exec', 'klogin', 'kshell', 'remote_job']

## Email services
email_services = ['smtp', 'imap4', 'pop_2', 'pop_3']

## DNS / naming services
dns_services = ['domain', 'domain_u', 'name', 'hostnames']

## ICMP-based services
# NOTE(review): the dataset may contain further ICMP services (e.g. 'urh_i');
# confirm against the raw service values before relying on this group.
icmp_services = ['eco_i', 'ecr_i', 'urp_i', 'tim_i', 'red_i']

## NetBIOS services
netbios_services = ['netbios_ns', 'netbios_dgm', 'netbios_ssn']

## Database / directory services
database_services = ['sql_net', 'ldap']

## Time / diagnostic services
diagnostic_services = [
    'time', 'daytime', 'ntp_u', 'echo', 'discard',
    'systat', 'netstat'
]

## Authentication / user services
auth_services = ['auth', 'finger', 'ident']

## Messaging services
messaging_services = ['IRC', 'nntp', 'nnsp', 'courier']

# Indicator column name -> service values that set it to 1.
# The dict order is the order in which the columns are appended to `data`.
# The column names (including their mixed capitalisation) are kept unchanged
# so that any later cell referring to them keeps working.
service_groups = {
    'Web_services': web_services,
    'File_services': file_services,
    'remote_login_service': remote_login_services,
    'email_service': email_services,
    'dns_service': dns_services,
    'icmp_service': icmp_services,
    'netbios_service': netbios_services,
    'database_service': database_services,
    'diagnostic_service': diagnostic_services,
    'auth_service': auth_services,
    'messaging_service': messaging_services,
}

# Flat list of every service that belongs to one of the groups above.
known_services = [name for members in service_groups.values() for name in members]

# Guard against a service being listed twice, which would set two indicator
# columns for the same row.
assert len(known_services) == len(set(known_services)), \
    "a service appears in more than one group"

# One 0/1 column per group: 1 when the row's service is a member of the group.
for indicator_column, members in service_groups.items():
    data[indicator_column] = data['service'].isin(members).astype(int)

# Everything that is not in any group (e.g. 'private', 'other') is flagged here.
data['other_service'] = (~data['service'].isin(known_services)).astype(int)

## Drop the raw `service` column now that it is represented by the indicator columns

data = data.drop(columns=['service'])


### The reason for creating 12 grouped columns, instead of one-hot encoding the service feature into 70+ columns, is to make things easier for the model. Models such as Linear SVM and Logistic Regression rely on linear relationships in the data, and adding more features makes them slower and increases the running time.

### Doing so decreases the chance of overfitting while still preserving the information in the data.



### Flag: It describes how the connection ended (or what happened during the connection handshake)

### Flag pretty much tells if the connection was normal, failed, or suspicious


In [125]:
### Since all of the unique flag values are important, the column is one-hot encoded in full!

# dtype=int keeps the indicator columns as 0/1 integers, matching the encoding
# used for protocol_type; without it, recent pandas versions produce boolean columns.
data = pd.get_dummies(data, columns=['flag'], prefix='flag', dtype=int)

### Some of the unique values of the flag feature are:
### 1. SF -> Normal Connection 
### 2. S0 -> Connection Attempt, No Response
### 3. REJ -> Connection Rejected

### and many more! 

---
### Source Bytes and Destination Bytes

### Source Bytes: Number of bytes sent from the source 

### Destination Bytes: Number of bytes sent back from the destination 

---
###