In [2]:
import pandas as pd

# Read data from a CSV file

In [9]:
# Load the attack log, then preview the first five records transposed so
# that each column is shown as a row.
csv_path = 'data/cybersecurity_attacks.csv'
df = pd.read_csv(csv_path)
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
Timestamp,2023-05-30 06:33:58,2020-08-26 07:08:30,2022-11-13 08:23:25,2023-07-02 10:38:46,2023-07-16 13:11:07
Source IP Address,103.216.15.12,78.199.217.198,63.79.210.48,163.42.196.10,71.166.185.76
Destination IP Address,84.9.164.252,66.191.137.154,198.219.82.17,101.228.192.255,189.243.174.238
Source Port,31225,17245,16811,20018,6131
Destination Port,17616,48166,53600,32534,26646
Protocol,ICMP,ICMP,UDP,UDP,TCP
Packet Length,503,1174,306,385,1462
Packet Type,Data,Data,Control,Data,Data
Traffic Type,HTTP,HTTP,HTTP,HTTP,DNS
Payload Data,Qui natus odio asperiores nam. Optio nobis ius...,Aperiam quos modi officiis veritatis rem. Omni...,Perferendis sapiente vitae soluta. Hic delectu...,Totam maxime beatae expedita explicabo porro l...,Odit nesciunt dolorem nisi iste iusto. Animi v...


# List columns

In [10]:
# Display the Index holding every column label of the loaded frame.
df.columns

Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
       'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
       'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
       'Action Taken', 'Severity Level', 'User Information',
       'Device Information', 'Network Segment', 'Geo-location Data',
       'Proxy Information', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source'],
      dtype='object')

# Number of rows and columns

In [7]:
# df.shape is a (rows, columns) tuple: take each dimension by position and
# report both through a single print call, one per line.
num_rows = df.shape[0]
num_cols = df.shape[1]
print(
    f'Number of rows: {num_rows}',
    f'Number of columns: {num_cols}',
    sep='\n',
)

Number of rows: 40000
Number of columns: 25


# Meaning of each column
**1. Timestamp**: The time at which the network activity occurred. <br>
**2. Source IP Address**: The IP address of the device that initiated the network activity.<br>
**3. Destination IP Address**: The IP address of the device that received the network activity.<br>
**4. Source Port**: The port number used by the source IP address.<br>
**5. Destination Port**: The port number used by the destination IP address.<br>
**6. Protocol**: The communication protocol used for the network activity. (TCP, UDP, etc.)<br>
**7. Packet Length**: The size of the packet in bytes.<br>
**8. Packet Type**: Type of packet (data packet, control packet, etc.)<br>
**9. Traffic Type**: The type of traffic (web traffic, email traffic, etc.)<br>
**10. Payload Data**: The data contained in the packet.<br>
**11. Malware Indicators**: Indicators of potentially malicious activity or presence of malware.<br>
**12. Anomaly Scores**: Scores indicating deviations from expected behavior, used for anomaly detection.<br>
**13. Alerts/Warnings**: Notifications or warnings generated by security systems or monitoring tools.<br>
**14. Attack Type**: Type of attack detected or suspected. (DDoS, SQL injection, etc.)<br>
**15. Attack Signature**: Specific patterns or signatures associated with known attacks.<br>
**16. Action Taken**: The response or action taken in response to the network activity.<br>
**17. Severity Level**: The level of severity associated with an alert or event. (Low, Medium, High, etc.)<br>
**18. User Information**: Information about the user involved in the network activity.<br>
**19. Device Information**: Information about the device involved in the network activity. (device type, operating system, etc.)<br>
**20. Network Segment**: The segment or subnet of the network where the activity occurred.<br>
**21. Geo-location Data**: Geographic location data associated with the IP addresses.<br>
**22. Proxy Information**: Information about proxy servers or intermediaries involved in the network activity.<br>
**23. Firewall Logs**: Logs generated by firewall devices indicating allowed or blocked traffic.<br>
**24. IDS/IPS Alerts**: Alerts generated by intrusion detection or prevention systems.<br>
**25. Log Source**: The source or origin of the log entry (name of the logging device or system).<br>

# Check for duplicate rows

In [11]:
# duplicated() yields a boolean mask marking repeated rows; summing the mask
# gives the number of duplicates.
duplicate_mask = df.duplicated()
duplicate_rows = duplicate_mask.sum()
duplicate_rows

0

# Check missing/null values

In [14]:
# Count null entries per column, then list the most affected columns first.
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

IDS/IPS Alerts            20050
Malware Indicators        20000
Firewall Logs             19961
Proxy Information         19851
Attack Type                   0
Geo-location Data             0
Network Segment               0
Device Information            0
User Information              0
Severity Level                0
Action Taken                  0
Attack Signature              0
Timestamp                     0
Source IP Address             0
Anomaly Scores                0
Payload Data                  0
Traffic Type                  0
Packet Type                   0
Packet Length                 0
Protocol                      0
Destination Port              0
Source Port                   0
Destination IP Address        0
Log Source                    0
dtype: int64

#### Missing values in each column
- Alerts/Warnings: 20067
- IDS/IPS Alerts: 20050
- Malware Indicators: 20000
- Firewall Logs: 19961
- Proxy Information: 19851

In [15]:
# Null entries per column expressed as a percentage of the total row count.
row_total = len(df)
df.isna().sum().div(row_total).mul(100)

Timestamp                  0.0000
Source IP Address          0.0000
Destination IP Address     0.0000
Source Port                0.0000
Destination Port           0.0000
Protocol                   0.0000
Packet Length              0.0000
Packet Type                0.0000
Traffic Type               0.0000
Payload Data               0.0000
Malware Indicators        50.0000
Anomaly Scores             0.0000
Attack Type                0.0000
Attack Signature           0.0000
Action Taken               0.0000
Severity Level             0.0000
User Information           0.0000
Device Information         0.0000
Network Segment            0.0000
Geo-location Data          0.0000
Proxy Information         49.6275
Firewall Logs             49.9025
IDS/IPS Alerts            50.1250
Log Source                 0.0000
dtype: float64

# Check data types

In [16]:
# info() prints a summary with each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               40000 non-null  object 
 1   Source IP Address       40000 non-null  object 
 2   Destination IP Address  40000 non-null  object 
 3   Source Port             40000 non-null  int64  
 4   Destination Port        40000 non-null  int64  
 5   Protocol                40000 non-null  object 
 6   Packet Length           40000 non-null  int64  
 7   Packet Type             40000 non-null  object 
 8   Traffic Type            40000 non-null  object 
 9   Payload Data            40000 non-null  object 
 10  Malware Indicators      20000 non-null  object 
 11  Anomaly Scores          40000 non-null  float64
 13  Attack Type             40000 non-null  object 
 14  Attack Signature        40000 non-null  object 
 15  Action Taken            40000 non-null

# Numeric columns statistics