# Imports

In [1]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Widen pandas' display limits so wide/long tables are rendered in full
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Data preprocessing

Dataset to use: [CIC IoT Dataset 2023](https://unb.ca/cic/datasets/iotdataset-2023.html)

In [2]:
def combine_csv_files(folder_path):
    """
    Combine every CSV file in a folder into a single DataFrame.

    Files are read in sorted filename order so that the row order of the
    result is reproducible (os.listdir returns entries in arbitrary order).

    Args:
    folder_path (str): Path of the folder that contains the CSV files.

    Returns:
    pandas.DataFrame: DataFrame containing the rows of all CSV files,
    re-indexed from 0.

    Raises:
    ValueError: If the folder contains no files ending in '.csv'.
    """

    # Sorted so the concatenation order does not depend on the file system
    csv_filenames = sorted(
        filename for filename in os.listdir(folder_path)
        if filename.endswith('.csv')
    )

    if not csv_filenames:
        # pd.concat would also raise ValueError on an empty list, but its
        # message does not say which folder was empty
        raise ValueError(f"No CSV files found in {folder_path!r}")

    # Read each file into its own dataframe
    all_dataframes = [
        pd.read_csv(os.path.join(folder_path, filename))
        for filename in csv_filenames
    ]

    # Combine all dataframes into a single one with a fresh index
    return pd.concat(all_dataframes, ignore_index=True)

# Load all CSV files found in 'dataset/CICIoT2023' into one DataFrame
df = combine_csv_files('dataset/CICIoT2023')

# Exploratory Data Analysis

In [3]:
# Dataset shape
df.shape

(46686579, 47)

In [4]:
# Dataset columns
df.columns

Index(['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'label'], dtype='object')

In [5]:
# Dataset info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46686579 entries, 0 to 46686578
Data columns (total 47 columns):
 #   Column           Dtype  
---  ------           -----  
 0   flow_duration    float64
 1   Header_Length    float64
 2   Protocol Type    float64
 3   Duration         float64
 4   Rate             float64
 5   Srate            float64
 6   Drate            float64
 7   fin_flag_number  float64
 8   syn_flag_number  float64
 9   rst_flag_number  float64
 10  psh_flag_number  float64
 11  ack_flag_number  float64
 12  ece_flag_number  float64
 13  cwr_flag_number  float64
 14  ack_count        float64
 15  syn_count        float64
 16  fin_count        float64
 17  urg_count        float64
 18  rst_count        float64
 19  HTTP             float64
 20  HTTPS            float64
 21  DNS              float64
 22  Telnet           float64
 23  SMTP             float64
 24  SSH              float64
 25  IRC              float64
 26  TCP              float64
 27  UDP       

In [6]:
# Dataset description
df.describe()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ece_flag_number,cwr_flag_number,ack_count,syn_count,fin_count,urg_count,rst_count,HTTP,HTTPS,DNS,Telnet,SMTP,SSH,IRC,TCP,UDP,DHCP,ARP,ICMP,IPv,LLC,Tot sum,Min,Max,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0,46686580.0
mean,5.765449,76705.96,9.06569,66.35072,9064.057,9064.057,5.463949e-06,0.08657207,0.2073353,0.09050473,0.08775006,0.1234317,1.477941e-06,7.282607e-07,0.09054283,0.3303579,0.09907672,6.239824,38.46812,0.04823423,0.05509922,0.00013068,2.141943e-08,6.42583e-08,4.093253e-05,1.49936e-07,0.5738343,0.2119176,1.713555e-06,6.618604e-05,0.1637216,0.9998873,0.9998873,1308.323,91.60735,181.9634,124.6688,33.32481,124.6916,83182530.0,9.498489,13.12182,47.09498,30724.36,0.0964376,141.5124
std,285.0342,461331.7,8.945533,14.01919,99562.49,99562.49,0.007250766,0.281207,0.4053978,0.2869035,0.2829311,0.3289321,0.001215705,0.0008533816,0.2864314,0.6635354,0.3271164,71.85245,325.3847,0.2142608,0.2281738,0.01143079,0.0001463538,0.0002534922,0.006397723,0.0003872157,0.4945185,0.4086668,0.001309027,0.008135211,0.3700227,0.01061485,0.01061485,2613.303,139.6953,524.0309,240.9915,160.3357,241.5493,17047350.0,0.8191532,8.628579,226.7696,323710.7,0.233001,21.06831
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,42.0,42.0,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.091856,2.091856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,525.0,50.0,50.0,50.0,0.0,50.0,83071570.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.75423,15.75423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,83124520.0,9.5,10.3923,0.0,0.0,0.0,141.55
75%,0.1051381,280.555,14.33,64.0,117.3848,117.3848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,567.54,54.0,55.26,54.04973,0.3719096,54.06,83343910.0,9.5,10.39671,0.5059213,1.344216,0.08,141.55
max,394357.2,9907148.0,47.0,255.0,8388608.0,8388608.0,29.71522,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.7,12.87,248.32,4401.7,9613.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,127335.8,13583.0,49014.0,13583.0,12385.24,13583.0,167639400.0,15.0,164.8211,17551.27,154902200.0,1.0,244.6


In [7]:
# Show only non numerical columns names
df.select_dtypes(exclude='number').columns

Index(['label'], dtype='object')

In [8]:
# One-hot encode the categorical 'label' column and append the indicator columns to the dataframe.
# dtype is pinned to uint8 (the default before pandas 2.0) so the indicators are 0/1 integers on
# every pandas version; pandas >= 2.0 would otherwise create bool columns.
# NOTE: this cell replaces `df` and removes 'label', so re-running it without reloading the
# data raises KeyError.
new_labels = pd.get_dummies(df['label'], dtype='uint8')
df = pd.concat([df.drop(columns=['label']), new_labels], axis=1)

In [9]:
df.columns

Index(['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'Backdoor_Malware', 'BenignTraffic', 'BrowserHijacking', 'CommandInjection', 'DDoS-ACK_Fragmentation', 'DDoS-HTTP_Flood', 'DDoS-ICMP_Flood', 'DDoS-ICMP_Fragmentation', 'DDoS-PSHACK_Flood', 'DDoS-RSTFINFlood', 'DDoS-SYN_Flood', 'DDoS-SlowLoris', 'DDoS-SynonymousIP_Flood', 'DDoS-TCP_Flood', 'DDoS-UDP_Flood', 'DDoS-UDP_Fragmentation', 'DNS_Spoofing', 'DictionaryBruteForce', 'DoS-HTTP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-UDP_Flood', 'MITM-ArpSpoofing',
       'Mi

In [10]:
# Sum of null values
df.isnull().sum().sum()

0

In [11]:
# Pairwise correlation matrix between the columns of df (non-numeric columns are skipped)
corr = df.corr(numeric_only=True)

In [12]:
# Keep every row of the correlation matrix, but only the one-hot label columns
corr.loc[:, [
    'Backdoor_Malware', 'BenignTraffic', 'BrowserHijacking', 'CommandInjection',
    'DDoS-ACK_Fragmentation', 'DDoS-HTTP_Flood', 'DDoS-ICMP_Flood',
    'DDoS-ICMP_Fragmentation', 'DDoS-PSHACK_Flood', 'DDoS-RSTFINFlood',
    'DDoS-SYN_Flood', 'DDoS-SlowLoris', 'DDoS-SynonymousIP_Flood',
    'DDoS-TCP_Flood', 'DDoS-UDP_Flood', 'DDoS-UDP_Fragmentation',
    'DNS_Spoofing', 'DictionaryBruteForce',
    'DoS-HTTP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-UDP_Flood',
    'MITM-ArpSpoofing',
    'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain',
    'Recon-HostDiscovery', 'Recon-OSScan', 'Recon-PingSweep', 'Recon-PortScan',
    'SqlInjection', 'Uploading_Attack', 'VulnerabilityScan', 'XSS',
]]

Unnamed: 0,Backdoor_Malware,BenignTraffic,BrowserHijacking,CommandInjection,DDoS-ACK_Fragmentation,DDoS-HTTP_Flood,DDoS-ICMP_Flood,DDoS-ICMP_Fragmentation,DDoS-PSHACK_Flood,DDoS-RSTFINFlood,DDoS-SYN_Flood,DDoS-SlowLoris,DDoS-SynonymousIP_Flood,DDoS-TCP_Flood,DDoS-UDP_Flood,DDoS-UDP_Fragmentation,DNS_Spoofing,DictionaryBruteForce,DoS-HTTP_Flood,DoS-SYN_Flood,DoS-TCP_Flood,DoS-UDP_Flood,MITM-ArpSpoofing,Mirai-greeth_flood,Mirai-greip_flood,Mirai-udpplain,Recon-HostDiscovery,Recon-OSScan,Recon-PingSweep,Recon-PortScan,SqlInjection,Uploading_Attack,VulnerabilityScan,XSS
flow_duration,0.011885,0.018318,0.003621,0.007193,-0.001459,0.002498,-0.008155,-0.001792,-0.006118,-0.006114,-0.00594,0.001954,-0.004346,-0.006344,-0.006773,-0.001218,0.083374,0.040046,0.000918,-0.001992,-0.002736,-0.0052,0.012089,-0.002934,-0.002054,-0.001412,0.025471,0.066309,0.012839,0.039608,0.017685,0.002889498,0.018811,0.009163
Header_Length,-0.000443,0.313841,0.00597,0.01544,-0.011698,-0.003827,-0.070375,-0.015459,-0.051368,-0.050557,-0.051146,0.004808,-0.047753,-0.054081,-0.040403,0.032742,0.079901,-0.001013,-0.005609,-0.034417,-0.039709,-0.034352,0.339922,-0.023442,-0.019965,0.443464,-0.005782,0.00813,-0.0005,7.3e-05,-0.001328,-0.0006973918,0.01173,-0.001024
Protocol Type,-0.000241,-0.027288,-0.001488,-0.00102,-0.02617,-0.007823,-0.383815,-0.083513,-0.106048,-0.105258,-0.105378,-0.006335,-0.09876,-0.111545,0.317674,0.064466,0.007336,-0.002034,-0.01264,-0.072101,-0.083842,0.236784,-0.003033,0.610259,0.521252,0.12047,-0.010608,-0.008138,-0.001321,-0.007234,1e-06,-3.727285e-05,-0.000747,-0.000254
Duration,0.025228,0.538323,0.029306,0.01989,-0.005191,-0.000253,-0.065908,-0.006347,-0.044601,-0.048804,-0.047524,0.026235,-0.045973,-0.051662,-0.047327,-0.002904,0.114315,0.031946,0.0019,-0.024211,-0.035791,0.005955,0.200305,-0.013426,-0.005765,-0.013641,0.141331,0.091484,0.005885,0.075444,0.017503,0.0145777,0.051074,0.024652
Rate,-0.000636,-0.011089,-0.000804,-0.000824,0.002498,-0.000449,0.011587,0.004657,-0.009823,0.002942,-0.018139,0.000389,-0.019361,-0.007579,0.031538,-0.005282,-0.004521,-0.001295,-0.001665,-0.011872,-0.005637,0.028854,-0.005828,0.003383,0.000807,-0.008812,-0.004196,-0.003284,-0.000514,-0.003042,-0.000765,-0.000404036,-0.002108,-0.000683
Srate,-0.000636,-0.011089,-0.000804,-0.000824,0.002498,-0.000449,0.011587,0.004657,-0.009823,0.002942,-0.018139,0.000389,-0.019361,-0.007579,0.031538,-0.005282,-0.004521,-0.001295,-0.001665,-0.011872,-0.005637,0.028854,-0.005828,0.003383,0.000807,-0.008812,-0.004196,-0.003284,-0.000514,-0.003042,-0.000765,-0.000404036,-0.002108,-0.000683
Drate,-6e-06,-0.000117,-8e-06,-8e-06,-5.9e-05,-1.9e-05,-0.000322,-7.5e-05,-0.000234,-0.000232,-0.000233,-1.7e-05,0.002608,-0.000246,-0.000273,-5.9e-05,-4.7e-05,-1.3e-05,-3e-05,-0.000161,-0.000186,-0.000208,-6.1e-05,-0.000111,-9.6e-05,-0.000105,-4e-05,-3.5e-05,-5e-06,-3.2e-05,-8e-06,-3.902427e-06,-2.1e-05,-7e-06
fin_flag_number,-0.002556,-0.047779,-0.003449,-0.003314,-0.024132,-0.007043,-0.131465,-0.030456,-0.095456,0.999222,-0.095001,-0.006585,-0.088963,-0.100519,-0.111481,-0.024209,-0.019049,-0.00505,-0.012084,-0.065465,-0.075844,-0.085162,-0.025046,-0.045357,-0.039382,-0.042931,-0.016535,-0.013866,-0.002121,-0.012671,-0.003263,-0.001594277,-0.008715,-0.002794
syn_flag_number,-0.004246,-0.079364,-0.00572,-0.0055,-0.040089,-0.002854,-0.218399,-0.050595,-0.158578,-0.157526,0.59996,-0.011459,0.564303,-0.166988,-0.1852,-0.040218,-0.031636,-0.008462,-0.002548,0.401722,-0.125998,-0.141476,-0.041545,-0.07535,-0.065424,-0.07132,0.00357,-0.011419,-0.003537,-0.004926,-0.005421,-0.002648523,-0.01379,-0.004642
rst_flag_number,-0.002619,-0.048934,-0.003527,-0.003396,-0.021321,-0.001915,-0.134708,-0.031203,-0.080273,0.97516,-0.093993,-0.006085,-0.091158,-0.102996,-0.114231,-0.024806,-0.019402,-0.005273,0.010697,-0.051699,-0.075047,-0.087262,-0.025616,-0.046475,-0.04035,-0.043987,0.008302,-0.001332,-0.002196,0.005227,-0.003344,-0.001633606,-0.00879,-0.002863


In [13]:
df.head(10)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ece_flag_number,cwr_flag_number,ack_count,syn_count,fin_count,urg_count,rst_count,HTTP,HTTPS,DNS,Telnet,SMTP,SSH,IRC,TCP,UDP,DHCP,ARP,ICMP,IPv,LLC,Tot sum,Min,Max,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,Backdoor_Malware,BenignTraffic,BrowserHijacking,CommandInjection,DDoS-ACK_Fragmentation,DDoS-HTTP_Flood,DDoS-ICMP_Flood,DDoS-ICMP_Fragmentation,DDoS-PSHACK_Flood,DDoS-RSTFINFlood,DDoS-SYN_Flood,DDoS-SlowLoris,DDoS-SynonymousIP_Flood,DDoS-TCP_Flood,DDoS-UDP_Flood,DDoS-UDP_Fragmentation,DNS_Spoofing,DictionaryBruteForce,DoS-HTTP_Flood,DoS-SYN_Flood,DoS-TCP_Flood,DoS-UDP_Flood,MITM-ArpSpoofing,Mirai-greeth_flood,Mirai-greip_flood,Mirai-udpplain,Recon-HostDiscovery,Recon-OSScan,Recon-PingSweep,Recon-PortScan,SqlInjection,Uploading_Attack,VulnerabilityScan,XSS
0,0.037456,15099.0,17.0,64.0,10001.102371,10001.102371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,525.0,50.0,50.0,50.0,0.0,50.0,83102150.0,9.5,10.0,0.0,0.0,0.0,141.55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,83331770.0,9.5,10.392305,0.0,0.0,0.0,141.55,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.010346,9662.5,17.0,64.0,21380.056228,21380.056228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,525.0,50.0,50.0,50.0,0.0,50.0,83098790.0,9.5,10.0,0.0,0.0,0.0,141.55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.0,54.0,6.0,64.0,241.333973,241.333973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,82951120.0,9.5,10.392305,0.0,0.0,0.0,141.55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.195109,95.58,6.0,64.0,6.762174,6.762174,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,83365400.0,9.5,10.392305,0.0,0.0,0.0,141.55,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0.0,54.0,6.0,64.0,1.502265,1.502265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,83067230.0,9.5,10.392305,0.0,0.0,0.0,141.55,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0.0,54.0,6.0,64.0,60.667438,60.667438,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,83348610.0,9.5,10.392305,0.0,0.0,0.0,141.55,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0.0,54.0,6.0,64.0,163.291443,163.291443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,83034000.0,9.5,10.392305,0.0,0.0,0.0,141.55,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0.0,0.0,1.0,64.0,2.062152,2.062152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,441.0,42.0,42.0,42.0,0.0,42.0,83149750.0,9.5,9.165151,0.0,0.0,0.0,141.55,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0.036378,1618.78,1.05,64.0,46.947385,46.947385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,1.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,470.44,42.0,71.44,43.510737,6.49508,56.72,83124650.0,9.5,9.291007,9.296842,2160.781828,0.02,141.55,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


TODO: 
- Improve the correlation matrix so that it only shows the correlation between each feature and the labels; the feature-to-feature correlations are too much information to read.
- Add more plots showing the distributions of the features and of the labels.
- The data is already fairly clean, but there is room for some feature engineering.