In [2]:
import pandas as pd

1. Read Data

In [3]:
#read data
df = pd.read_csv('Intrusion Detection.csv')
#encode categorical data
encode_list = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'root_shell', 'su_attempted',
              'is_host_login', 'is_guest_login']
df = pd.get_dummies(df, columns= encode_list)

In [4]:
#select features and target variables 
features = ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'num_compromised', 'num_root',
       'num_file_creations', 'num_shells', 'num_access_files',
       'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate','protocol_type_icmp',
       'protocol_type_tcp', 'protocol_type_udp', 'service_IRC', 'service_X11',
       'service_auth', 'service_domain', 'service_domain_u', 'service_eco_i',
       'service_ecr_i', 'service_finger', 'service_ftp', 'service_ftp_data',
       'service_http', 'service_ntp_u', 'service_other', 'service_pop_3',
       'service_private', 'service_red_i', 'service_shell', 'service_smtp',
       'service_ssh', 'service_telnet', 'service_tftp_u', 'service_tim_i',
       'service_time', 'service_urh_i', 'service_urp_i', 'flag_OTH',
       'flag_REJ', 'flag_RSTO', 'flag_RSTR', 'flag_S0', 'flag_S1', 'flag_S2',
       'flag_S3', 'flag_SF', 'land_0', 'land_1', 'logged_in_0', 'logged_in_1',
       'root_shell_0', 'root_shell_1', 'su_attempted_0', 'su_attempted_1',
       'su_attempted_2', 'is_host_login_0', 'is_guest_login_0',
       'is_guest_login_1']

X=df[features]
y=df['Class']

2. Build a classifier and 3. Determine your model accuracy

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
#building classifier and train the model
clf= RandomForestClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
#determine the accuracy score
print('Accuracy: %.6f'%accuracy_score(y_test, y_pred))

Accuracy: 0.999836


4. Modify data by handling class imbalance

In [6]:
# !pip install imblearn --user

In [7]:
# import SMOTE package
from collections import Counter
from imblearn.over_sampling import SMOTE 

In [8]:
type(X), type(y)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [9]:
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=0)
X_res, y_res = sm.fit_sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))

Original dataset shape Counter({0: 97278, 1: 30})
Resampled dataset shape Counter({0: 97278, 1: 97278})


Now, we have balanced sample, retrain the previous classifier.

In [8]:
#split dataset
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, random_state=0)
#building classifier and train the model
clf= RandomForestClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
#determine the accuracy score
print('Accuracy: %.6f'%accuracy_score(y_test, y_pred))

Accuracy: 0.999979


Summary:
Using unbalanced samples, we found that the accuracy is quite good (0.99936), but it is most likely because the model has bias to the normal condition due to its abundancy. It's like all predictions are normal could still give high accuracy. After we have balanced by SMOTE, accuracy incraesed (0.999979). And more importantly, we are more confident with the model we trained because we have balanced sample.