# Challenge 2: Malicious URLs

Importing required Libraries:

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

Reading CSV Data

In [2]:
# Load the challenge dataset; the path is relative to the notebook's working directory.
rawdata = pd.read_csv("Dataset_Challenge2.csv")

# Print column dtypes and non-null counts so columns with missing values stand out.
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1781 entries, 0 to 1780
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   URL_LENGTH                 1781 non-null   int64  
 1   NUMBER_SPECIAL_CHARACTERS  1781 non-null   int64  
 2   CONTENT_LENGTH             969 non-null    float64
 3   TCP_CONVERSATION_EXCHANGE  1781 non-null   int64  
 4   DIST_REMOTE_TCP_PORT       1781 non-null   int64  
 5   REMOTE_IPS                 1781 non-null   int64  
 6   APP_BYTES                  1781 non-null   int64  
 7   SOURCE_APP_PACKETS         1781 non-null   int64  
 8   REMOTE_APP_PACKETS         1781 non-null   int64  
 9   APP_PACKETS                1781 non-null   int64  
 10  DNS_QUERY_TIMES            1780 non-null   float64
 11  Type                       1781 non-null   int64  
dtypes: float64(2), int64(10)
memory usage: 167.1 KB


In [3]:
# Preview the first rows of the raw frame to sanity-check the parsed values.
rawdata.head()

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CONTENT_LENGTH,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,APP_PACKETS,DNS_QUERY_TIMES,Type
0,16,7,263.0,7,0,2,700,9,10,9,2.0,1
1,16,6,15087.0,17,7,4,1230,17,19,17,0.0,0
2,16,6,324.0,0,0,0,0,0,0,0,0.0,0
3,17,6,162.0,31,22,3,3812,39,37,39,8.0,0
4,17,6,124140.0,57,2,5,4278,61,62,61,4.0,0


As CONTENT_LENGTH has a high number of null values and a very wide range, and has very low correlation with the other features (as shown below), we will drop this feature from our dataset.

In [4]:
# Pairwise correlation between the columns (pandas' default method); used here to
# judge how strongly CONTENT_LENGTH relates to the other features and to the target.
rawdata.corr()

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CONTENT_LENGTH,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,APP_PACKETS,DNS_QUERY_TIMES,Type
URL_LENGTH,1.0,0.917986,0.129745,-0.038407,-0.039839,-0.046367,-0.026446,-0.042264,-0.033779,-0.042264,-0.068582,0.162104
NUMBER_SPECIAL_CHARACTERS,0.917986,1.0,0.214225,-0.037459,-0.042619,-0.047103,-0.023914,-0.040096,-0.030597,-0.040096,-0.050048,0.280897
CONTENT_LENGTH,0.129745,0.214225,1.0,0.078451,-0.000381,0.004774,0.051202,0.074142,0.091077,0.074142,-0.045644,-0.090852
TCP_CONVERSATION_EXCHANGE,-0.038407,-0.037459,0.078451,1.0,0.555188,0.33108,0.45732,0.997796,0.990848,0.997796,0.349832,-0.040202
DIST_REMOTE_TCP_PORT,-0.039839,-0.042619,-0.000381,0.555188,1.0,0.210188,0.780238,0.558612,0.591188,0.558612,0.259942,-0.082925
REMOTE_IPS,-0.046367,-0.047103,0.004774,0.33108,0.210188,1.0,0.023126,0.361104,0.304683,0.361104,0.548189,-0.078783
APP_BYTES,-0.026446,-0.023914,0.051202,0.45732,0.780238,0.023126,1.0,0.445822,0.468999,0.445822,0.012221,-0.011262
SOURCE_APP_PACKETS,-0.042264,-0.040096,0.074142,0.997796,0.558612,0.361104,0.445822,1.0,0.989285,1.0,0.410843,-0.034414
REMOTE_APP_PACKETS,-0.033779,-0.030597,0.091077,0.990848,0.591188,0.304683,0.468999,0.989285,1.0,0.989285,0.355716,-0.032897
APP_PACKETS,-0.042264,-0.040096,0.074142,0.997796,0.558612,0.361104,0.445822,1.0,0.989285,1.0,0.410843,-0.034414


In [5]:
# Build the cleaned frame in one chain, leaving `rawdata` untouched:
#   1. discard CONTENT_LENGTH (only 969 of 1781 values are present),
#   2. replace any remaining missing values with 0 (DNS_QUERY_TIMES has one).
data = (
    rawdata
    .drop(columns=['CONTENT_LENGTH'])
    .fillna(0)
)

Now, the dataset needs to be scaled to obtain appropriate results from the algorithm. Here, StandardScaler from the Scikit-learn library is used.

In [6]:
# Separate the feature columns from the target label.
dataset = data.drop(columns=['Type'])

# Learn the scaling statistics from the training rows only, so that the mean and
# variance of the held-out test rows never influence preprocessing (data leakage).
# The split arguments deliberately mirror the train/test split performed later in
# this notebook (test_size=0.2, random_state=1, stratified on Type) so that the
# same rows are selected; keep the two calls in sync if either one changes.
train_rows, _ = train_test_split(
    list(range(len(dataset))),
    test_size=0.2,
    random_state=1,
    stratify=data['Type'],
)
scaler = preprocessing.StandardScaler()
scaler.fit(dataset.values[train_rows])

# Apply the training-set scaling to every row. Downstream cells still receive the
# full scaled array, in the original row order, under the same name as before.
scaled_data = scaler.transform(dataset.values)

The dataset is now split into X and y where X contains all the features and y is the target variable.

In [7]:
# Wrap the scaled array in a DataFrame. Scaling was done on `dataset.values`, which
# drops the labels, so restore the feature names and row index from `dataset`;
# this keeps X readable and label-aligned with y.
X = pd.DataFrame(scaled_data, columns=dataset.columns, index=dataset.index)

# Target variable: the Type column of the cleaned frame.
y = data['Type']

The X and y sets are further split into training and testing sets as 80% and 20% respectively.

In [8]:
# Hold out 20% of the rows for testing. The split is stratified on y so both parts
# keep the same class proportions, and seeded so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

Next, the kNN classifier algorithm is employed and tested for various numbers of neighbours to compare the algorithm's performance.

In [9]:
# Create KNN classifier
# k-nearest-neighbours classifier that votes over the 2 closest training samples.
# NOTE(review): with an even k on a two-class problem a 1-1 vote is a tie; confirm
# how ties are resolved and consider comparing against an odd k.
knn = KNeighborsClassifier(n_neighbors=2)

# Train on the training split only; the test split stays unseen until prediction.
# Left as the cell's last expression so the fitted estimator is displayed.
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=2)

In [10]:
# Predict the class label of every held-out test sample with the fitted classifier.
pred = knn.predict(X_test)

The performance of this algorithm can be analyzed using performance metrics from the scikit-learn library:

In [11]:
# Confusion matrix: rows are the true classes, columns are the predicted classes.
test_confusion = confusion_matrix(y_test, pred)

# Per-class precision, recall and F1, plus overall accuracy, on the test split.
test_report = classification_report(y_test, pred)

print(test_confusion)
print(test_report)

[[311   3]
 [ 19  24]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       314
           1       0.89      0.56      0.69        43

    accuracy                           0.94       357
   macro avg       0.92      0.77      0.83       357
weighted avg       0.94      0.94      0.93       357



Thank You!