In [None]:
# Import all dependencies
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show every column when a wide DataFrame is rendered
pd.options.display.max_columns = None

# Load the raw phishing dataset from the local archive folder
df = pd.read_csv('archive/dataset_phishing.csv')

## Before we begin exploring the data, let's quickly analyze some important factors of the data

### Let's label legitimate = 1 and phishing = 0

In [None]:
# Encode 'status' in place as a binary label: 1 when the text contains 'legitimate', else 0.
# The dtype guard makes this cell safe to run more than once: after the first run the
# column is already numeric, and calling .str on it again would raise AttributeError.
if not pd.api.types.is_numeric_dtype(df['status']):
    df['status'] = df['status'].str.contains('legitimate').astype(int)

# Preview a few rows instead of rendering the whole frame
df.head()

#### The dataset is expected to contain exactly 50% phishing URLs and 50% legitimate URLs. The following code extracts only the URL and status columns into a separate DataFrame, which is then saved to its own CSV file

In [None]:
# Keep only the URL text and its label for the standalone export
function_df = df.loc[:, ['url', 'status']]
function_df.head()

### We save this CSV to be our function.csv file

In [None]:
# Write the url/status pairs to disk without the index column, UTF-8 encoded
function_df.to_csv(
    "archive/function.csv",
    index=False,
    encoding='UTF-8',
)

In [None]:
# Preview the first rows of df
df.head()

In [None]:
# URLs whose status label is 1 (legitimate), as a plain Python list
legitimate_list = df.loc[df['status'] == 1, 'url'].tolist()

In [None]:
# URLs whose status label is 0 (phishing), as a plain Python list
phishing_list = df.loc[df['status'] == 0, 'url'].tolist()

### Below we create a DataFrame that puts all the phishing URLs in one column and all the legitimate URLs in another column

In [None]:
# Show phishing and legitimate URLs side by side.
# Each list is wrapped in a Series so the frame can still be built when the two classes
# have different sizes (the shorter column is padded with NaN); a dict of plain lists of
# unequal length would raise ValueError.
pd.DataFrame({
    'Phishing': pd.Series(phishing_list),
    'Legitimate': pd.Series(legitimate_list),
})

### We are dropping the 'url' column so all of our data are numeric

In [None]:
# Drop the raw 'url' text column.
# errors='ignore' keeps the cell re-runnable: df is reassigned here, so on a second run
# the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['url'], errors='ignore')

# Preview a few rows instead of rendering the whole frame
df.head()

## Save the dataset for ML use, comes later in the code

In [None]:
# Persist the label-encoded, url-free frame for the modelling section
df.to_csv(
    "archive/dataset_phishing_functionapplied.csv",
    index=False,
    encoding='UTF-8',
)

# Exploring the dataset

In [None]:
# Report the dimensions of df
n_rows, n_cols = df.shape
print("Number of rows: ", n_rows)
print("Number of columns: ", n_cols)

In [None]:
# Preview the first 5 rows of df (head() defaults to 5)
df.head()

In [None]:
# Summarise the columns of df: names, non-null counts and dtypes
df.info()

In [None]:
# Count missing values per column, largest counts first
df.isnull().sum().sort_values(ascending=False)

### Observation: The dataset has no missing values. All columns are numeric and either integers or float. Result: No cleaning of missing data needed and data types look correct.

### Looking for Duplicated Values

In [None]:
# Flag duplicated rows (DataFrame.duplicated compares whole rows, not columns).
# Sorting descending puts the True flags first, so a short preview is enough to see
# whether any duplicates exist, instead of printing the entire Series.

duplicates = df.duplicated().sort_values(ascending=False)
duplicates.head(10)

In [None]:
# Boolean mask marking each row that repeats an earlier row
# (the first occurrence of a repeated row is not flagged)

duplicates = df.duplicated()

# The number of True entries in the mask is the number of duplicated rows.
# (A bare `df[duplicates]` in the middle of a cell displays nothing, so it was removed.)
print("Number of duplicated rows: ", int(duplicates.sum()))

In [None]:
# Export the duplicated rows (selected with the `duplicates` mask) to an Excel workbook

df.loc[duplicates].to_excel(excel_writer=r'archive/duplicates.xlsx')

In [None]:
# Keep the first occurrence of every row and discard later repeats;
# df itself is left untouched and the result is stored as cleaned_data

cleaned_data = df.loc[~df.duplicated()]

In [None]:
# Sanity check: selecting with the duplicate mask should now return an empty frame
check = cleaned_data.duplicated()
cleaned_data.loc[check]

### Observation: The dataset contained 174 duplicated rows. All duplicates were removed from the source dataset. The new dataframe without duplicate values is called "cleaned_data".

### Exploring summary statistics:

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
# of the de-duplicated data

cleaned_data.describe()

### Observation: 
length_url, length_hostname, nb_dots, nb_hyphens, nb_and, nb_eq, nb_underscore, nb_percent, nb_slash, nb_semicolumn, nb_space, length_words_raw, char_repeat, shortest_words_raw, shortest_word_host, shortest_word_path, longest_words_raw, longest_word_host, longest_word_path, avg_words_raw, avg_word_host, avg_word_path, phish_hints, nb_hyperlinks, nb_extCSS, domain_registration_length and page_rank have high maximum values relative to their percentiles.
domain_registration_length and domain_age contain negative values.


### Exploratory Data Analysis

#### Analyzing length_url

In [None]:
# Boxplot of length_url: points beyond the whiskers are candidate outliers

URL_length = cleaned_data['length_url']
fig, ax = plt.subplots()
sns.boxplot(data=URL_length, ax=ax)
plt.show()

In [None]:
# Violin plot of length_url to show the shape of its distribution

URL_length = cleaned_data['length_url']
fig, ax = plt.subplots()
sns.violinplot(data=URL_length, ax=ax)
plt.show()

In [None]:
# Histogram of length_url using 20 bins

fig, ax = plt.subplots()
sns.histplot(data=cleaned_data['length_url'], bins=20, ax=ax)
ax.set_title('Distribution of length_url')
plt.show()

In [None]:
# Find length_url outliers with the 1.5 * IQR rule: any value below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR is treated as an outlier.

from scipy.stats import iqr

# Store the result under its own name. Rebinding `iqr` to the computed number would
# shadow the imported function and make this cell raise TypeError when run a second time.
length_url_iqr = iqr(cleaned_data['length_url'])
lower = np.quantile(cleaned_data['length_url'], 0.25) - 1.5 * length_url_iqr
upper = np.quantile(cleaned_data['length_url'], 0.75) + 1.5 * length_url_iqr

print(lower)
print(upper)

# Rows whose length_url falls outside the [lower, upper] fences
outliers = cleaned_data[(cleaned_data['length_url'] < lower) | (cleaned_data['length_url'] > upper)]

outliers

In [None]:
# Report how many rows were flagged as length_url outliers
n_outliers = len(outliers)
print('There are ', n_outliers, 'outliers for length_url in the dataset')

#### Observation: Some URLs have a length over 1000.  The histogram shows that the vast majority of values are in the low end of length. Based on a statistical formula, all values above a length of 128 are considered outliers. Further discussion needed to keep them or not.

#### Analyzing length hostname:

In [None]:
# Boxplot of length_hostname: points beyond the whiskers are candidate outliers

hostname_length = cleaned_data['length_hostname']
fig, ax = plt.subplots()
sns.boxplot(data=hostname_length, ax=ax)
plt.show()

In [None]:
# Histogram of length_hostname using 20 bins

fig, ax = plt.subplots()
sns.histplot(data=cleaned_data['length_hostname'], bins=20, ax=ax)
ax.set_title('Distribution of length_hostname')
plt.show()

#### Analyzing  nb_dots:

In [None]:
# Boxplot of nb_dots: points beyond the whiskers are candidate outliers.
# The series gets its own name; it was previously stored as `hostname_length`,
# which mislabelled it and overwrote the real hostname-length series.

nb_dots_values = cleaned_data['nb_dots']
sns.boxplot(data=nb_dots_values)
plt.show()

In [None]:
# Histogram of nb_dots using 20 bins

fig, ax = plt.subplots()
sns.histplot(data=cleaned_data['nb_dots'], bins=20, ax=ax)
ax.set_title('Distribution of nb_dots')
plt.show()

#### Analyzing longest_words_raw

In [None]:
# Boxplot of longest_words_raw: points beyond the whiskers are candidate outliers.
# The series gets its own name; it was previously stored as `hostname_length`,
# which mislabelled it and overwrote the real hostname-length series.

longest_words_raw_values = cleaned_data['longest_words_raw']
sns.boxplot(data=longest_words_raw_values)
plt.show()

In [None]:
# Histogram of longest_words_raw using 20 bins

fig, ax = plt.subplots()
sns.histplot(data=cleaned_data['longest_words_raw'], bins=20, ax=ax)
ax.set_title('Distribution of longest_words_raw')
plt.show()

#### Observation: As with length_url, there are also rows with outlier values for longest_words_raw, nb_dots and length_hostname.

In [None]:
# Exporting example of outliers for url_length to Excel
# outliers.to_excel(r'outliers_url_length.xlsx')

In [None]:
# Exporting cleaned data to CSV

# cleaned_data.to_csv(r'phishing_dataset_cleaned.csv')

# Machine Learning Models

In [None]:
# Import our dependencies
import sklearn.linear_model as lm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
import pandas as pd
import tensorflow as tf

In [None]:
# Load the label-encoded, url-free dataset that this notebook wrote to disk earlier.
# NOTE(review): that file is written before duplicates are removed into `cleaned_data`,
# so the models below are trained on data that still contains the duplicated rows —
# confirm this is intended.
original_df = pd.read_csv(filepath_or_buffer='archive/dataset_phishing_functionapplied.csv')
original_df.head()

#### Let's checkout the columns in the dataset

In [None]:
# Column labels of the modelling dataset
original_df.columns

In [None]:
# List the dtype of every column to confirm they are all numeric (no categorical values)
list(original_df.dtypes)

In [None]:
# Second check that all columns are numeric: info() lists each column's dtype and non-null count
original_df.info()

### Separating and assigning features to X & y

In [None]:
# Target vector: the 'status' label column
y = original_df['status']

# Feature matrix: every remaining column
X = original_df.drop(columns=['status'])

(X.shape, y.shape)

In [None]:
# Keep the feature names; they are reused later to label the feature importances
columns = X.columns
columns

In [None]:
# Preview the feature matrix (first rows only rather than the whole frame)
X.head()

In [None]:
# Target labels for the URLs, where legitimate = 1 and phishing = 0
# (first rows only rather than the whole Series)
y.head()

In [None]:
# Shuffle the rows of original_df and renumber the index.
# random_state makes the shuffle reproducible under Restart & Run All.
#
# NOTE(review): X and y were already extracted from original_df in an earlier cell, so
# this shuffle does not reorder them. train_test_split in the next cell shuffles by
# default, so the train/test sets are still randomised.
original_df = original_df.sample(frac=1, random_state=78).reset_index(drop=True)
original_df.head(10)

In [None]:
# Split training/test datasets.
# test_size must be a float to mean a proportion: an integer is read as an absolute
# number of rows, so the previous value of 50 held out only 50 samples rather than
# 50% of the data. stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=78,
    stratify=y,
)
# random_state was originally 42

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training features only, so the scaling parameters
# are not influenced by the test set
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Examine the output of StandardScaler.transform for the training features
X_train_scaled

# Random Forest Classifier

A Random Forest classifier is a type of ensemble learning model that combines multiple smaller models into a more robust and accurate model. Random forest models use a number of weak learner algorithms (such as decision trees) and combine their outputs to make a final classification decision. In that respect they are similar to their neural network counterparts. Random forest models are among the most commonly used models because of their robustness and scalability. Both the output and the feature selection of random forest models are easy to interpret, and they can easily handle outliers and nonlinear data.

Random forest algorithms are very beneficial because they:
- are robust against overfitting as all of those weak learners are trained on different pieces of the data
- can be used to rank the importance of input values in a natural way.
- can handle thousands of input variables without variable deletion.
- are robust to outliers and nonlinear data. 
- run efficiently using large datasets.

##### n=128 estimators is the largest value of estimators we would use in a model. To create our random forest classifier model and test the performance, the following code is required:

In [None]:
# Build a forest of 128 trees with a fixed seed
rf_model = RandomForestClassifier(random_state=78, n_estimators=128)

# Other hyperparameters that could be tuned: max_depth, min_samples_split, max_features

# Train on the scaled training data (fit returns the estimator itself)
rf_model.fit(X_train_scaled, y_train)

# Predict the held-out set and report the accuracy to three decimals
y_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f" Random forest predictive accuracy: {rf_accuracy:.3f}")

## Observation: The test size of the data was separated 50% and it gave us a higher accuracy this way. 98% compared to 96%. See below for the random forest classifier accuracy.

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Plot the confusion matrix, labelling the axes with the classes the forest learned.
# The object is named cm_display rather than `display` so it does not shadow IPython's
# built-in display() function for the rest of the notebook session.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_model.classes_)
cm_display.plot()
# plt.savefig("archive/rfmodel_confusion_matrix_best.png")
plt.show()

### Confusion Matrix Analysis:

The real risk (downside) is when a phishing url is labeled as legitimate. That is the top right quadrant of the image above. This means it has created a false positive.

There is also opportunity loss when a legitimate url is labeled as phishing. That is the bottom left quadrant of the image above. This means it has created a false negative.

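Because "positive" here means legitimate (label 1), the top left quadrant (the True Negatives) contains URLs that are actually phishing and were predicted as phishing. The bottom right quadrant (the True Positives) contains URLs that are actually legitimate and were predicted as legitimate.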

### What makes a good Confusion Matrix?

Good Confusion Matrix = FP < FN.

What does this mean?

This means the user would rather have legitimate URLs accidentally labeled as phishing (higher FN) than phishing URLs accidentally labeled as legitimate (lower FP).

In [None]:
# Per-class precision, recall and F1 for the random forest predictions
rf_report = classification_report(y_test, y_pred)
print(rf_report)

In [None]:
# Feature importances reported by the fitted random forest
importances = rf_model.feature_importances_

# Pair every feature name with its importance, rank from most to least important and
# renumber the index. Each step returns a new frame; the earlier inplace=True calls
# returned None, so assigning their results to feature_importance stored nothing useful.
feature_importances_df = (
    pd.DataFrame({
        'feature': columns,
        'importance': importances
    })
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)

# Keep the ten most important features for display and plotting
feature_importance = feature_importances_df.head(10)
feature_importance

In [None]:
# Full ranking of (importance, feature name) pairs, most important first
importance_pairs = list(zip(rf_model.feature_importances_, columns))
importance_pairs.sort(reverse=True)
importance_pairs

In [None]:
# Names of the top-ranked features
feature_importance.loc[:, 'feature']

In [None]:
# Horizontal bar chart of the top ten feature importances

ax = feature_importance.plot.barh(x="feature", y="importance")
ax.set_title("Top Ten Feature Importances")
# plt.savefig("archive/rf_feature_importance.png")

# End with plt.show() like the other plotting cells, so the cell renders only the
# figure and not the repr of the title's Text object
plt.show()

## Deep Learning Model

Here we test our dataset with a deep learning model to compare it with the random forest classifier.

In [None]:
# Define the deep neural network: the number of input features and the number of
# hidden nodes in each layer, built with the Keras Sequential API.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = number_input_features * 3
hidden_nodes_layer2 = number_input_features
# Layer sizes must be whole numbers. Multiplying by 0.50 produced a float (e.g. 43.5),
# which, depending on the Keras version, is either silently truncated or rejected.
# Integer division gives the same truncated size explicitly; max() guards against 0 units.
hidden_nodes_layer3 = max(1, number_input_features // 2)

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn.add(
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="sigmoid")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(
    units=hidden_nodes_layer2,
    activation="sigmoid")
)

# Third hidden layer
nn.add(tf.keras.layers.Dense(
    units=hidden_nodes_layer3,
    activation="sigmoid"
))

# Output layer: a single sigmoid unit for the binary label
nn.add(tf.keras.layers.Dense(
    units=1,
    activation="sigmoid"
))

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training data for 50 epochs; keep the returned history
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=50)

In [None]:
# Score the network on the scaled test data; evaluate returns the loss followed by the metric
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Collect the per-epoch training history in a DataFrame indexed by epoch number (1-based)
n_epochs = len(fit_model.history["loss"])
history_df = pd.DataFrame(fit_model.history, index=range(1, n_epochs + 1))

# Training loss per epoch
history_df.plot(y="loss")

In [None]:
# Plot the training accuracy recorded for each epoch
history_df.plot(y="accuracy")

## Model Assessment:

According to the accuracy metric, the neural network model was able to correctly classify about 98% of the test URLs.

Loss: 0.11522267115302384, Accuracy: 0.9800000190734863

## Use a Different Activation Function

It is important to use an activation function that matches the complexity of the input data. If we wanted to rank the four most-used activation functions by data complexity and ideal use case, the order would be as follows:

- The sigmoid function values are normalized to a probability between 0 and 1, which is ideal for binary classification (like our output classification)
- The tanh function can be used for classification or regression, and it expands the range between -1 and 1.
- The ReLU function is ideal for looking at positive nonlinear input data for classification or regression.
- The Leaky ReLU function is a good alternative for nonlinear input data with many negative inputs.

In [None]:
# Second deep neural network with fixed hidden-layer sizes (50, 25, 30), again built
# with the Keras Sequential API.
# NOTE(review): every layer still uses the sigmoid activation; only the layer sizes
# differ from the first network — confirm whether another activation was intended.
number_input_features = X_train.shape[1]
hidden_nodes_layer1, hidden_nodes_layer2, hidden_nodes_layer3 = 50, 25, 30

nn_new = tf.keras.models.Sequential([
    # First hidden layer (also declares the input width)
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="sigmoid",
    ),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"),
    # Third hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"),
    # Output layer: a single sigmoid unit for the binary label
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn_new.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
nn_new.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Fit on the scaled training data for 50 epochs; keep the returned history
fit_model_new = nn_new.fit(x=X_train_scaled, y=y_train, epochs=50)

# Score on the scaled test data; evaluate returns the loss followed by the metric
evaluation_new = nn_new.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation_new
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Collect the per-epoch training history in a DataFrame indexed by epoch number (1-based)
n_epochs_new = len(fit_model_new.history["loss"])
history_new_df = pd.DataFrame(fit_model_new.history, index=range(1, n_epochs_new + 1))

# Training loss per epoch
history_new_df.plot(y="loss")

In [None]:
# Plot the training accuracy recorded for each epoch of the second network
history_new_df.plot(y="accuracy")

## Model Assessment:

According to the accuracy metric, the neural network model was able to correctly classify about 98% of the test URLs (a huge improvement from the Segment 2 submission ML code).

Loss: 0.053311817497014996, Accuracy: 0.9800000190734863

## Model Performance Comparison:

Both optimized deep learning model and the random forest model were able to predict whether or not a url is phishing with an accuracy rate above 90%. 

Although they both performed comparably, the implementation and training times were not the same. The random forest classifier was able to train on the large dataset and predict the values within a few seconds, while the deep learning model required more than a few minutes to train on the 11430 data points.

The random forest model is able to achieve comparable accuracy on large tabular data with a lot less code and with faster performance. The decision on whether to use random forest classifier versus the deep learning model comes down to preference. 

Since our Phishing Detection dataset is tabular, the Random Forest Classifier is the recommended model based on performance, speed, explainability and simplicity of setup.

In [None]:
## Another model to try: Gradient Boosting Classifier