In [None]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# #PHISHING WEBSITES
# url = 'https://archive.ics.uci.edu/static/public/967/data.csv'

# response = requests.get(url)

# if response.status_code == 200:
#     data = pd.read_csv(io.StringIO(response.text))
#     save_path = 'data.csv'
#     data.to_csv(save_path, index=False)
# else:
#     print(f"Error downloading CSV: {response.status_code}")

In [None]:
# def split_csv(input_file, output_dir, chunk_size):
#     if not os.path.exists(output_dir):
#         os.makedirs(output_dir)

#     with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
#         reader = csv.reader(csvfile)
#         header = next(reader) 

#         file_count = 1
#         current_chunk = 0
#         output_file = os.path.join(output_dir, f'data_{file_count}.csv')

#         for row in reader:
#             if current_chunk == 0:
#                 with open(output_file, 'w', newline='', encoding='utf-8') as out_csvfile:
#                     writer = csv.writer(out_csvfile)
#                     writer.writerow(header) 

#             with open(output_file, 'a', newline='', encoding='utf-8') as out_csvfile:
#                 writer = csv.writer(out_csvfile)
#                 writer.writerow(row)

#             current_chunk += 1
#             if current_chunk == chunk_size:
#                 current_chunk = 0
#                 file_count += 1
#                 output_file = os.path.join(output_dir, f'output_{file_count}.csv')

# input_file = 'dataset.csv'
# output_dir = 'datasets'
# chunk_size = 10000

# split_csv(input_file, output_dir, chunk_size)

In [None]:
# Load every CSV chunk found in `directory` and stack them into one DataFrame.
directory = 'datasets'

# os.listdir() returns entries in arbitrary order and also lists non-CSV
# entries (checkpoint folders, hidden files, ...). Keep only *.csv files and
# sort them so the row order of the combined frame is the same on every run.
csv_files = sorted(
    name for name in os.listdir(directory)
    if name.lower().endswith('.csv')
)
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in '{directory}'")

dfs = [pd.read_csv(os.path.join(directory, name)) for name in csv_files]
count = len(dfs)  # number of chunk files that were read

df = pd.concat(dfs, ignore_index=True)

In [None]:
# Display the (rows, columns) dimensions of the concatenated DataFrame.
df.shape

In [None]:
def analyze_and_visualize_label_distribution(df, label_col='label'):
    """Print the class counts of a label column and draw them as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column.
    label_col : str, default 'label'
        Name of the column whose value distribution is reported.

    Returns
    -------
    None
        The counts are printed and the figure is shown via ``plt.show()``.
    """
    # Compute the counts once and reuse them for both the printout and the plot.
    counts = df[label_col].value_counts()
    print(counts)

    plt.figure(figsize=(10, 5))
    counts.plot(kind='bar', color='skyblue')
    plt.title('Label Distribution')
    plt.xlabel('Label')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle=':', alpha=0.7)
    plt.tight_layout()
    plt.show()
# Print the class counts and draw the label bar chart for the loaded dataset.
analyze_and_visualize_label_distribution(df)

In [None]:
# Display descriptive summary statistics of the dataset.
df.describe()

In [None]:
# Number of distinct values in the 'label' column.
df['label'].nunique()

In [None]:
def data_cleaning(df):
    """Clean ``df`` in place: tidy the column names and clamp negative numbers.

    Steps
    -----
    1. Strip leading/trailing whitespace from every column name.
    2. Print the frame's shape.
    3. Replace every negative value in the numeric columns with 0
       (NaN values are left untouched).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    df.columns = df.columns.str.strip()
    print("Dataset Shape: ", df.shape)

    # Write the clamped values back explicitly. Mutating the object returned by
    # the private DataFrame._get_numeric_data() only changes `df` when that
    # object happens to be a view, which is not guaranteed (e.g. Copy-on-Write).
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].clip(lower=0)
# Apply the cleaning step to df (its column names are stripped in place).
data_cleaning(df)

In [None]:
# label_encoder = LabelEncoder()
# label_encoded_cols = []

# for col in df.columns:
#     if df[col].dtype == 'object':
#         df[col] = label_encoder.fit_transform(df[col])
#         label_encoded_cols.append(col)

In [None]:
# Draw a 10% sample from every class and separate features from the target.
# GroupBy.sample takes a seed (so the figure is reproducible, matching the
# seeded PCA below) and avoids the deprecated groupby.apply path that operates
# on the grouping column.
subsample_df = (
    df.groupby('label')
      .sample(frac=0.1, random_state=0)
      .reset_index(drop=True)
)
x = subsample_df.drop(columns='label')
y = subsample_df['label']

# Reduce the feature space to its first two principal components.
pca = PCA(n_components=2, random_state=0)
z = pca.fit_transform(x)

# Combine the principal components and the labels into one DataFrame.
pca_df = pd.DataFrame({'label': y, 'PCA 1': z[:, 0], 'PCA 2': z[:, 1]})

# Visualise the data in the reduced two-dimensional space to explore potential
# patterns and relationships between the features and their labels.
n_labels = pca_df['label'].nunique()
ax = sns.scatterplot(
    data=pca_df,
    x='PCA 1',
    y='PCA 2',
    hue='label',
    palette=sns.color_palette('hls', n_labels),
)
ax.set_title("PCA Projection")

# Place the legend outside the axes so it does not cover any points.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Independent copy of df; used further down as the source of the
# non-balanced (full-data) train/test split.
new_df=df.copy()

In [None]:
# Create a class-balanced DataFrame by down-sampling every label to the size
# of the smallest class (previously the size of label 0 was assumed to be the
# smallest, which only balances the data when that assumption holds).
label_counts = df['label'].value_counts()
size = int(label_counts.min()) if len(label_counts) else 0
print(size)

# Shuffle once with a fixed seed, then keep the first `size` rows of each label.
# This is a random sample without replacement per class, and unlike
# groupby.apply it leaves 'label' as an ordinary column with a flat index
# (apply produced a frame where 'label' was both an index level and a column).
bal_df = (
    df.sample(frac=1, random_state=0)
      .groupby('label')
      .head(size)
      .reset_index(drop=True)
)
sns.countplot(data=bal_df,x='label')

In [None]:
# Target vector: the 'label' column of the balanced frame, cast to int.
y = bal_df['label'].astype('int')
# Feature matrix: every other column, min-max scaled column by column.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in the next cell, so test-set minima/maxima influence the scaling
# of the training data. Consider fitting it on the training split only.
X = MinMaxScaler().fit_transform(bal_df.drop(columns='label'))

In [None]:
# Hold out 20% of the balanced data for testing. The fixed seed makes the
# split reproducible after a kernel restart, and stratifying on y keeps the
# class ratio the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
print(X_train.shape," ",X_test.shape)
print(y_train.shape," ",y_test.shape)

In [None]:
# Full (non-balanced) dataset: encode the target, split, then scale.
Xn_raw = new_df.drop(columns='label')
yn = LabelEncoder().fit_transform(new_df['label'])

# Seeded, stratified 80/20 split so results are reproducible and both
# partitions keep the original class ratio.
Xn_train, Xn_test, yn_train, yn_test = train_test_split(
    Xn_raw, yn, test_size=0.2, random_state=0, stratify=yn
)

# Fit the scaler on the training rows only, so the test set's minima/maxima
# do not leak into the features the model is trained on.
scaler_n = MinMaxScaler().fit(Xn_train)
Xn_train = scaler_n.transform(Xn_train)
Xn_test = scaler_n.transform(Xn_test)

# Keep `Xn` available as the scaled full feature matrix, as before.
Xn = scaler_n.transform(Xn_raw)

In [None]:
# Report the shapes of the full-data split: features first, then targets;
# each line shows the test partition followed by the train partition.
for test_part, train_part in ((Xn_test, Xn_train), (yn_test, yn_train)):
    print(test_part.shape, train_part.shape)

In [None]:
def _build_binary_cnn(n_features):
    """Build and compile the 1-D CNN (Conv1D -> MaxPooling1D -> Dense) for binary output."""
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_features, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def _plot_training_history(history):
    """Plot train/validation accuracy and loss per epoch side by side."""
    plt.figure(figsize=(12, 6))

    panels = (
        (1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss', 'Model Loss', 'Loss'),
    )
    for position, train_key, val_key, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend(['Train', 'Validation'], loc='upper left')

    plt.tight_layout()
    plt.show()


def _plot_confusion_matrix(y_true, y_pred):
    """Show the 2x2 confusion matrix for binary labels 0/1."""
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent, so it
    # always matches the two display labels.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['0', '1'])
    disp.plot(cmap='Blues', values_format='d')
    plt.title('Confusion Matrix')
    plt.show()


def _plot_roc_curve(y_true, y_score):
    """Plot the ROC curve and report the AUC in the legend."""
    fpr, tpr, _thresholds = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')
    plt.show()


def train_and_evaluate_cnn(X_train, y_train, X_test, y_test,
                           epochs=20, batch_size=128, validation_split=0.2):
    """Train a 1-D CNN binary classifier and evaluate it on a held-out test set.

    The model is trained with early stopping and checkpointing, its learning
    curves are plotted, and the test loss/accuracy, confusion matrix and ROC
    curve are reported.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices. They are reshaped to (n_samples, n_features, 1).
    y_train, y_test : array-like of shape (n_samples,)
        Binary targets encoded as 0/1.
    epochs : int, default 20
        Maximum number of training epochs.
    batch_size : int, default 128
        Mini-batch size passed to ``model.fit``.
    validation_split : float, default 0.2
        Fraction of the *training* data held out for validation. The
        callbacks monitor this split, so the test set is only used for the
        final evaluation instead of also steering early stopping and
        checkpoint selection.

    Returns
    -------
    keras.Model
        The trained model (previously nothing was returned, leaving the
        caller's result variable set to ``None``).

    Raises
    ------
    ValueError
        If ``validation_split`` is not strictly between 0 and 1.
    """
    import numpy as np  # local import: numpy is not among the notebook-level imports

    if not 0 < validation_split < 1:
        raise ValueError("validation_split must be strictly between 0 and 1")

    # Plain NumPy arrays: needed for reshape() and for Keras' validation_split slicing.
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    # Conv1D expects (samples, steps, channels); each feature is one step with one channel.
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model = _build_binary_cnn(X_train.shape[1])

    callbacks = [
        ModelCheckpoint('best_model.keras', monitor='val_accuracy', save_best_only=True, mode='max', verbose=1),
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1),
    ]

    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        callbacks=callbacks,
    )

    _plot_training_history(history)

    # Final, unbiased evaluation on data the training loop never saw.
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

    # The sigmoid output has shape (n, 1); flatten it for the metric helpers.
    y_pred_proba = model.predict(X_test).ravel()
    y_pred = (y_pred_proba > 0.5).astype(int)

    _plot_confusion_matrix(y_test, y_pred)
    _plot_roc_curve(y_test, y_pred_proba)

    return model

# Run the CNN train/evaluate pipeline on the balanced split; the call's
# return value is bound to best_model.
best_model = train_and_evaluate_cnn(X_train, y_train, X_test, y_test)