In [None]:
import pandas as pd
from glob import glob
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve, f1_score 
from sklearn.svm import SVC

# If the packages are not installed, run the following command:
# pip install pandas numpy matplotlib seaborn scikit-learn

# Load the flow-level CSV of the CSE-CIC-IDS 2018 capture into a DataFrame.
# NOTE(review): the folder in the path reads 02-14-2018 while the original
# note on this line said 02-20-2018 -- confirm which day's file is intended.
# NOTE(review): the path is absolute and machine-specific.
csv_path = "C:/Users/PC-LENOVO/OneDrive/Desktop/anomaly detetction/02-14-2018/data.csv"
df = pd.read_csv(csv_path)
print(df.head())

# Swap every space in the column names for an underscore so the columns can
# be referenced as plain identifiers (e.g. inside DataFrame.query strings).
cols = pd.Index([column_name.replace(' ', '_') for column_name in df.columns])
df.columns = cols
print(df.head())
# Keep only the flows whose destination port is 80 or 443.
query = df.query('Dst_Port == 80 or Dst_Port == 443')
df = query
print('Number of rows:', df.shape[0])

# Summarise missing and infinite values. The original bare expressions
# displayed nothing in the middle of a cell, so print compact counts instead.
print('Columns containing nulls:', int(df.isnull().any().sum()))
inf = df.isin([np.inf, -np.inf])
print('Infinite values:', int(inf.to_numpy().sum()))

# Treat +/-inf as missing, then drop every COLUMN (axis=1) that contains at
# least one missing value. Rows are kept; features with gaps are removed.
df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
##df = df[np.isfinite(df).all(1)]

# Drop rows that are entirely empty. The original call discarded its result,
# so it had no effect; assign it back.
df = df.dropna(how='all')

print('Data type of each column of Dataframe :')
df.info(verbose=True)

# Remove the identifier/time columns listed here. errors='ignore' because
# the column-wise dropna above may already have removed one of them, which
# would otherwise raise KeyError.
df = df.drop(columns=['Timestamp', 'Flow_ID', 'Src_IP', 'Dst_IP'], errors='ignore')
print(df['Label'].value_counts())

# Encode the two classes of interest as 0 (benign) and 1 (DDoS LOIC-HTTP).
# Built as a new column rather than via chained assignment
# (df.Label[mask] = value), which triggers SettingWithCopyWarning and does
# not modify df at all when pandas copy-on-write is active. Any other label
# is left unchanged and is excluded by the class filters below.
label_codes = {'Benign': 0, 'DDoS attacks-LOIC-HTTP': 1}
df = df.assign(Label=df['Label'].map(lambda value: label_codes.get(value, value)))
print(df['Label'].value_counts())

# Count the number of rows in each class
ddos_count = df[df['Label'] == 1].shape[0]
benign_count = df[df['Label'] == 0].shape[0]

# Balance the classes by undersampling to the size of the smaller one.
# Sampling n=ddos_count benign rows unconditionally raises ValueError when
# there are fewer benign rows than DDoS rows.
n_per_class = min(ddos_count, benign_count)
df_benign = df[df['Label'] == 0].sample(n=n_per_class, random_state=42)
df_ddos = df[df['Label'] == 1]
if ddos_count > n_per_class:
    # Only reached when DDoS is the majority class; otherwise every DDoS row
    # is kept in its existing order.
    df_ddos = df_ddos.sample(n=n_per_class, random_state=42)

# Combine the DDoS and sampled benign rows into a new dataframe
df_reduced = pd.concat([df_ddos, df_benign])

# Shuffle the rows in the new dataframe
df_reduced = df_reduced.sample(frac=1, random_state=42)

df = df_reduced

print(df['Label'].value_counts())
# Scatter Flow_Duration against Tot_Fwd_Pkts, one colour per class, drawn on
# a shared Axes so both classes appear in the same figure.
bening_df = df[df['Label'] == 0]
malignant_df = df[df['Label'] == 1]
axes = bening_df.plot(kind='scatter', x='Flow_Duration', y='Tot_Fwd_Pkts', color='blue', label='Benign')
# Legend label corrected from the misspelled 'maligmant'.
malignant_df.plot(kind='scatter', x='Flow_Duration', y='Tot_Fwd_Pkts', color='red', label='Malignant', ax=axes)
axes.set_title('Flow duration vs. forward packets by class')
# Shuffle once more with a fixed seed and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Number of leading rows (after the shuffle) used for the experiment.
num_data = 2000 # can change between 500 and 1152382

# Take the first num_data rows and cast every column to float64; astype
# returns a new frame, so train_df is independent of df.
train_df = df.head(num_data).astype("float64")

print('Data type of each column of Dataframe :')
train_df.info(verbose=True)
# Remove the Label column from train_df and keep it as the target array.
# NOTE: pop() mutates train_df, so this cell cannot be re-run on its own
# without first re-creating train_df.
target = np.asanyarray(train_df.pop('Label'))

# Feature matrix as an independent float64 array. An explicit copy is taken
# so the clean-up below can never write through to (or be blocked by) the
# DataFrame's own buffer.
raw = train_df.to_numpy(dtype="float64", copy=True)

print("target array: \n", target,"\n\n","raw array: \n", raw)

# Replace +inf / -inf with the largest / smallest finite float and NaN with
# zero. The previous mask (~np.isfinite) also matched NaN and -inf and set
# them all to +max, and the cleaned array was then overwritten by re-reading
# train_df; both problems are removed here.
float_limits = np.finfo(raw.dtype)
raw = np.nan_to_num(raw, nan=0.0, posinf=float_limits.max, neginf=float_limits.min)

# Element-wise diagnostics on the cleaned matrix.
print("is raw isinf: \n",np.isinf(raw),"\n")
print("is raw isfinite: \n",np.isfinite(raw),"\n")
print("is raw nan: \n", np.isnan(raw),"\n")


In [None]:
from sklearn.model_selection import train_test_split

# Hold-out configuration for the split below.
test_size = 0.3      # fraction of the samples reserved for the test set
random_state = 42    # seed that makes the split reproducible

# Split features (raw) and labels (target) into train and test partitions.
split_options = dict(test_size=test_size, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(raw, target, **split_options)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: one kernel, four C values, five gamma settings.
param_grid = dict(
    kernel=['rbf'],
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.1, 1, 10],
)

# Exhaustive search over the grid with 50-fold cross-validation, using all
# available cores (n_jobs=-1) and progress output (verbose=1).
svm_model = SVC()
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=50,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print("Best kernel:", best_params['kernel'])
print("Best C:", best_params['C'])
print("Best gamma:", best_params['gamma'],"\n\n")

# Estimator configured with the best parameters found by the search, then
# its predictions for the held-out test set.
best_svm_model = grid_search.best_estimator_
y_pred = best_svm_model.predict(X_test)


In [None]:
# Score the test-set predictions with four standard classification metrics.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

# Arrange the mean cross-validation scores as a (C x gamma) matrix.
# NOTE(review): assumes the flat score vector is ordered C-major and
# gamma-minor, which relies on 'kernel' having a single value -- re-check if
# the grid gains another kernel.
c_grid = param_grid['C']
gamma_grid = param_grid['gamma']
scores = grid_search.cv_results_['mean_test_score'].reshape(len(c_grid), len(gamma_grid))

# Heatmap of the score matrix on a fixed 0..1 colour scale.
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(scores, cmap='YlGn', interpolation='nearest', vmin=0, vmax=1)

# Axis labels and one tick per grid value.
ax.set_xlabel('Gamma Value', fontsize=14)
ax.set_ylabel('C Value', fontsize=14)
ax.set_xticks(np.arange(len(gamma_grid)))
ax.set_yticks(np.arange(len(c_grid)))
ax.set_xticklabels(gamma_grid, fontsize=12, rotation=45)
ax.set_yticklabels(c_grid, fontsize=12)

# Write each score into its cell (row i is the C index, column j the gamma index).
for (i, j), cell_score in np.ndenumerate(scores):
    ax.text(j, i, '{:.2f}'.format(cell_score), ha='center', va='center', color='k', fontsize=12)

# Colour bar and title.
cbar = fig.colorbar(im, ax=ax)
cbar.ax.set_ylabel('Mean CV Score', fontsize=14)
ax.set_title('Grid Search CV Scores for SVM', fontsize=16, fontweight='bold')

plt.show()

In [None]:
# Confusion matrix of the test-set predictions (rows: true, columns: predicted).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Render the matrix as an annotated heatmap with the class codes as ticks.
class_ticks = [0, 1]
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap="Blues",
    fmt='g',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix')
plt.show()


In [None]:
import time

# Accuracy of the current test-set predictions.
# NOTE: accuracy_score is imported in the evaluation cell, so that cell must
# have been run first.
acc_score = accuracy_score(y_test, y_pred)
print("\nAccuracy:", acc_score)

# Wall-clock time to refit the selected model on the training partition.
start_time = time.time()
best_svm_model.fit(X_train, y_train)
train_time = time.time() - start_time

# Wall-clock time to predict the test partition.
start_time = time.time()
y_pred = best_svm_model.predict(X_test)
pred_time = time.time() - start_time

total_time = train_time + pred_time

# Two panels: accuracy against total elapsed time, and accuracy as a bar.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
fig.suptitle('Accuracy Analysis')

# Left panel: a single point with dashed guide lines through it.
ax1.plot([total_time], [acc_score], 'bo', label='Accuracy')
ax1.axhline(y=acc_score, color='gray', linestyle='--')
ax1.axvline(x=total_time, color='gray', linestyle='--')
ax1.set(
    xlim=[0, total_time + 1],
    ylim=[0, 1],
    xlabel='Time (s)',
    ylabel='Accuracy',
)
ax1.legend()

# Right panel: accuracy bar with a dashed marker at the same height.
ax2.bar(['SVM'], [acc_score], color='cornflowerblue')
ax2.plot([-0.5, 0.5], [acc_score, acc_score], 'k--', label='Accuracy')
ax2.set_ylim([0, 1])
ax2.legend()

plt.show()
# Text classification report for the test-set predictions.
class_report = classification_report(y_test, y_pred)
print("Classification report:\n", class_report)

# Per-class metrics taken from the actual report rather than hard-coded
# numbers, so the bar charts always match the current run. The dict form
# also carries an 'accuracy' float and '... avg' summary rows; keep only the
# per-class entries.
report_full = classification_report(y_test, y_pred, output_dict=True)
class_report_dict = {
    label: stats
    for label, stats in report_full.items()
    if isinstance(stats, dict) and not label.endswith('avg')
}

# Set the color palette
palette = sns.color_palette('pastel')

# Get the class names and metrics
class_names = sorted(class_report_dict)
metrics = ['precision', 'recall', 'f1-score']

# One subplot per metric, each with one bar per class.
fig, axs = plt.subplots(1, len(metrics), figsize=(15, 5))

for i, metric in enumerate(metrics):
    # Local names chosen so the grid-search `scores` matrix and heatmap `ax`
    # from the earlier cell are not overwritten.
    metric_scores = [class_report_dict[class_name][metric] for class_name in class_names]
    metric_ax = sns.barplot(x=class_names, y=metric_scores, ax=axs[i], palette=palette)
    metric_ax.set(title=metric.capitalize() + ' by class', ylabel=metric.capitalize(), ylim=(0, 1), facecolor='white')

# Set the overall title
fig.suptitle('Classification Report', fontsize=16, fontweight='bold')

# Show the plot
plt.show()
# Imported locally because the notebook's import cell does not provide it.
from sklearn.metrics import average_precision_score

# ROC curve
# y_prob holds signed decision-function scores (not probabilities); both
# curve functions below accept such continuous scores.
y_prob = best_svm_model.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall curve
# `thresholds` is reused here and now refers to the PR-curve thresholds.
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
# The title reports average precision (AP); this was previously computed with
# roc_auc_score, which displayed the ROC AUC under the "AP" label.
pr_auc = average_precision_score(y_test, y_prob)
plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall curve: AP={0:0.2f}'.format(pr_auc))
plt.show()


# F1 of the current test-set predictions.
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

# Vary the C parameter (RBF kernel, gamma='scale') and record the test-set
# F1-score for each value. Loop-local names are used so the global `y_pred`
# and `svm_model` are not replaced by the last sweep model's output.
C_values = [0.1, 1, 10, 100]
f1_scores = []
for c in C_values:
    sweep_model = SVC(kernel='rbf', C=c, gamma='scale')
    sweep_model.fit(X_train, y_train)
    sweep_pred = sweep_model.predict(X_test)
    f1_scores.append(f1_score(y_test, sweep_pred))
    print("C:", c, "F1 score:", f1_scores[-1])

# Plot the F1-scores against the C values. C spans three orders of magnitude,
# so a logarithmic x-axis keeps the four points evenly spaced and legible.
plt.figure()
plt.plot(C_values, f1_scores, marker='o')
plt.xscale('log')
plt.title('F1-score for SVM model')
plt.xlabel('C values')
plt.ylabel('F1-score')
plt.ylim(0, 1)
plt.xticks(C_values)
plt.show()




In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: learn the scaling statistics from the training
# partition only, then apply the same transformation to both partitions.
# NOTE(review): in the cells visible here, every model is fitted and
# evaluated before this point and nothing shown afterwards consumes the
# scaled arrays, so on a top-to-bottom run this scaling does not influence
# the reported results -- consider moving it directly after the train/test
# split if scaled inputs are intended.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)