In [16]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [90]:
data = pd.read_csv('web-attacks/all.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53910 entries, 0 to 53909
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Payload     53910 non-null  object
 1   Label       53910 non-null  int64 
 2   text_label  53910 non-null  object
 3   ID          53910 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.6+ MB


##### ***0 -> Normal***
##### ***1 -> XSS***
##### ***2 -> SQL Injection***

In [91]:
data.drop(['ID', 'text_label'], axis=1, inplace=True)

In [93]:
X = data['Payload']
y = data['Label']

vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

#### ***RANDOM FORESR***

In [94]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9913745130773511
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      4125
           1       1.00      1.00      1.00      3663
           2       0.97      1.00      0.98      2994

    accuracy                           0.99     10782
   macro avg       0.99      0.99      0.99     10782
weighted avg       0.99      0.99      0.99     10782



In [96]:
sample = "javascript:'&#x25;&#x33;&#x43;&#x73;&#x63;&#x72;&#"
sample = vectorizer.transform([sample])

print(rf.predict(sample))

[1]


In [95]:
import joblib

joblib.dump(rf, 'rf_model_2.pkl')

['rf_model_2.pkl']

In [98]:
joblib.dump(vectorizer, 'vectorizer_2.pkl')

['vectorizer_2.pkl']

In [97]:
# Load the saved Random Forest model
rf_model = joblib.load('rf_model_2.pkl')

# Test the model with the given sample
sample = "<xss onafterscriptexecute=alert(1)><script>1</script>"
sample_vectorized = vectorizer.transform([sample])

# Predict the label
predicted_label = rf_model.predict(sample_vectorized)
print(predicted_label)

[1]


### ***LSTM***

In [74]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [85]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

In [78]:
data = pd.read_csv('web-attacks/all.csv')

In [82]:
X = data["Payload"].astype(str).values
y = data["Label"].values

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=200, padding='post', truncating='post')


X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [88]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=200),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test)) 

Epoch 1/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 137ms/step - accuracy: 0.3841 - loss: 1.0915 - val_accuracy: 0.3826 - val_loss: 1.0905
Epoch 2/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 144ms/step - accuracy: 0.3832 - loss: 1.0910 - val_accuracy: 0.3826 - val_loss: 1.0905
Epoch 3/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 140ms/step - accuracy: 0.3905 - loss: 1.0886 - val_accuracy: 0.3826 - val_loss: 1.0902
Epoch 4/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 139ms/step - accuracy: 0.3840 - loss: 1.0909 - val_accuracy: 0.3826 - val_loss: 1.0905
Epoch 5/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 141ms/step - accuracy: 0.3854 - loss: 1.0900 - val_accuracy: 0.3826 - val_loss: 1.0903
Epoch 6/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 139ms/step - accuracy: 0.3844 - loss: 1.0903 - val_accuracy: 0.3826 - val_loss:

<keras.src.callbacks.history.History at 0x1cb09ae9fd0>

In [89]:
model.save('lstm_model_ver1.h5')
loaded_model = tf.keras.models.load_model('lstm_model_ver1.h5')

sample = "<xss onafterscriptexecute=alert(1)><script>1</script>"
sample_sequence = tokenizer.texts_to_sequences([sample])
sample_padded = pad_sequences(sample_sequence, maxlen=200, padding='post', truncating='post')

prediction = loaded_model.predict(sample_padded)
predicted_label = prediction.argmax(axis=-1)[0]
print(predicted_label)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step
0
