In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import re
from sus import badwords,symbols



In [None]:
# req_bad = pd.read_csv('./datasets/2bad_reqff.csv')
# print(req_bad.head())

In [None]:
# req_good = pd.read_csv('./datasets/2good_reqff.csv')
# print(req_good.head())

In [2]:
# Model inputs: ten per-field counters for the request path, the same ten for
# the request body, followed by the two raw length features.
feature_columns = [
    f'{field}_{suffix}'
    for field in ('path', 'body')
    for suffix in (
        'single_q', 'double_q', 'dashes', 'braces', 'spaces',
        'percentages', 'semicolons', 'angle_brackets', 'special_chars',
        'badwords_count',
    )
]
feature_columns += ['path_length', 'body_length']

# Columns removed from the loaded CSV before features are recomputed in this notebook.
to_drop = [
    'single_q',
    'double_q',
    'dashes',
    'braces',
    'spaces',
    'percentages',
    'semicolons',
    'angle_brackets',
    'special_chars',
    'badwords_count',
    'path_length',
    'body_length',
]

In [3]:
# Load the request dataset; the path is relative to the notebook's working directory.
all_data = pd.read_csv('./datasets/all_datas_f.csv')


In [4]:
# Remove the feature columns listed in `to_drop` (they are recomputed later in
# this notebook) and replace missing values with empty strings so the text
# helpers can call len()/lower() on every cell.
# Rebinding instead of mutating in place, together with errors='ignore', keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
all_data = all_data.drop(columns=to_drop, errors='ignore').fillna("")


In [5]:
# Inspect the frame left after the drop/fillna step of the previous cell.
all_data

Unnamed: 0,method,path,body,class
0,POST,/doLogin,uid=ZAP&passw=ZAP&btnSubmit=Login,1
1,POST,/sendFeedback,cfile=comments.txt&name=ZAP&email_addr=ZAP&sub...,1
2,GET,/admin/clients.xls,,1
3,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,1
4,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,1
...,...,...,...,...
517,GET,/static/index.jsp?content=%3Cxsl%3Avalue-of+se...,,1
518,GET,/search.jsp?query=system-property%28%27xsl%3Av...,,1
519,POST,/doLogin,uid=%3Cxsl%3Avariable+name%3D%22rtobject%22+se...,1
520,GET,/ws_ftp.ini,,1


In [6]:
def count_symbols(text, symbols):
    """Return how many times the entries of ``symbols`` occur in ``text``.

    Each entry is counted with ``str.count`` (non-overlapping matches) and the
    per-entry counts are added together. A missing value, as reported by
    ``pd.isna``, yields 0.
    """
    if pd.isna(text):
        return 0
    total = 0
    for symbol in symbols:
        total += text.count(symbol)
    return total

def count_bad_words(text, badwords):
    """Count how many entries of ``badwords`` appear as substrings of ``text``.

    Parameters:
        text: the string to scan; a missing value (per ``pd.isna``) yields 0.
        badwords: iterable of substrings to look for. ``text`` is lower-cased
            before matching but the entries are used as given, so they must
            already be lower-case to ever match.

    Returns:
        The number of distinct entries found (each entry contributes at most 1,
        regardless of how often it occurs).
    """
    if pd.isna(text):
        return 0
    # Lower-case once up front instead of once per bad word.
    lowered = text.lower()
    return sum(1 for word in badwords if word in lowered)

In [7]:
import nltk

# word_tokenize needs the 'punkt_tab' tokenizer data. Look for a local copy
# first and only reach for the network when it is missing, so the cell does
# not fail with a name-resolution error when run offline on a machine that
# already has the resource installed.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')


[nltk_data] Error loading punkt_tab: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from nltk.tokenize import word_tokenize
import numpy as np

# Fit a TF-IDF vocabulary (capped at 100 terms) on every request's path and
# body joined by a single space, then persist the fitted vectorizer.
combined_text = all_data['path'].astype(str).str.cat(all_data['body'].astype(str), sep=" ")
tfidf_vectorizer = TfidfVectorizer(max_features=100).fit(combined_text)
joblib.dump(tfidf_vectorizer, 'tfidf.joblib')

def preprocess_text(text):
    """Lower-case ``text``, split it with ``word_tokenize`` and re-join the tokens with single spaces."""
    lowered = text.lower()
    return ' '.join(word_tokenize(lowered))

def extract_features(row):
    """Build the numeric feature Series for one request row.

    Reads ``row['path']`` and ``row['body']`` and returns, in this order:
    their lengths, their TextBlob sentiment polarity (body first, then path),
    and the mean TF-IDF weight of each preprocessed text under the fitted
    ``tfidf_vectorizer`` (body first, then path).
    """
    path, body = row['path'], row['body']

    def mean_tfidf(text):
        # Average weight over the whole fitted vocabulary for a single document.
        weights = tfidf_vectorizer.transform([preprocess_text(text)]).toarray()
        return np.mean(weights, axis=1)[0]

    return pd.Series({
        'path_length': len(path),
        'body_length': len(body),
        'body_sentiment': TextBlob(body).sentiment.polarity,
        'path_sentiment': TextBlob(path).sentiment.polarity,
        'tfidf_body': mean_tfidf(body),
        'tfidf_path': mean_tfidf(path),
    })

# Compute the engineered features row by row, attach them to the source frame,
# and discard the raw text/method columns so only the label and numeric
# features remain.
data_features = all_data.apply(extract_features, axis=1)
data = (
    pd.concat([all_data, data_features], axis=1)
    .drop(columns=['body', 'path', 'method'])
)


In [9]:
# Inspect the engineered feature table (raw text columns removed, 'class' label kept).
data

Unnamed: 0,class,path_length,body_length,body_sentiment,path_sentiment,tfidf_body,tfidf_path
0,1,8.0,33.0,0.0,0.0,0.021982,0.010000
1,1,13.0,124.0,0.0,0.0,0.035967,0.010000
2,1,18.0,0.0,0.0,0.0,0.000000,0.000000
3,1,63.0,0.0,0.0,0.0,0.000000,0.022502
4,1,88.0,0.0,0.0,0.0,0.000000,0.025939
...,...,...,...,...,...,...,...
517,1,104.0,0.0,0.0,0.0,0.000000,0.036701
518,1,63.0,0.0,0.0,0.0,0.000000,0.027007
519,1,8.0,385.0,0.0,0.0,0.035783,0.010000
520,1,11.0,0.0,0.0,0.0,0.000000,0.010000


In [10]:
from sklearn.model_selection import train_test_split

# Separate the label from the features, then hold out 20% of the rows for
# evaluation with a fixed seed.
y = data['class']
X = data.drop(columns=['class'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import joblib

# Train a seeded random forest on the length/sentiment/TF-IDF features.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Persist this model under its own file name. The last cell of the notebook
# writes 'request_threat_model.pkl' for a model trained on a different feature
# set, so sharing that name would silently overwrite this artifact.
joblib.dump(model, 'request_threat_model_tfidf.pkl')

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9238095238095239
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.98      0.92        48
           1       0.98      0.88      0.93        57

    accuracy                           0.92       105
   macro avg       0.93      0.93      0.92       105
weighted avg       0.93      0.92      0.92       105



In [20]:
# For the request path and the request body separately, count selected
# characters, all non-alphanumeric/non-whitespace characters, and bad-word
# hits; each result is stored in a '<field>_<name>' column.
for col in ['path', 'body']:
    for suffix, chars in (
        ('single_q', ["'"]),
        ('double_q', ['"']),
        ('dashes', ["--"]),
        ('braces', ["{", "}"]),
        ('spaces', [" "]),
        ('percentages', ["%"]),
        ('semicolons', [";"]),
        ('angle_brackets', ["<", ">"]),
    ):
        # The lambda is consumed immediately by apply, so late binding of
        # `chars` is not an issue here.
        all_data[f'{col}_{suffix}'] = all_data[col].apply(lambda x: count_symbols(x, chars))
    all_data[f'{col}_special_chars'] = all_data[col].apply(
        lambda x: 0 if pd.isna(x) else sum(1 for c in str(x) if not (c.isalnum() or c.isspace()))
    )
    all_data[f'{col}_badwords_count'] = all_data[col].apply(lambda x: count_bad_words(x, badwords))

# General features based on length


In [21]:
# Character length of each path and body; missing values count as length 0.
all_data['path_length'] = all_data['path'].map(lambda x: 0 if pd.isna(x) else len(x))
all_data['body_length'] = all_data['body'].map(lambda x: 0 if pd.isna(x) else len(x))

In [None]:
# Preview the first rows of the frame, now carrying the engineered count and length columns.
print(all_data.head())

In [34]:
# Label vector and feature matrix (columns in `feature_columns` order).
y = all_data['class']
X = all_data.loc[:, feature_columns]

In [35]:
# Hold out 30% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [42]:
# Train a random forest on the symbol-count features. The seed is fixed so the
# fitted trees, and therefore the report below, are reproducible across
# re-runs; this matches the seeded forest trained earlier in the notebook.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows.
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97        80
           1       0.99      0.95      0.97        77

    accuracy                           0.97       157
   macro avg       0.97      0.97      0.97       157
weighted avg       0.97      0.97      0.97       157



In [37]:
# Serialize the fitted classifier with joblib; the relative path resolves
# against the notebook's working directory.
# NOTE(review): this cell's execution count (37) is lower than the training
# cell's (42), so the file on disk may hold an earlier fit — re-run this cell
# after training to be sure.
joblib.dump(model, 'request_threat_model.pkl')
print("Model saved as 'request_threat_model.pkl'")

Model saved as 'request_threat_model.pkl'
