In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import re
from sus import badwords,symbols

In [2]:
# req_bad = pd.read_csv('./datasets/2bad_reqff.csv')
# print(req_bad.head())

In [3]:
# req_good = pd.read_csv('./datasets/2good_reqff.csv')
# print(req_good.head())

In [18]:
# Feature suffixes that exist once per request field.
_PER_FIELD_FEATURES = [
    'single_q', 'double_q', 'dashes', 'braces', 'spaces',
    'percentages', 'semicolons', 'angle_brackets', 'special_chars',
    'badwords_count',
]
# Length features; these already carry their field prefix.
_LENGTH_FEATURES = ['path_length', 'body_length']

# Un-prefixed feature names plus the two length columns.
feature_columns = _PER_FIELD_FEATURES + _LENGTH_FEATURES

# Model input columns, in order: every per-field feature for 'path', then the
# same for 'body', then the two length columns.
pred_cols = [
    f'{field}_{suffix}'
    for field in ('path', 'body')
    for suffix in _PER_FIELD_FEATURES
] + _LENGTH_FEATURES


In [14]:
# Load the dataset from CSV into a DataFrame (path is relative to the
# notebook's working directory).
all_data = pd.read_csv('./datasets/all_datas_f.csv')


In [None]:
# Display the loaded DataFrame for inspection.
all_data

In [15]:
# Remove the columns named in feature_columns from the loaded frame.
# errors="ignore" keeps this cell re-runnable: with the previous in-place drop
# a second execution raised KeyError because the columns were already gone.
all_data = all_data.drop(columns=feature_columns, errors="ignore")

# Replace every missing value with an empty string so the text columns can be
# scanned without NaN handling.
all_data = all_data.fillna("")


In [9]:
def count_symbols(text, symbols):
    """Return the total number of occurrences of ``symbols`` in ``text``.

    Args:
        text: Value to scan. Missing values (``None``/``NaN``) yield 0; any
            other non-string value is converted with ``str`` before counting.
        symbols: Iterable of substrings. Each one is counted with
            ``str.count`` (non-overlapping matches) and the counts are summed.

    Returns:
        int: Sum of the per-symbol occurrence counts.
    """
    if pd.isna(text):
        return 0
    # Coerce non-strings (e.g. cells from a column parsed as numeric) so that
    # ``.count`` does not raise AttributeError.
    haystack = text if isinstance(text, str) else str(text)
    return sum(haystack.count(symbol) for symbol in symbols)

def count_bad_words(text, badwords):
    """Return how many entries of ``badwords`` occur in ``text``.

    Each entry contributes at most 1 no matter how often it appears, so the
    result is the number of distinct matching entries, not the number of
    occurrences. Matching is a substring test against the lower-cased text;
    the entries themselves are used as given, so an entry containing
    upper-case letters cannot match.

    Args:
        text: Value to scan. Missing values (``None``/``NaN``) yield 0; any
            other non-string value is converted with ``str`` first.
        badwords: Iterable of substrings to look for.

    Returns:
        int: Number of entries of ``badwords`` found in the text.
    """
    if pd.isna(text):
        return 0
    # Lower-case once instead of once per bad word; ``str`` guards against
    # non-string cells, which have no ``.lower``.
    lowered = str(text).lower()
    return sum(1 for word in badwords if word in lowered)

In [16]:
# Column suffix -> substrings whose occurrences are summed for that feature.
symbol_groups = {
    'single_q': ["'"],
    'double_q': ['"'],
    'dashes': ["--"],
    'braces': ["{", "}"],
    'spaces': [" "],
    'percentages': ["%"],
    'semicolons': [";"],
    'angle_brackets': ["<", ">"],
}


def _count_special_chars(value):
    """Count characters that are neither alphanumeric nor whitespace (0 for missing values)."""
    if pd.isna(value):
        return 0
    return sum(1 for ch in str(value) if not (ch.isalnum() or ch.isspace()))


# Build the same set of lexical features for each request field.
for field in ('path', 'body'):
    raw_values = all_data[field]
    for suffix, chars in symbol_groups.items():
        all_data[f'{field}_{suffix}'] = raw_values.apply(
            lambda value, chars=chars: count_symbols(value, chars)
        )
    all_data[f'{field}_special_chars'] = raw_values.apply(_count_special_chars)
    all_data[f'{field}_badwords_count'] = raw_values.apply(
        lambda value: count_bad_words(value, badwords)
    )

# General features based on length


In [21]:
# Length of each raw field as returned by len(); missing values count as 0.
for field in ('path', 'body'):
    all_data[f'{field}_length'] = all_data[field].apply(
        lambda value: 0 if pd.isna(value) else len(value)
    )

In [22]:
# Display the frame again, now including the per-field and length feature columns.
all_data

Unnamed: 0,method,path,body,class,path_single_q,path_double_q,path_dashes,path_braces,path_spaces,path_percentages,...,body_dashes,body_braces,body_spaces,body_percentages,body_semicolons,body_angle_brackets,body_special_chars,body_badwords_count,path_length,body_length
0,POST,/doLogin,uid=ZAP&passw=ZAP&btnSubmit=Login,1,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,8,33
1,POST,/sendFeedback,cfile=comments.txt&name=ZAP&email_addr=ZAP&sub...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,21,0,13,124
2,GET,/admin/clients.xls,,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,18,0
3,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,1,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,63,0
4,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,1,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,88,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,GET,/static/index.jsp?content=%3Cxsl%3Avalue-of+se...,,1,0,0,0,0,0,12,...,0,0,0,0,0,0,0,0,104,0
518,GET,/search.jsp?query=system-property%28%27xsl%3Av...,,1,0,0,0,0,0,7,...,0,0,0,0,0,0,0,0,63,0
519,POST,/doLogin,uid=%3Cxsl%3Avariable+name%3D%22rtobject%22+se...,1,0,0,0,0,0,0,...,0,0,0,55,0,0,69,3,8,385
520,GET,/ws_ftp.ini,,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,0


In [23]:
# Model inputs are the columns listed in pred_cols (in that order); the
# target is the 'class' column.
X, y = all_data[pred_cols], all_data['class']

In [24]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
# Seed the forest so training is reproducible across re-runs; without
# random_state every run fits a different model and the reported metrics
# drift. 42 matches the seed used for the train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97        80
           1       0.99      0.95      0.97        77

    accuracy                           0.97       157
   macro avg       0.97      0.97      0.97       157
weighted avg       0.97      0.97      0.97       157



In [26]:
# Persist the fitted classifier to a file in the current working directory.
joblib.dump(model, 'request_threat_model.pkl')
print("Model saved as 'request_threat_model.pkl'")

Model saved as 'request_threat_model.pkl'
