In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import re

In [35]:
bad_words = [
    "select", "from", "where", "insert", "into", "values", "update", "set", "delete", "drop",
    "table", "database", "schema", "union", "all", "concat", "group_concat", "column_name",
    "information_schema", "sys.schemas", "user()", "current_user()", "session_user()",
    "system_user()", "database()", "version()", "@@version", "@@datadir", "@@basedir",
    "eval", "exec", "execute", "call", "proc", "procedure", "shell", "system", "os.system",
    "os.popen", "popen", "pcntl_exec", "assert", "passthru", "dl", "opendir", "readdir",
    "mkdir", "rmdir", "unlink", "chmod", "chown", "symlink", "link", "uname", "whoami",
    "getenv", "putenv", "gethost", "gethostbyname", "dns_get_record", "dns_get_mx",
    "php_uname", "phpinfo", "phpversion", "highlight_file", "show_source", "config_path",
    "document.cookie", "document.write", "window.location", "window.navigator.userAgent",
    "location.href", "location.host", "location.pathname", "location.protocol", "alert",
    "prompt", "confirm", "iframe", "script", "img", "svg", "base64_decode", "base64_encode",
    "hex2bin", "bin2hex", "urldecode", "urlencode", "rawurldecode", "rawurlencode",
    "md5", "sha1", "sha256", "sha384", "sha512", "crc32", "crypt", "getimagesizefromstring",
    "exif_read_data", "exif_thumbnail", "exif_imagetype", "gd_info", "getimagesize",
    "getimagesizefromstring", "imagecreatefromstring", "parse_url", "parse_str",
    "http_build_query", "getallheaders", "apache_request_headers", "get_headers",
    "get_included_files", "get_loaded_extensions", "get_defined_constants",
    "get_defined_functions", "get_declared_classes", "get_declared_interfaces",
    "get_declared_traits", "get_class_methods", "get_class_vars", "get_class_props",
    "get_object_vars", "get_parent_class", "class_exists", "interface_exists",
    "trait_exists", "method_exists", "property_exists", "is_subclass_of", "is_a",
    "get_called_class", "get_class", "get_this_class", "get_class_this",
    "get_class_intro", "get_class_methods", "get_class_vars", "get_class_props"
]

In [36]:
data = pd.read_csv('./datasets/all_datas_f.csv')

synth = pd.read_csv("./synthetic_malicious_requests_with_json.csv")

In [37]:
data = pd.concat([data,synth])

In [38]:
data

Unnamed: 0,method,path,body,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count,class
0,POST,/doLogin,uid=ZAP&passw=ZAP&btnSubmit=Login,0,0,0,0,0,0,0,0,0,8,33,1,1
1,POST,/sendFeedback,cfile=comments.txt&name=ZAP&email_addr=ZAP&sub...,0,0,0,0,7,0,0,0,0,13,124,0,1
2,GET,/admin/clients.xls,,0,0,0,0,0,0,0,0,0,18,0,1,1
3,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,0,0,0,0,3,0,0,0,0,57,0,0,1
4,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,0,0,0,0,3,0,0,0,0,82,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,POST,/login.php?user=root&pass=secret' OR 1=1;--,"{""data"": ""<script>alert('XSS')</script>""}",2,4,2,0,2,0,0,4,3,43,41,1,1
996,POST,/admin?username=admin' OR '1'='1&password=1234,"{ 'username': 'admin', 'password': 'password12...",12,0,0,0,2,0,0,0,2,46,61,0,1
997,GET,/api/v1/users;DROP TABLE users;,"{""search"": ""DROP TABLE users;""}",0,4,0,0,2,0,1,0,0,31,31,0,1
998,POST,/admin?username=admin' OR '1'='1&password=1234,"{ 'username': 'admin', 'password': 'password12...",12,0,0,0,2,0,0,0,2,46,61,0,1


In [39]:
to_drop = [
    'single_q', 'double_q', 'dashes', 'braces', 'spaces', 
    'percentages', 'semicolons', 'angle_brackets', 'special_chars', 
    'badwords_count','path_length','body_length','method'
]

data.drop(columns=to_drop,inplace=True)
data

Unnamed: 0,path,body,class
0,/doLogin,uid=ZAP&passw=ZAP&btnSubmit=Login,1
1,/sendFeedback,cfile=comments.txt&name=ZAP&email_addr=ZAP&sub...,1
2,/admin/clients.xls,,1
3,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,1
4,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,1
...,...,...,...
995,/login.php?user=root&pass=secret' OR 1=1;--,"{""data"": ""<script>alert('XSS')</script>""}",1
996,/admin?username=admin' OR '1'='1&password=1234,"{ 'username': 'admin', 'password': 'password12...",1
997,/api/v1/users;DROP TABLE users;,"{""search"": ""DROP TABLE users;""}",1
998,/admin?username=admin' OR '1'='1&password=1234,"{ 'username': 'admin', 'password': 'password12...",1


In [40]:
data.fillna("",inplace=True)

In [41]:

# Extract features from the path and body
data['path_length'] = data['path'].str.len()
data['path_params'] = data['path'].str.count('=')
data['path_dashes'] = data['path'].str.count('-')
data['path_braces'] = data['path'].str.count('{') + data['path'].str.count('}')
data['path_spaces'] = data['path'].str.count(' ')

data['body_length'] = data['body'].str.len()
data['body_percentages'] = data['body'].str.count('%')
data['body_semicolons'] = data['body'].str.count(';')
data['body_angle_brackets'] = data['body'].str.count('<') + data['body'].str.count('>')

# Extract special characters count
data['body_special_chars'] = 0
for char in '!@#$&*()_+=-|\\/?,.':
    if char in data['body']:
        data['body_special_chars'] += data['body'].str.count(char)

# Extract bad words count
data['body_badwords_count'] = 0
for word in bad_words:
    data['body_badwords_count'] += data['body'].str.contains(word, case=False).astype(int)

# Split the data into training and testing sets
X = data[['path_length', 'path_params', 'path_dashes', 'path_braces', 'path_spaces', 'body_length', 'body_percentages', 'body_semicolons', 'body_angle_brackets', 'body_special_chars', 'body_badwords_count']]
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Gradient Boosting Classifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
clf.fit(X_train, y_train)

# Save the trained model
joblib.dump(clf, 'malicious_request_detector.pkl')

# Evaluate the model
y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))

  data['body_badwords_count'] += data['body'].str.contains(word, case=False).astype(int)
  data['body_badwords_count'] += data['body'].str.contains(word, case=False).astype(int)
  data['body_badwords_count'] += data['body'].str.contains(word, case=False).astype(int)
  data['body_badwords_count'] += data['body'].str.contains(word, case=False).astype(int)
  data['body_badwords_count'] += data['body'].str.contains(word, case=False).astype(int)
  data['body_badwords_count'] += data['body'].str.contains(word, case=False).astype(int)


Accuracy: 0.9824945295404814
Precision: 0.9769230769230769
Recall: 0.9621212121212122
F1-score: 0.9694656488549618
