In [1]:
import pandas as pd
import requests

from bs4 import BeautifulSoup
from taskA import extract_table

In [2]:
def load_http_log():
    # Extract dataset description columns from data source website
    url = 'https://www.secrepo.com/Datasets%20Description/Network/http.html'
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')

    tables = soup.find_all('table')

    # column names are in second table
    table = extract_table(tables[1])
    column_names = table[''].tolist()
    dtypes = table['Data Type'].tolist()

    with open('data/http.log', 'r') as f:
        results = f.readlines()

    results = [result.split('\t') for result in results]

    res = pd.DataFrame(results, columns=column_names)
    res['resp_mime_types'] = res['resp_mime_types'].str.strip()
    return res

In [3]:
df = load_http_log()

In [4]:
df['ts'] = df['ts'].astype(float)
df.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,method,host,uri,...,info_msg,filename,tags,username,password,proxied,orig_fuids,orig_mime_types,resp_fuids,resp_mime_types
0,1331901000.0,CHEt7z3AzG4gyCNgci,192.168.202.79,50465,192.168.229.251,80,1,HEAD,192.168.229.251,/DEASLog02.nsf,...,-,-,(empty),-,-,-,-,-,-,-
1,1331901000.0,CKnDAp2ohlvN6rpiXl,192.168.202.79,50467,192.168.229.251,80,1,HEAD,192.168.229.251,/DEASLog03.nsf,...,-,-,(empty),-,-,-,-,-,-,-
2,1331901000.0,CNTrjn42F3LB58MZH6,192.168.202.79,50469,192.168.229.251,80,1,HEAD,192.168.229.251,/DEASLog04.nsf,...,-,-,(empty),-,-,-,-,-,-,-
3,1331901000.0,C1D7mK1PlzKEnEyG03,192.168.202.79,50471,192.168.229.251,80,1,HEAD,192.168.229.251,/DEASLog05.nsf,...,-,-,(empty),-,-,-,-,-,-,-
4,1331901000.0,CGF1bVMyl9ALKI32l,192.168.202.79,50473,192.168.229.251,80,1,HEAD,192.168.229.251,/DEASLog.nsf,...,-,-,(empty),-,-,-,-,-,-,-


# Overview of Algorithm

The algorithm makes use of rules to shortlist IPs that could be running reconnaissance activities against an enterprise web server.

1. Cross Site Scripting
2. SQL Injections
3. Insecure Direct Object Reference
4. Brute Force Attacks
5. Unusual HTTP Methods
6. Detecting search of credentials

Cross Site Scripting - Search for html tags using '<' or '>' in uri which suggests that a script may be used

SQL Injections - Search for SQL clauses, SQL commenting and using value = value in uri or user agent

Insecure Direct Object Reference - Attacker may try to check if they can access other directories through insecure direct object reference. This can be detected by checking if there is '../' which indicates referencing to other directories.

Brute Force Attack - Attacker tries to brute force the password or authentication, resulting in high numbers of 401 and 403 status code

Unusual HTTP Methods - Unusual HTTP Methods can be signs of reconnaissance activities

Detecting search of credentials - Attackers may try random search for credentials by adding 'passwd' or 'password' in uri or user agent

The entire algorithm is a rule-based algorithm that flags any supicious IP address from the above listed malicious activities.

## Cross Site Scripting
Cross site scripting can be detected by checking for html tags in the uri. 

The IP address of the sender may be a victim but may also be suspcious. The referrer of this row is also suspicious as the referrer is the one that resulted in the sender sending this particular request with XSS.

In [27]:
def detect_XSS(df, col='uri', ip_col='id.orig_h'):
    """
    Shortlists IP addresses that perform XSS
    :param df: df containing all data
    :param col: column that may contain the script
    """

    # Find XSS in the uri
    xss_df = df[(df[col].str.contains('<')) & (df[col].str.contains('>'))].copy()
    result = set(xss_df[ip_col])

    # Find IP address of referrer
    referrers = xss_df['referrer'].str.findall('\d{3}\.\d{3}\.\d{2}\.\d{3}')
    referrers = set(referrers.explode().dropna())

    result = result.union(referrers)
    
    return result

In [31]:
XSS_ip = detect_XSS(df).union(detect_XSS(df, col='user_agent'))
XSS_ip

{'192.168.202.102',
 '192.168.202.110',
 '192.168.202.118',
 '192.168.202.125',
 '192.168.202.138',
 '192.168.202.140',
 '192.168.202.76',
 '192.168.202.79',
 '192.168.202.96',
 '192.168.203.63',
 '192.168.21.253',
 '192.168.22.202',
 '192.168.22.253',
 '192.168.23.103',
 '192.168.23.202',
 '192.168.24.253',
 '192.168.25.253',
 '192.168.26.202',
 '192.168.27.253',
 '192.168.28.202'}

## Detecting SQL Injections
SQL Injections have clauses like SELECT, UNION, <value> = <value>, contain SQL comments, single quotes and semicolons

In [43]:
def detect_sql_injections(df, col='uri', ip_col='id.orig_h'):
    """
    Shortlists IP addresses that perform SQL Injections
    :param df: df containing all data
    :param col: column that may contain the script
    """

    ip_addresses = set()

    # IP addresses with URIs that contain select and union
    ip_addresses = ip_addresses.union(
        set(df[(df[col].str.lower().str.contains('union'))&(df['uri'].str.lower().str.contains('select'))][ip_col])
    )

    # Contain 'select
    ip_addresses = ip_addresses.union(
        set(df[(df[col].str.lower().str.contains('\'(select|union|insert|update|delete|replace)', regex=True))][ip_col])
    )

    # Contain comments of form /**
    ip_addresses = ip_addresses.union(
        set(df[(df[col].str.lower().str.contains('/**', regex=False))][ip_col])
    )

    # Contain comments of form --
    ip_addresses = ip_addresses.union(
        set(df[(df[col].str.lower().str.contains('--', regex=False))][ip_col])
    )
    
    return ip_addresses

In [44]:
sql_injection_ip = detect_sql_injections(df).union(detect_sql_injections(df, col='user_agent'))
sql_injection_ip

  set(df[(df[col].str.lower().str.contains('\'(select|union|insert|update|delete|replace)', regex=True))][ip_col])


{'192.168.202.102',
 '192.168.202.110',
 '192.168.202.118',
 '192.168.202.125',
 '192.168.202.138',
 '192.168.202.140',
 '192.168.202.150',
 '192.168.202.79',
 '192.168.202.96',
 '192.168.203.63'}

## Insecure Direct Object Reference
Check for movement into different file directories

In [32]:
def detect_DOR(df, col='uri', ip_col='id.orig_h'):
    """
    Shortlists IP addresses that perform Insecure Direct Object Reference
    :param df: df containing all data
    :param col: column that may contain the script
    """
    return set(df[(df[col].str.contains('../', regex=False))][ip_col])

In [33]:
DOR_ip = detect_DOR(df).union(detect_DOR(df, col='user_agent'))
DOR_ip

{'192.168.202.100',
 '192.168.202.102',
 '192.168.202.110',
 '192.168.202.112',
 '192.168.202.118',
 '192.168.202.125',
 '192.168.202.138',
 '192.168.202.140',
 '192.168.202.150',
 '192.168.202.4',
 '192.168.202.79',
 '192.168.202.96',
 '192.168.203.45',
 '192.168.203.61',
 '192.168.203.63',
 '2001:dbb:c18:202:20c:29ff:fe41:4be7',
 '2001:dbb:c18:202:20c:29ff:fe93:571e'}

## Brute Force Attacks

In [52]:
def detect_brute_force(df, status_code_col='status_code', ip_col='id.orig_h', threshold=100):

    suspicious_requests = df[df[status_code_col].isin(['401', '403'])][[status_code_col, ip_col]]
    suspicious_ip = suspicious_requests.groupby(ip_col).count().reset_index()
    suspicious_ip = suspicious_ip[suspicious_ip[status_code_col] > threshold]

    return set(suspicious_ip[ip_col])

In [53]:
detect_brute_force(df)

{'192.168.202.102',
 '192.168.202.110',
 '192.168.202.138',
 '192.168.202.140',
 '192.168.202.68',
 '192.168.202.79'}

## Unusual HTTP Methods
Unusual and uncommon HTTP methods may indicate a search of vulnerabilities

In [47]:
def detect_abnormal_methods(df, col, ip_col='id.orig_h'):
    common_methods = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']

    return set(df[~df[col].isin(common_methods)][ip_col])

In [48]:
detect_abnormal_methods(df, 'method')

{'192.168.202.100',
 '192.168.202.101',
 '192.168.202.102',
 '192.168.202.108',
 '192.168.202.110',
 '192.168.202.112',
 '192.168.202.115',
 '192.168.202.118',
 '192.168.202.125',
 '192.168.202.136',
 '192.168.202.138',
 '192.168.202.140',
 '192.168.202.144',
 '192.168.202.153',
 '192.168.202.4',
 '192.168.202.79',
 '192.168.202.80',
 '192.168.202.88',
 '192.168.202.90',
 '192.168.202.94',
 '192.168.202.96',
 '192.168.203.45',
 '192.168.203.61',
 '192.168.204.45',
 '2001:dbb:c18:202:20c:29ff:fe93:571e'}

## Detecting searching of credentials
Attackers append passwd or password to user agent or the uris to attempt to steal credentials.

In [45]:
def detect_cred_steal(df, col, ip_col='id.orig_h'):
    return set(df[(df[col].str.contains('(passwd)|(password)'))][ip_col])

In [46]:
cred_steal_ip = detect_cred_steal(df, 'uri').union(detect_cred_steal(df, 'user_agent'))
cred_steal_ip

  return set(df[(df[col].str.contains('(passwd)|(password)'))][ip_col])


{'192.168.202.102',
 '192.168.202.103',
 '192.168.202.110',
 '192.168.202.112',
 '192.168.202.118',
 '192.168.202.125',
 '192.168.202.138',
 '192.168.202.140',
 '192.168.202.79',
 '192.168.202.94',
 '192.168.202.96',
 '192.168.203.63',
 '192.168.203.64',
 '192.168.204.45',
 '192.168.204.70'}

# Overall Algorithm

In [54]:
def detect_XSS(df, col='uri', ip_col='id.orig_h'):
    """
    Shortlists IP addresses that perform XSS
    :param df: df containing all data
    :param col: column that may contain the script
    """

    # Find XSS in the uri
    xss_df = df[(df[col].str.contains('<')) & (df[col].str.contains('>'))].copy()
    result = set(xss_df[ip_col])

    # Find IP address of referrer
    referrers = xss_df['referrer'].str.findall('\d{3}\.\d{3}\.\d{2}\.\d{3}')
    referrers = set(referrers.explode().dropna())

    result = result.union(referrers)
    
    return result

def detect_sql_injections(df, col='uri', ip_col='id.orig_h'):
    """
    Shortlists IP addresses that perform SQL Injections
    :param df: df containing all data
    :param col: column that may contain the script
    """

    ip_addresses = set()

    # IP addresses with URIs that contain select and union
    ip_addresses = ip_addresses.union(
        set(df[(df[col].str.lower().str.contains('union'))&(df['uri'].str.lower().str.contains('select'))][ip_col])
    )

    # Contain 'select
    ip_addresses = ip_addresses.union(
        set(df[(df[col].str.lower().str.contains('\'(select|union|insert|update|delete|replace)', regex=True))][ip_col])
    )

    # Contain comments of form /**
    ip_addresses = ip_addresses.union(
        set(df[(df[col].str.lower().str.contains('/**', regex=False))][ip_col])
    )

    # Contain comments of form --
    ip_addresses = ip_addresses.union(
        set(df[(df[col].str.lower().str.contains('--', regex=False))][ip_col])
    )
    
    return ip_addresses

def detect_DOR(df, col='uri', ip_col='id.orig_h'):
    """
    Shortlists IP addresses that perform Insecure Direct Object Reference
    :param df: df containing all data
    :param col: column that may contain the script
    """
    return set(df[(df[col].str.contains('../', regex=False))][ip_col])


def detect_brute_force(df, status_code_col='status_code', ip_col='id.orig_h', threshold=100):

    suspicious_requests = df[df[status_code_col].isin(['401', '403'])][[status_code_col, ip_col]]
    suspicious_ip = suspicious_requests.groupby(ip_col).count().reset_index()
    suspicious_ip = suspicious_ip[suspicious_ip[status_code_col] > threshold]

    return set(suspicious_ip[ip_col])

def detect_abnormal_methods(df, col, ip_col='id.orig_h'):
    common_methods = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']

    return set(df[~df[col].isin(common_methods)][ip_col])

def detect_cred_steal(df, col, ip_col='id.orig_h'):
    return set(df[(df[col].str.contains('(passwd)|(password)'))][ip_col])

In [56]:
def flag_supicious_IP(df):
    result = detect_XSS(df, 'uri').union(detect_XSS(df, 'user_agent'))

    result = result.union(detect_sql_injections(df, 'uri')).union(detect_sql_injections(df, 'user_agent'))

    result = result.union(detect_brute_force(df, 'status_code'))

    result = result.union(detect_DOR(df, 'uri')).union(detect_DOR(df, 'user_agent'))

    result = result.union(detect_abnormal_methods(df, 'method'))

    result = result.union(detect_cred_steal(df, 'uri')).union(detect_cred_steal(df, 'user_agent'))

    return result
    

In [58]:
suspicious_ip = flag_supicious_IP(df)

  set(df[(df[col].str.lower().str.contains('\'(select|union|insert|update|delete|replace)', regex=True))][ip_col])
  return set(df[(df[col].str.contains('(passwd)|(password)'))][ip_col])


In [59]:
len(suspicious_ip)

43

In [60]:
suspicious_ip

{'192.168.202.100',
 '192.168.202.101',
 '192.168.202.102',
 '192.168.202.103',
 '192.168.202.108',
 '192.168.202.110',
 '192.168.202.112',
 '192.168.202.115',
 '192.168.202.118',
 '192.168.202.125',
 '192.168.202.136',
 '192.168.202.138',
 '192.168.202.140',
 '192.168.202.144',
 '192.168.202.150',
 '192.168.202.153',
 '192.168.202.4',
 '192.168.202.68',
 '192.168.202.76',
 '192.168.202.79',
 '192.168.202.80',
 '192.168.202.88',
 '192.168.202.90',
 '192.168.202.94',
 '192.168.202.96',
 '192.168.203.45',
 '192.168.203.61',
 '192.168.203.63',
 '192.168.203.64',
 '192.168.204.45',
 '192.168.204.70',
 '192.168.21.253',
 '192.168.22.202',
 '192.168.22.253',
 '192.168.23.103',
 '192.168.23.202',
 '192.168.24.253',
 '192.168.25.253',
 '192.168.26.202',
 '192.168.27.253',
 '192.168.28.202',
 '2001:dbb:c18:202:20c:29ff:fe41:4be7',
 '2001:dbb:c18:202:20c:29ff:fe93:571e'}

# References
Meyer, R. (n.d.). Detecting attacks web applications log files - giac.org. https://www.giac.org/paper/gcia/1996/detecting-attacks-web-applications-log-files/106864 

Manners, D. (2011, October 20). The User Agent Field: Analyzing and Detecting the Abnormal or Malicious in your Organization. https://sansorg.egnyte.com/dl/pGWQkGIq5N 