<a href="https://colab.research.google.com/github/u-haru/log-inspector/blob/master/IsolateAnnormalyLog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Parse access.log to log.csv 
import pandas as pd
import re
from urllib.parse import unquote

# Pattern for one access-log line: client address, timestamp, request line,
# status code, bytes sent, referrer and user agent.
lineformat = re.compile(
  r'(?P<ipaddress>[\w\.:]*) [^\[]* \[(?P<date>.*)\] "(?P<method>[A-Z]*) (?P<url>.*) HTTP\/[\d\.]+" (?P<statuscode>\d{3}) (?P<bytessent>\d+) "(?P<refferer>.*)" "(?P<useragent>.*)"',
  re.IGNORECASE,
)

def parse(l):
  """Turn one access-log line into a feature row.

  Returns None when the line does not match ``lineformat``, when the status
  code is 410 or above, or when the request path is empty. Otherwise returns
  a list holding the path, referrer, user agent, the per-token occurrence
  counts, the status code, a success flag (status < 400), a has-referrer
  flag and the path length, in that order.
  """
  matched = lineformat.search(l)
  if matched is None:
    return None

  fields = matched.groupdict()
  raw_url = fields["url"]
  status = int(fields["statuscode"])

  # Empty referrer / user-agent fields are normalised to "-".
  referrer = fields["refferer"] or "-"
  useragent = fields["useragent"] or "-"

  # URLs carrying a %0x / %1x escape are kept raw; everything else is
  # percent-decoded before the tokens are counted.
  if re.search(r"\%[0-1]", raw_url):
    path = raw_url
  else:
    path = unquote(raw_url)

  if status >= 410 or not path:  # Internal error | Parse error
    return None

  # Tokens counted in the (possibly decoded) path, in output column order.
  leading_tokens = ("..", ".", "/", "?", "!", "&", "=")
  trailing_tokens = ("\\x", ";", " ", ":", "<", ">", "|", "+", "(", "[")

  row = [path, referrer, useragent]
  row.extend(path.count(token) for token in leading_tokens)
  # This one is measured on the raw URL: number of %0..%7 escape prefixes.
  row.append(len(re.findall(r"\%[0-7]", raw_url)))
  row.extend(path.count(token) for token in trailing_tokens)
  row.append(status)
  row.append(1 if status < 400 else 0)      # success
  row.append(0 if referrer == "-" else 1)   # has_ref
  row.append(len(path))                     # path_len
  return row

# Parse every line of the access log into a feature row and write the rows
# that parse() accepts to log.csv.
arr = []
# The context manager closes the log once it has been read, and iterating the
# file object streams it line by line instead of loading every line up front.
with open("/content/drive/MyDrive/LogInspector/access.log") as logfile:
  for l in logfile:
    d = parse(l)
    if d is not None:  # parse() returns None for lines it rejects
      arr.append(d)

print("datalen:",len(arr))
df = pd.DataFrame(data=arr, index=None, columns=["path","referrer","useragent","dotdot","dot","slash","question","exclamation","amp","equal","invalid_escape","slash_x","semicolon","space","colon","less","great","pipe","plus","open_p","open_b","scode","success","has_ref","path_len"])
df.to_csv('/content/drive/MyDrive/LogInspector/log.csv',index=False)

datalen: 1224781


In [None]:
#@title Train with log.csv
import pandas as pd
import pickle
from sklearn.ensemble import IsolationForest

# Load the feature table, obtain an IsolationForest (unpickled from MODELPATH
# or freshly created), optionally fit it, then label every row and write the
# result to log_predicted.csv.
df = pd.read_csv('/content/drive/MyDrive/LogInspector/log.csv')

train=False#@param {type:"boolean"}
localmodel=True#@param {type:"boolean"}
MODELPATH="/content/drive/MyDrive/LogInspector/IFModel.pkl"#@param {type:"string"}

# The text columns are kept for inspection only; the model sees the numeric
# columns. The feature frame is built once and reused for fit and predict.
features = df.drop(columns=["path","referrer","useragent"])

if localmodel:
  # NOTE(security): unpickling can execute arbitrary code; only load a model
  # file that you created yourself.
  with open(MODELPATH, 'rb') as model_file:
    model = pickle.load(model_file)
else:
  model = IsolationForest(
      n_estimators=1000,
      max_samples='auto',
      contamination=0.003,
    )

# A freshly created model has to be fitted before predict() can be called,
# so it is trained even when the "train" box is left unticked.
if train or not localmodel:
  model.fit(features)

df["predict"] = model.predict(features)

# Rows flagged as anomalous (-1) whose status code is below 400 are reset to
# normal (1).
df.loc[(df['scode'] < 400) & (df['predict']==-1), 'predict'] = 1

df = pd.DataFrame(data=df,index=None, columns=["path","referrer","useragent","dotdot","dot","slash","question","exclamation","amp","equal","invalid_escape","slash_x","semicolon","space","colon","less","great","pipe","plus","open_p","open_b","scode","success","has_ref","path_len","predict"])
# NOTE(review): unlike log.csv this file is written with the index column;
# confirm whether readers of log_predicted.csv expect that before changing it.
df.to_csv('/content/drive/MyDrive/LogInspector/log_predicted.csv')

In [None]:
#@title Save model
import pickle
# Persist the current model to MODELPATH; the context manager makes sure the
# file is flushed and closed as soon as the dump completes.
with open(MODELPATH, 'wb') as model_file:
  pickle.dump(model, model_file)

In [None]:
#@title Print log_predicted.csv
import pandas as pd

# Reload the labelled log and print path, referrer, user agent and status
# code for every row whose prediction is -1.
df = pd.read_csv('/content/drive/MyDrive/LogInspector/log_predicted.csv')

flagged = df.loc[df["predict"]==-1, ["path","referrer","useragent","scode"]]
for fields in flagged.itertuples(index=False, name=None):
  print(*fields)


In [None]:
#@title Check log_predicted.csv
import pandas as pd

# Works on the `df` left by a previous cell: negative predictions are mapped
# to 0, only the text columns plus the label are kept, and repeated rows are
# collapsed before printing.
is_negative = df["predict"] < 0
df.loc[is_negative, "predict"] = 0
df = df.loc[:, ["path","referrer","useragent","predict"]].drop_duplicates()
print(df)



                                   path              referrer  \
0                                     /                     -   
1                          /favicon.ico  http://192.168.1.21/   
3                                     /                     -   
4                                     /                     -   
6                                     /                     -   
...                                 ...                   ...   
1224646   /xmlrpcs.php?daksldlkdsadas=1          www.bing.com   
1224647  /wp-admin.php?daksldlkdsadas=1          www.bing.com   
1224648    /qindex.php?daksldlkdsadas=1          www.bing.com   
1224662                        /css.php          www.bing.com   
1224676                     /xindex.php          www.bing.com   

                                                 useragent  predict  
0        Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...        1  
1        Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...        1  
3        