>* 這份Notebook中會實作一個爬蟲(Crawler)偵測系統

>  目的是抓出可疑的爬蟲ip，讓我們有能力識別、並進一步分析爬蟲的行為，維護網站流量及服務設計


>* 此系統使用網站每日的IIS Log作為分析依據

>  除了爬蟲本身的特徵外，也建構了一個基於行為(Behavior)的Machine Learning模型，用來輔助偵測






首先，如果是一個善意的爬蟲，他會表明自己的身分，如以下幾個特徵:
- - -
* 存取網站管理者定義的robots.txt檔案，不抓別人不想給的東西
* 特定的User Agent，如GoogleBot、BingBot

- - -

而一般使用者放出去的爬蟲或是惡意的爬蟲，就仰賴進一步的判斷

依據對IIS Log的觀察以及我所survey的paper，我定義了幾種爬蟲常見的行為(Behavior):
- - -

* 大量的HTTP Request
* GET、POST、HEAD的使用比例，特別是HEAD，一般使用者不會用到
* 爬蟲不知道網頁是否失效，所以4XX Error出現次數會多
* 大部分爬蟲傾向不抓圖片，因此html / picture比例高
* 少數狡猾的爬蟲，使用"少量多次"的抓取方式，存在時間長

- - -
下面便是系統的第一步驟，從IIS Log中擷取出這些行為

每個ip所表現的行為將是等下用來建立Learning模型的Feature


In [None]:

from pandas import read_csv
import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime
import os

# Column names of the per-window behaviour features, in the same order as the
# 7-element counter array that generateFeature fills for each time window.
featureDim = [
    'hitCount',   # total number of requests in the window
    'GETcount',   # requests whose method is GET
    'POSTcount',  # requests whose method is POST
    'HEADcount',  # requests whose method is HEAD
    '3xxCount',   # responses with a status code in 300-399
    '4xxCount',   # responses with a status code in 400-499
    'hjRatio',    # non-image requests minus image requests
]
              
              
def num(s):
    """Convert ``s`` to an int, falling back to 0 when that is not possible.

    Args:
        s: Value to convert, typically a log field such as a status code.

    Returns:
        ``int(s)`` when the conversion succeeds, otherwise 0.
    """
    try:
        return int(s)
    except (TypeError, ValueError):
        # ValueError: non-numeric text such as '-' or ''.
        # TypeError: non-convertible objects such as None, which is what
        # pandas uses to pad rows that have fewer fields than the widest row.
        return 0

def _tallyRequest(behavior, method, uriStem, status):
    """Add one request to the window counters and report whether it is an image.

    Args:
        behavior: 1-D array of 7 counters, updated in place (slots 0-5 only).
        method: HTTP method string of the request.
        uriStem: Requested path; its text after the last '.' is the extension.
        status: Raw status-code field (may be None or non-numeric).

    Returns:
        True when the extension is one of jpg/jpeg/png/gif/bmp, else False.
    """
    behavior[0] += 1
    behavior[1] += (method == 'GET')
    behavior[2] += (method == 'POST')
    behavior[3] += (method == 'HEAD')
    # Guard None here so a short log line cannot make the conversion raise.
    code = 0 if status is None else num(status)
    behavior[4] += (300 <= code < 400)
    behavior[5] += (400 <= code < 500)
    extension = uriStem[uriStem.rfind('.') + 1:]
    return extension in ('jpg', 'jpeg', 'png', 'gif', 'bmp')


def generateFeature(logTable):
    """Build per-IP, per-time-window behaviour features from a parsed log.

    The requests of every IP are grouped into windows: a window opens at a
    request and collects the following requests of that IP while they are
    less than 300 seconds after the opening request. Each window yields one
    output row.

    Args:
        logTable: DataFrame with integer column labels, one row per request.
            The columns read are 1 (time as 'HH:MM:SS'), 4 (HTTP method),
            5 (requested path), 9 (client IP) and 14 (status code).

    Returns:
        DataFrame with the columns listed in ``featureDim`` plus an ``IP``
        column naming the client each row belongs to. The last feature
        ('hjRatio') holds the number of non-image requests minus the number
        of image requests in the window (a difference, not a quotient).
        The frame is empty when ``logTable`` has no column 9.

    Raises:
        ValueError: if a time field does not match '%H:%M:%S'.
    """
    trackRange = timedelta(0, 300)
    rows = []
    relatedIP = []

    # An empty log (or one whose lines are all too short) has no IP column.
    ips = logTable[9].unique() if 9 in logTable.columns else []

    for ip in ips:
        # Only dotted-quad style addresses are analysed; None appears when a
        # log line had fewer fields than the others.
        if not isinstance(ip, str) or ip.count('.') != 3:
            continue

        ipLogs = logTable[logTable[9] == ip]
        # Pull the needed columns out once instead of indexing the frame per
        # request; this also avoids writing into a filtered slice.
        times = [datetime.strptime(t, '%H:%M:%S') for t in ipLogs[1]]
        methods = ipLogs[4].tolist()
        uriStems = ipLogs[5].tolist()
        statuses = ipLogs[14].tolist()

        startTime = times[0]
        behavior = np.zeros(7)
        hCount = 0
        jCount = 0
        requests = zip(times, methods, uriStems, statuses)
        for i, (curTime, method, uriStem, status) in enumerate(requests):
            if i % 500 == 0:
                print('ip:%s iter:%d\n' % (ip, i))

            if not (curTime - startTime < trackRange):
                # The current request falls outside the open window: emit
                # the finished window and open a new one at this request.
                behavior[6] = hCount - jCount
                rows.append(behavior)
                relatedIP.append(ip)
                startTime = curTime
                behavior = np.zeros(7)
                hCount = 0
                jCount = 0

            if _tallyRequest(behavior, method, uriStem, status):
                jCount += 1
            else:
                hCount += 1

        # Emit the window that is still open when the IP's requests run out;
        # without this the last (or only) window of every IP would be lost.
        behavior[6] = hCount - jCount
        rows.append(behavior)
        relatedIP.append(ip)

    features = pd.DataFrame(rows, columns=featureDim)
    features['IP'] = relatedIP
    return features
    

                        
                            
# Driver: for every log file under ./web1, ./web2 and ./web3, parse the
# whitespace-separated fields, extract the behaviour features and write them
# to ./features/<site><log id>_f.csv.
curDir = os.getcwd()
logDir = 'web'
featureDir = os.path.join(curDir, 'features')
os.makedirs(featureDir, exist_ok=True)

for i in range(1, 4):
    siteDir = os.path.join(curDir, logDir + str(i))
    for filename in os.listdir(siteDir):
        filePath = os.path.join(siteDir, filename)
        if not os.path.isfile(filePath):
            # Sub-directories cannot be opened as log files.
            continue

        temp = []
        # The context manager closes the handle even if parsing fails.
        with open(filePath, encoding='utf-8', errors='ignore') as f:
            for line in f:
                if line.startswith('#'):
                    # Lines starting with '#' are header/directive lines.
                    continue
                fields = line.split()
                if fields:
                    # Blank lines would otherwise become all-None rows.
                    temp.append(fields)
        logTable = pd.DataFrame(temp)
        features = generateFeature(logTable)

        # Keep the part of the file name from one character before the last
        # 'ex' up to the extension (e.g. 'u_ex170101.log' -> '_ex170101').
        # When 'ex' is at the very start or missing, fall back to the start
        # of the name so that different logs never map to the same output.
        stem = os.path.splitext(filename)[0]
        exPos = stem.rfind('ex')
        logId = stem[max(exPos - 1, 0):] if exPos != -1 else stem
        writePath = os.path.join(featureDir,
                                 logDir + str(i) + logId + '_f.csv')
        features.to_csv(writePath, index=False)
        
        
        
        
