In [1]:
import pandas as pd
import re
import os
import time
import gc
from tqdm import tqdm

In [2]:
# Parse the raw access log into one row per request. Lines the pattern cannot
# match are appended to `errors_file` so they can be inspected afterwards.
data_arr = []
errors_file = 'errors123.txt'
# Compile the line pattern once, outside the loop over the log.
log_pattern = re.compile(r'^(?P<client>\S+) \S+ (?P<userid>\S+) \[(?P<datetime>[^\]]+)\] "(?P<method>[A-Z]+) (?P<request>[^ "]+)? HTTP/[0-9.]+" (?P<status>[0-9]{3}) (?P<size>[0-9]+|-) "(?P<referrer>[^"]*)" "(?P<useragent>[^"]*)')
unmatched_lines = []
with open('../input/web-server-access-logs/access.log') as log:
    # Iterate the file object directly so lines are streamed; readlines() would
    # hold the entire log in memory before the first line is parsed.
    for line in log:
        match = log_pattern.match(line)
        if match:
            # `userid` is captured by the pattern but deliberately not kept.
            data_arr.append(list(match.group(
                'client', 'datetime', 'method', 'request',
                'status', 'size', 'referrer', 'useragent')))
        else:
            unmatched_lines.append(line)
# Open the error file a single time, and only when there is something to record.
if unmatched_lines:
    with open(errors_file, 'at') as errfile:
        for line in unmatched_lines:
            print((line), file=errfile)
df = pd.DataFrame(data_arr, columns=['ip_address', 'datetime', 'request_type', 'request', 'status', 'size', 'referer', 'user_agent'])

In [19]:
# Convert the string columns captured from the log into proper dtypes.
df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%b/%Y:%H:%M:%S %z')
df['status'] = df['status'].astype('int16')
# The log pattern accepts "-" as well as digits for the size field (presumably
# meaning that no bytes were sent). Map it to "0" first, otherwise the integer
# cast raises ValueError on the first such row.
df['size'] = df['size'].replace('-', '0').astype('int32')

In [20]:
# Display the parsed frame as the cell output.
df

Unnamed: 0,ip_address,datetime,request_type,request,status,size,referer,user_agent,GoogleBot,BingBot
0,54.36.149.41,2019-01-22 03:56:14+03:30,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,200,30577,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...,0,0
1,31.56.96.51,2019-01-22 03:56:16+03:30,GET,/image/60844/productModel/200x200,200,5667,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...,0,0
2,31.56.96.51,2019-01-22 03:56:16+03:30,GET,/image/61474/productModel/200x200,200,5379,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...,0,0
3,40.77.167.129,2019-01-22 03:56:17+03:30,GET,/image/14925/productModel/100x100,200,1696,-,Mozilla/5.0 (compatible; bingbot/2.0; +http://...,0,1
4,91.99.72.15,2019-01-22 03:56:17+03:30,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,200,41483,-,Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16...,0,0
...,...,...,...,...,...,...,...,...,...,...
10364861,5.127.220.71,2019-01-26 20:29:13+03:30,GET,/apple-touch-icon-120x120.png,404,32420,-,MobileSafari/604.1 CFNetwork/976 Darwin/18.2.0,0,0
10364856,5.113.60.62,2019-01-26 20:29:13+03:30,GET,/static/images/amp/instagram.png,200,7146,https://www.zanbil.ir/m/article/616/%D8%B9%D9%...,Mozilla/5.0 (Linux; Android 8.0.0; SAMSUNG SM-...,0,0
10364854,192.15.51.231,2019-01-26 20:29:13+03:30,GET,/image/184/productModel/150x150,200,3449,https://www.zanbil.ir/m/product/4125/205/%D8%B...,Mozilla/5.0 (Linux; Android 4.4.4; SM-J110H Bu...,0,0
10364855,109.125.169.52,2019-01-26 20:29:13+03:30,GET,/,200,29950,http://ptcnovin.com/viewads/4B534D5O05649374H5...,Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20...,0,0


In [21]:
# Flag crawler traffic: 1 when the user-agent contains the bot token, else 0.
# The test is a literal (non-regex), case-sensitive substring match done in one
# vectorised pass per column instead of a Python-level call per row.
# Missing/non-string user agents are treated as "not a bot" (0).
df['GoogleBot'] = df['user_agent'].str.contains("Googlebot", regex=False, na=False).astype('int64')
df['BingBot'] = df['user_agent'].str.contains("bingbot", regex=False, na=False).astype('int64')

In [22]:
# Put the requests in chronological order.
df = df.sort_values(by='datetime', ascending=True)

In [23]:
from pathlib import Path
import plotly.graph_objs as go

In [25]:
# Hourly request counts for Bingbot.
# Keep only the timestamps of rows flagged as Bingbot.
bdf = df.loc[df['BingBot'] == 1, 'datetime']
# str(timestamp) looks like "2019-01-22 03:56:14+03:30"; the text before the
# first ':' is the date plus the hour, which is used as the hourly bucket label.
bdf = bdf.apply(lambda x: str(x).split(':')[0])
# Count each bucket once and order by label ("YYYY-MM-DD HH" sorts
# chronologically) so the result does not depend on the row order of df.
bing_hourly = bdf.value_counts(sort=False).sort_index()
bdf_final = pd.DataFrame(
    {'datetime': list(bing_hourly.index),
     'frequency': bing_hourly.values
    })

In [26]:
# Hourly request counts for Googlebot.
# Keep only the timestamps of rows flagged as Googlebot.
Gdf = df.loc[df['GoogleBot'] == 1, 'datetime']
# str(timestamp) looks like "2019-01-22 03:56:14+03:30"; the text before the
# first ':' is the date plus the hour, which is used as the hourly bucket label.
Gdf = Gdf.apply(lambda x: str(x).split(':')[0])
# Count each bucket once and order by label ("YYYY-MM-DD HH" sorts
# chronologically) so the result does not depend on the row order of df.
google_hourly = Gdf.value_counts(sort=False).sort_index()
Gdf_final = pd.DataFrame(
    {'datetime': list(google_hourly.index),
     'frequency': google_hourly.values
    })

In [27]:
from sklearn import preprocessing

# Rescale each bot's hourly counts to a unit-length (L2) vector so the two
# series can be compared on the same axis. normalize() expects a 2-D input,
# hence the single-row wrapper list and the [0] to unwrap the result.
bdf_f = preprocessing.normalize([bdf_final['frequency'].tolist()], norm='l2', axis=1)
bdf_final["frequency"] = bdf_f[0]

Gdf_f = preprocessing.normalize([Gdf_final['frequency'].tolist()], norm='l2', axis=1)
Gdf_final["frequency"] = Gdf_f[0]

In [28]:
# Draw the hourly Bingbot and Googlebot frequency series on one line chart.
fig = go.Figure()
trace_specs = [
    # (source frame, legend label, line colour)
    (bdf_final, 'Bingbot frequency'+'        ', 'firebrick'),
    (Gdf_final, 'Googlebot frequency'+'        ', 'royalblue'),
]
for frame, label, colour in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=frame['datetime'],
            y=frame['frequency'],
            name=label,
            line={'color': colour, 'width': 2},
        )
    )
fig.update_layout(
    font_family='Times New Roman, monospace, 100',
    font_color='black',
    plot_bgcolor='#dbfdff',
    xaxis_title='Time',
    title="Time(in Hrs) VS Frequency",
)
fig.show()

In [33]:
# Make the spaCy English pipeline available as `nlp`.
# The medium model is requested because that is the model the following cell
# loads; downloading a different one leaves that load failing with E050.
# If the download cannot run here (e.g. no network access), install the model
# from a normal command line instead:
#     python -m spacy download en_core_web_md
import spacy

SPACY_MODEL = "en_core_web_md"
try:
    # Try the installed package first so no network access is needed.
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

ConnectionError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /explosion/spacy-models/master/compatibility.json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x75b5d0900810>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [30]:
import spacy
from spacy.tokens import Doc

# Load the medium English pipeline. spacy.load raises OSError (E050) when the
# model package is not installed in the current environment.
nlp = spacy.load(name="en_core_web_md")

OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
%time df['tokenized'] = df['request'][:103648].apply(nlp)

In [None]:
# Take the `.vector` attribute of each processed request in the same
# 103648-row slice.
df['sent_vectors'] = df['tokenized'].iloc[:103648].apply(lambda doc: doc.vector)


In [None]:
import numpy as np

# Stack the per-request vectors into one 2-D array (one row per request).
# A single np.vstack call replaces expanding every row with apply(pd.Series),
# which builds a Series object per row and is very slow at this row count.
vectors = np.vstack(df['sent_vectors'].iloc[:103648].tolist())

In [None]:
# Show the dimensions of the stacked vector array.
vectors.shape

In [None]:
# Saving the array to disk in NumPy's .npy format
import numpy as np
np.save('vector.npy', vectors)

In [None]:
# Clustering: partition the request vectors into k groups with K-Means and
# save the cluster label assigned to every vector.
from sklearn.cluster import KMeans
k = 18
# K-Means starts from random centroids; a fixed seed makes the labels
# reproducible from one run to the next.
kmeans = KMeans(n_clusters=k, random_state=0)
# fit_predict fits the model and returns the label of each training row,
# replacing the separate fit() and predict() calls on the same data.
tret = kmeans.fit_predict(vectors)
with open('tret.npy', 'wb') as f:
    np.save(f, tret)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Elbow method: fit K-Means for every k in 1..19 and plot the inertia (sum of
# squared distances to the nearest centroid) against k.
Sum_of_squared_distances = []
K = range(1,20)
for num_clusters in K:
    # Fixed seed so the curve is reproducible across runs.
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    kmeans.fit(vectors)
    Sum_of_squared_distances.append(kmeans.inertia_)
plt.plot(K,Sum_of_squared_distances,'bx-')
plt.xlabel('Values of K')
# Labels use plain text: typographic quote characters inside the literals were
# being rendered on the figure.
plt.ylabel('Sum of squared distances/Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Silhouette analysis: for each even k in 2..18, cluster the vectors and record
# the mean silhouette score, then plot score against k.
silhouette_avg = []
K = range(2,20,2)
for num_clusters in K:
    # Fixed seed so the scores are reproducible across runs.
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    # fit_predict returns the training labels directly.
    cluster_labels = kmeans.fit_predict(vectors)
    # NOTE(review): the score is computed over all rows here, which is likely
    # expensive at this size; silhouette_score has a `sample_size` option if an
    # approximate score is acceptable — confirm before changing.
    silhouette_avg.append(silhouette_score(vectors, cluster_labels))
plt.plot(K,silhouette_avg,'bx-')
plt.xlabel('Values of K')
plt.ylabel('silhouette_avg')
plt.title('silhouette_avg Method For Optimal k')
plt.show()