**Use basic probability to identify anomalous requests. Using the methods covered in this lesson, examine the rest of the features in the api access logs data set.**

In [3]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

import env

In [2]:
from sqlalchemy import create_engine, text

In [5]:
# Connection URL for the 'logs' database, built by the project-local env helper
# (NOTE(review): presumably keeps credentials out of the notebook — confirm in env.py).
url = env.get_db_url('logs')

In [7]:
# Load every row of the api_access table into a DataFrame; the URL string is
# passed to pandas directly as the connection.
api = pd.read_sql('''SELECT * FROM api_access''', url)

# Preview the raw rows: each one is a single unparsed log line in `entry`.
api.head()

Unnamed: 0,entry
0,97.105.19.58 - - [16/Apr/2019:19:34:42 +0000] ...
1,97.105.19.58 - - [16/Apr/2019:19:34:42 +0000] ...
2,97.105.19.58 - - [16/Apr/2019:19:34:44 +0000] ...
3,97.105.19.58 - - [16/Apr/2019:19:34:46 +0000] ...
4,97.105.19.58 - - [16/Apr/2019:19:34:48 +0000] ...


In [12]:
# Show the first raw log line in full, since head() truncates it.
api.entry[0]

'97.105.19.58 - - [16/Apr/2019:19:34:42 +0000] "GET /api/v1/sales?page=81 HTTP/1.1" 200 512495 "-" "python-requests/2.21.0"'

In [22]:
# Split every raw entry on whitespace to see which token position holds which field.
parts = api.entry.str.split()

In [23]:
# Tokens of the first entry; these positions are what parse_log_entry indexes into.
parts[0]

['97.105.19.58',
 '-',
 '-',
 '[16/Apr/2019:19:34:42',
 '+0000]',
 '"GET',
 '/api/v1/sales?page=81',
 'HTTP/1.1"',
 '200',
 '512495',
 '"-"',
 '"python-requests/2.21.0"']

In [24]:
# function to deal with parsing one entry in our log data
def parse_log_entry(entry):
    """Parse one access-log line into a labelled pandas Series.

    The line is split on whitespace and fields are picked by token position,
    matching the layout
    ``ip - - [day/mon/year:time zone] "METHOD path HTTP/x" status size "referer" "user agent"``.

    Parameters
    ----------
    entry : str
        A single raw log line.

    Returns
    -------
    pandas.Series
        Index, in order: ``ip``, ``timestamp``, ``request_method``,
        ``request_path``, ``http_version``, ``status_code``, ``size``,
        ``user_agent``. ``size`` is an int; every other value is a string.

    Raises
    ------
    IndexError
        If the line has fewer than 10 whitespace-separated tokens.
    ValueError
        If the size token is neither an integer nor ``'-'``.
    """
    parts = entry.split()

    # The common/combined log format writes '-' instead of 0 when no bytes
    # were sent; int('-') would raise, so treat it as a zero-byte response.
    raw_size = parts[9]
    size = 0 if raw_size == '-' else int(raw_size)

    output = {
        'ip': parts[0],
        # Drop the leading '[' and turn the first ':' (between date and time)
        # into a space so the value reads "16/Apr/2019 19:34:42".
        'timestamp': parts[3][1:].replace(':', ' ', 1),
        # Strip the opening quote of the request line.
        'request_method': parts[5][1:],
        'request_path': parts[6],
        # Strip the closing quote of the request line.
        'http_version': parts[7][:-1],
        'status_code': parts[8],
        'size': size,
        # Everything after the referer token is the user agent, which may
        # itself contain spaces; re-join it and remove the surrounding quotes.
        'user_agent': ' '.join(parts[11:]).replace('"', ''),
    }
    return pd.Series(output)

In [31]:
# Parse every raw log line into structured columns. The guard makes this cell
# safe to re-run: once `api` has been replaced by the parsed frame it no longer
# has an `entry` column, and a second run would otherwise raise AttributeError.
if 'entry' in api.columns:
    api = api.entry.apply(parse_log_entry)

In [32]:
# Check that the parsed columns look right.
api.head()

Unnamed: 0,ip,timestamp,request_method,request_path,http_version,status_code,size,user_agent
0,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/sales?page=81,HTTP/1.1,200,512495,python-requests/2.21.0
1,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0
2,97.105.19.58,16/Apr/2019 19:34:44,GET,/api/v1/sales?page=82,HTTP/1.1,200,510103,python-requests/2.21.0
3,97.105.19.58,16/Apr/2019 19:34:46,GET,/api/v1/sales?page=83,HTTP/1.1,200,510003,python-requests/2.21.0
4,97.105.19.58,16/Apr/2019 19:34:48,GET,/api/v1/sales?page=84,HTTP/1.1,200,511963,python-requests/2.21.0


In [33]:
# Count requests per IP address; IPs that appear only a handful of times are candidates for anomalies.

api.ip.value_counts()

97.105.19.58      11998
173.173.113.51     1059
72.181.113.170      613
72.181.105.81       246
24.26.242.9          21
68.201.219.223       21
70.121.214.34         2
52.87.230.102         2
35.175.171.137        2
54.145.52.184         1
3.92.201.136          1
35.174.209.2          1
34.229.70.250         1
54.172.14.223         1
34.207.64.242         1
3.88.129.158          1
52.91.30.150          1
52.90.165.200         1
45.23.250.16          1
Name: ip, dtype: int64

In [36]:
# Repeat the frequency count on the other columns, starting with user agent, to see if anything stands out.

api.user_agent.value_counts()

python-requests/2.21.0                                                                                                       12001
python-requests/2.20.1                                                                                                        1911
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36       34
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0                                               8
Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)                                                                       7
Slackbot 1.0 (+https://api.slack.com/robots)                                                                                     6
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36        4
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gec

Slackbot might be an anomalous user agent. Python-urllib might be another one.

In [38]:
# Frequency of each requested path (query string included); rare paths are worth a look.
api.request_path.value_counts()

/api/v1/sales?page=2                 709
/api/v1/items                        464
/api/v1/items?page=2                 291
/api/v1/items?page=3                 219
/api/v1/stores                       162
                                    ... 
/api/v1/items?page=0                   1
/api/v1/stores?page=999                1
/api/v1/stores?page=2                  1
/api/v1/stores?page=666                1
/api/v1/items/api/v1/items?page=3      1
Name: request_path, Length: 218, dtype: int64

It's odd that someone would try to go to page number 999... some of these paths with very small counts are suspect.

In [39]:
# Frequency of each HTTP version seen in the logs.
api.http_version.value_counts()

HTTP/1.1    13974
Name: http_version, dtype: int64

Nothing odd here: every request used HTTP/1.1.

In [41]:
# Frequency of each HTTP request method seen in the logs.
api.request_method.value_counts()

GET    13974
Name: request_method, dtype: int64

Nothing odd here: every request was a GET.

### Conditional Probabilities

In [43]:
# This section looks at user agent conditioned on IP address.
# Start with the marginal probability of each IP: its share of all requests.

api.ip.value_counts(normalize=True)

97.105.19.58      0.858595
173.173.113.51    0.075784
72.181.113.170    0.043867
72.181.105.81     0.017604
24.26.242.9       0.001503
68.201.219.223    0.001503
70.121.214.34     0.000143
52.87.230.102     0.000143
35.175.171.137    0.000143
54.145.52.184     0.000072
3.92.201.136      0.000072
35.174.209.2      0.000072
34.229.70.250     0.000072
54.172.14.223     0.000072
34.207.64.242     0.000072
3.88.129.158      0.000072
52.91.30.150      0.000072
52.90.165.200     0.000072
45.23.250.16      0.000072
Name: ip, dtype: float64

In [51]:
# P(user_agent | ip): within each IP, the share of its requests made by each user agent.
user_given_ip = (
    api.groupby('ip')['user_agent']
    .value_counts(normalize=True)
    .rename('prob_user_given_ip')
    .reset_index()
)

In [52]:
# One row per (ip, user_agent) pair with its conditional probability.
user_given_ip

Unnamed: 0,ip,user_agent,prob_user_given_ip
0,173.173.113.51,python-requests/2.21.0,1.0
1,24.26.242.9,python-requests/2.21.0,1.0
2,3.88.129.158,Slackbot-LinkExpanding 1.0 (+https://api.slack...,1.0
3,3.92.201.136,Slackbot-LinkExpanding 1.0 (+https://api.slack...,1.0
4,34.207.64.242,Slackbot 1.0 (+https://api.slack.com/robots),1.0
5,34.229.70.250,Slackbot 1.0 (+https://api.slack.com/robots),1.0
6,35.174.209.2,Slackbot 1.0 (+https://api.slack.com/robots),1.0
7,35.175.171.137,Slackbot-LinkExpanding 1.0 (+https://api.slack...,1.0
8,45.23.250.16,python-requests/2.21.0,1.0
9,52.87.230.102,Slackbot 1.0 (+https://api.slack.com/robots),0.5


In [55]:
# Keep only the (ip, user_agent) pairs whose probability is below 100%, i.e. drop
# the IPs that always use a single user agent, and list the rarest pairs first.

user_given_ip[user_given_ip.prob_user_given_ip < 1].sort_values('prob_user_given_ip')

Unnamed: 0,ip,user_agent,prob_user_given_ip
26,97.105.19.58,Python-urllib/3.7,8.3e-05
25,97.105.19.58,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3...,0.000167
24,97.105.19.58,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4...,0.002167
21,72.181.113.170,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3...,0.003263
18,72.181.105.81,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; ...,0.03252
19,72.181.105.81,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4...,0.03252
23,97.105.19.58,python-requests/2.20.1,0.159277
9,52.87.230.102,Slackbot 1.0 (+https://api.slack.com/robots),0.5
10,52.87.230.102,Slackbot-LinkExpanding 1.0 (+https://api.slack...,0.5
22,97.105.19.58,python-requests/2.21.0,0.838306


In [58]:
# The most common IP address shows up with a very rare user agent, so pull the
# matching requests to inspect them.

from_top_ip = api.ip == '97.105.19.58'
via_urllib = api.user_agent == 'Python-urllib/3.7'
api[from_top_ip & via_urllib]

Unnamed: 0,ip,timestamp,request_method,request_path,http_version,status_code,size,user_agent
6693,97.105.19.58,16/Apr/2019 20:57:38,GET,/api/v1/items&page=0,HTTP/1.1,200,162,Python-urllib/3.7
