# Exploratory Data Analysis

## Import libraries

In [1]:
import pandas as pd
import openpyxl
from matplotlib import pyplot as plt
import spacy
import seaborn as sns
import altair as alt
alt.renderers.enable('mimetype')

RendererRegistry.enable('mimetype')

## Load both Teejlab and country data

In [2]:
# Teejlab data
api_df = pd.read_excel("../data/raw/RiskClassification_Data_Endpoints_V2.xlsx", "Core_Endpoint",
                      usecols = "A:R")
api_df.head()

Unnamed: 0,api_endpoint_id,api_id,api_vendor_id,api,request_id,method,category,parameters,usage_base,sample_response,tagset,authentication,security_test_category,security_test_result (FALSE=Passed; TRUE=Failed),server_location,hosting_isp,server_name,response_metadata
0,2513,1117,411,Tenor API,7629,get,News & Media,"{""q"": ""Running""}",free,"{\n ""weburl"": ""https://tenor.com/search/runni...",,,,,,,,
1,2578,1148,440,ANZ Products,8698,get,Finance & Banking,{},free,"{""data"":{""products"":[{""additionalInformation"":...","links,eligibilityUri,brand,next,overviewUri,da...",header,,,Singapore,Incapsula Inc,istio-envoy,"{""via"": ""kong/0.36-2-enterprise-edition"", ""x-v..."
2,2575,1147,439,NAB Open APIs,8542,get,Finance & Banking,"{""v"": ""1""}",free,<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</...,,header,,,United States,"Akamai Technologies, Inc.",AkamaiGHost,"{""Date"": ""Wed, 18 Mar 2020 07:27:41 GMT"", ""Ser..."
3,2516,1119,413,Translate Text,7733,get,AI & Data Science,"{""lang"": ""en-zh"", ""text"": ""GNE is a good schoo...",free,"{""code"":200,""lang"":""en-zh"",""text"":[""ç½‘å…³ç½‘å...","lang,code,text,text,text,text,text,text",query,,,Russia,Yandex enterprise network,nginx/1.6.2,"{""Date"": ""Mon, 23 Dec 2019 23:10:35 GMT"", ""Ser..."
4,2416,1050,365,Google Custom Search,24061,get,Software & Services,"{""q"": ""Dehri, Bihar, India""}",free,"{\n ""kind"": ""customsearch#search"",\n ""url"": ...","template,snippet,safe,type,url,items,cacheId,h...",query,,,United States,Google LLC,ESF,"{""Date"": ""Thu, 07 Oct 2021 19:14:31 GMT"", ""Var..."


In [3]:
api_df.dtypes

api_endpoint_id                                       int64
api_id                                                int64
api_vendor_id                                         int64
api                                                  object
request_id                                            int64
method                                               object
category                                             object
parameters                                           object
usage_base                                           object
sample_response                                      object
tagset                                               object
authentication                                       object
security_test_category                               object
security_test_result (FALSE=Passed; TRUE=Failed)    float64
server_location                                      object
hosting_isp                                          object
server_name                             

In [4]:
country_metric_df = pd.read_excel("../data/raw/nri_2021_dataset.xlsx", "NRI 2021 - results",
                      usecols = "B:C", skiprows=1)
                      
country_metric_df.head()

Unnamed: 0,Country,NRI score
0,Netherlands,82.061638
1,Sweden,81.567929
2,Denmark,81.235302
3,United States,81.08982
4,Finland,80.473263


In [5]:
country_metric_df.dtypes

Country       object
NRI score    float64
dtype: object

## Combine df

In [6]:
# api_df_trial = pd.merge([api_df,country_metric_df])

# api_df.join(country_metric_df, how = "left", on='server_location')
api_df_trial = api_df.merge(country_metric_df, left_on = 'server_location', right_on = 'Country')
api_df_trial


Unnamed: 0,api_endpoint_id,api_id,api_vendor_id,api,request_id,method,category,parameters,usage_base,sample_response,tagset,authentication,security_test_category,security_test_result (FALSE=Passed; TRUE=Failed),server_location,hosting_isp,server_name,response_metadata,Country,NRI score
0,2578,1148,440,ANZ Products,8698,get,Finance & Banking,{},free,"{""data"":{""products"":[{""additionalInformation"":...","links,eligibilityUri,brand,next,overviewUri,da...",header,,,Singapore,Incapsula Inc,istio-envoy,"{""via"": ""kong/0.36-2-enterprise-edition"", ""x-v...",Singapore,80.014421
1,2515,1118,412,Text Analysis API,7726,get,AI & Data Science,{},free,"[{""id"":""ar"",""name"":""Ø§Ù„Ø¹Ø±Ø¨ÙŠØ©"",""englishNa...","englishName,nativeEncoding,name,id,latin,right...",header,,,Singapore,Microsoft Corporation,Unavailable/Obscured,"{""Date"": ""Mon, 23 Dec 2019 21:22:57 GMT"", ""Con...",Singapore,80.014421
2,2514,1118,412,Text Analysis API,7725,post,AI & Data Science,{},free,"{""text"":""GNE is a good school, but talent nurt...","severity,length,type,offset,sentence_index,tex...",header,,,Singapore,Microsoft Corporation,Unavailable/Obscured,"{""Date"": ""Mon, 23 Dec 2019 21:22:19 GMT"", ""Con...",Singapore,80.014421
3,2579,1148,440,ANZ Products,8554,get,Finance & Banking,"{""productId"": ""7c277163-1b71-d314-0f1f-b7e713b...",free,"{""data"":{""lastUpdated"":""2019-08-22T22:57:35.61...","isTailored,name,lendingRates,additionalValue,d...",header,,,Singapore,Incapsula Inc,istio-envoy,"{""x-v"": ""1"", ""Date"": ""Wed, 18 Mar 2020 23:03:3...",Singapore,80.014421
4,2575,1147,439,NAB Open APIs,8542,get,Finance & Banking,"{""v"": ""1""}",free,<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</...,,header,,,United States,"Akamai Technologies, Inc.",AkamaiGHost,"{""Date"": ""Wed, 18 Mar 2020 07:27:41 GMT"", ""Ser...",United States,81.089820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,2699,1220,489,currencylayer API,26807,get,Finance & Banking,"{""source"": ""USD""}",free,"{""success"":false,""error"":{""code"":105,""info"":""A...","success,info,error,code,success,code,code,code...",query,,,Spain,"Cloudflare, Inc.",cloudflare,"{""NEL"": ""{\""success_fraction\"":0,\""report_to\""...",Spain,69.942859
99,2730,1237,505,test api 2,19977,get,Health Science & Medicine,{},free,"{\n ""userId"": 1,\n ""id"": 1,\n ""title"": ""del...","id,completed,title,userId,userId,id,id,id,id,i...",none,SQL Injection,1.0,Spain,"Cloudflare, Inc.",cloudflare,"{""Age"": ""23190"", ""NEL"": ""{\""report_to\"":\""cf-n...",Spain,69.942859
100,2796,1239,505,iAPI,22997,get,Business & Technology,"{""match_id"": ""271145471""}",free,"{""match_id"":271145471,""barracks_status_dire"":6...","leagueid,firstblood_claimed,leaver_status,engi...",header,,,Spain,"Cloudflare, Inc.",cloudflare,"{""NEL"": ""{\""report_to\"":\""cf-nel\"",\""max_age\""...",Spain,69.942859
101,2795,1256,542,API for security test Adding random values to ...,23268,get,Security & Technology,"{""match_id"": ""271145471""}",free,"{""error"":""rate limit exceeded""}","item_neutral,leagueid,gold_t,sen,assists,backp...",OAuth2,Broken Authentication,0.0,Spain,"Cloudflare, Inc.",cloudflare,"{""NEL"": ""{\""report_to\"":\""cf-nel\"",\""max_age\""...",Spain,69.942859


In [7]:
# Sanity Check to see if there are columns that do not have NRI score
api_df_trial[api_df_trial['NRI score'].isnull()]

Unnamed: 0,api_endpoint_id,api_id,api_vendor_id,api,request_id,method,category,parameters,usage_base,sample_response,tagset,authentication,security_test_category,security_test_result (FALSE=Passed; TRUE=Failed),server_location,hosting_isp,server_name,response_metadata,Country,NRI score
