In [1]:
# Import dependencies
import pandas as pd
import openpyxl
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the "Core_Endpoint" sheet (spreadsheet columns A through R) of the raw workbook
api_df = pd.read_excel(
    io="../data/raw/RiskClassification_Data_Endpoints_V2.xlsx",
    sheet_name="Core_Endpoint",
    usecols="A:R",
)
api_df.head()

Unnamed: 0,api_endpoint_id,api_id,api_vendor_id,api,request_id,method,category,parameters,usage_base,sample_response,tagset,authentication,security_test_category,security_test_result (FALSE=Passed; TRUE=Failed),server_location,hosting_isp,server_name,response_metadata
0,2052,762,276,Natural Language Processing,10789,post,AI & Data Science,"{""Content-Type"": ""application/json"", ""textToDe...",commercial,"{""Successful"":true,""DetectedLanguage_ThreeLett...","DetectedLanguage_FullName,DetectedLanguage_Thr...",header,,,United States,OVH Hosting,Cloudmersive Server,"{""Date"": ""Thu, 13 May 2021 23:23:26 GMT"", ""Pra..."
1,2513,1117,411,Tenor API,7629,get,News & Media,"{""q"": ""Running""}",free,"{\n ""weburl"": ""https://tenor.com/search/runni...",,,,,,,,
2,2578,1148,440,ANZ Products,8698,get,Finance & Banking,{},free,"{""data"":{""products"":[{""additionalInformation"":...","links,eligibilityUri,brand,next,overviewUri,da...",header,,,Singapore,Incapsula Inc,istio-envoy,"{""via"": ""kong/0.36-2-enterprise-edition"", ""x-v..."
3,2575,1147,439,NAB Open APIs,8542,get,Finance & Banking,"{""v"": ""1""}",free,<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</...,,header,,,United States,"Akamai Technologies, Inc.",AkamaiGHost,"{""Date"": ""Wed, 18 Mar 2020 07:27:41 GMT"", ""Ser..."
4,2516,1119,413,Translate Text,7733,get,AI & Data Science,"{""lang"": ""en-zh"", ""text"": ""GNE is a good schoo...",free,"{""code"":200,""lang"":""en-zh"",""text"":[""ç½‘å…³ç½‘å...","lang,code,text,text,text,text,text,text",query,,,Russia,Yandex enterprise network,nginx/1.6.2,"{""Date"": ""Mon, 23 Dec 2019 23:10:35 GMT"", ""Ser..."


In [18]:
# Number of distinct endpoint ids in the loaded sheet
api_df.loc[:, "api_endpoint_id"].nunique()

113

In [3]:
# Keep only the endpoint id and the two security-test columns, under shorter names
security_test_df = api_df.loc[
    :,
    [
        "api_endpoint_id",
        "security_test_category",
        "security_test_result (FALSE=Passed; TRUE=Failed)",
    ],
].rename(
    columns={
        "api_endpoint_id": "endpoint_id",
        "security_test_category": "test_category",
        "security_test_result (FALSE=Passed; TRUE=Failed)": "test_result",
    }
)
security_test_df.head()

Unnamed: 0,endpoint_id,test_category,test_result
0,2052,,
1,2513,,
2,2578,,
3,2575,,
4,2516,,


In [4]:
# Distinct values found in test_category (NaN counts as a value here)
security_test_df["test_category"].unique().tolist()

[nan,
 'Broken Authentication',
 'Buffer Overflow',
 'Insecure Deserialization',
 'SQL Injection',
 'Cross-Site Scripting',
 'XML External Entities']

In [5]:
# Rows whose test_category is missing (NaN)
security_test_df.loc[security_test_df["test_category"].isna()]

Unnamed: 0,endpoint_id,test_category,test_result
0,2052,,
1,2513,,
2,2578,,
3,2575,,
4,2516,,
...,...,...,...
105,2702,,
106,2821,,
107,2743,,
110,2837,,


In [6]:
# Rows per endpoint_id; size() counts every row, including rows whose test fields are NaN
security_test_df.groupby(by="endpoint_id").size()

endpoint_id
2052    1
2410    1
2416    1
2512    1
2513    1
       ..
2830    1
2834    1
2836    1
2837    1
2838    1
Length: 113, dtype: int64

In [7]:
# Count the non-NaN entries of each remaining column per endpoint_id
# (count() skips NaN, so endpoints without a recorded test get 0)
feat_security_df = security_test_df.groupby(by="endpoint_id").count()
feat_security_df

Unnamed: 0_level_0,test_category,test_result
endpoint_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2052,0,0
2410,1,1
2416,0,0
2512,0,0
2513,0,0
...,...,...
2830,1,1
2834,0,0
2836,1,1
2837,0,0


In [8]:
# Adding rows with no record of security tests
# NOTE(review): the two lines below are disabled code. They reference
# `security_count_df` and `feat_security_test`, neither of which is defined in the
# visible cells -- presumably left over from an earlier approach; confirm and delete.
#security_count_df["api_id"] = security_count_df.index
#feat_security_test = pd.merge(feat_security_test, security_count_df, on="api_id", how="outer")

In [9]:
# Adding feature "security_test_conducted": 1 for tests-conducted, 0 for tests-not-conducted.
# The flag is derived directly from security_test_df instead of mutating the per-endpoint
# counts frame in place. The earlier form dropped "test_category" with inplace=True, so a
# second execution of the cell raised KeyError; this form can be re-run safely and yields
# the same frame (index: endpoint_id, single int64 column).
feat_security_df = (
    security_test_df.groupby("endpoint_id")["test_category"]
    .count()              # non-NaN test categories recorded per endpoint
    .gt(0)                # True when at least one test is recorded
    .astype("int64")      # 1 / 0 flag
    .to_frame("security_test_conducted")
)
feat_security_df.head()

Unnamed: 0_level_0,security_test_conducted
endpoint_id,Unnamed: 1_level_1
2052,0
2410,1
2416,0
2512,0
2513,0


In [10]:
# Mean of test_result for every (endpoint_id, test_category) pair.
# TRUE means "failed" in the source column, so the mean is the share of failed tests:
# 1.0 when every recorded test failed, 0.0 when every recorded test passed.
stacked_df = security_test_df.groupby(by=["endpoint_id", "test_category"]).mean()
stacked_df

Unnamed: 0_level_0,Unnamed: 1_level_0,test_result
endpoint_id,test_category,Unnamed: 2_level_1
2410,Cross-Site Scripting,0.0
2524,Broken Authentication,0.0
2544,Buffer Overflow,0.0
2583,Insecure Deserialization,0.0
2600,SQL Injection,1.0
2628,Cross-Site Scripting,1.0
2677,Buffer Overflow,0.0
2681,Broken Authentication,0.0
2730,SQL Injection,1.0
2754,Insecure Deserialization,0.0


In [11]:
# Wide view: one row per endpoint_id, one column per test_category
stacked_df.unstack(level=-1)

Unnamed: 0_level_0,test_result,test_result,test_result,test_result,test_result,test_result
test_category,Broken Authentication,Buffer Overflow,Cross-Site Scripting,Insecure Deserialization,SQL Injection,XML External Entities
endpoint_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2410,,,0.0,,,
2524,0.0,,,,,
2544,,0.0,,,,
2583,,,,0.0,,
2600,,,,,1.0,
2628,,,1.0,,,
2677,,0.0,,,,
2681,0.0,,,,,
2730,,,,,1.0,
2754,,,,0.0,,


In [12]:
# Combining security_test_conducted and test_result features.
# stacked_df.unstack() has two column levels ("test_result", <category>) while
# feat_security_df has one. Merging frames with a different number of column levels was
# deprecated and raises MergeError in pandas >= 2.0, so the right-hand columns are
# flattened first -- to the same ("test_result", <category>) tuple labels the old merge
# produced, which keeps the resulting column names unchanged.
test_result_wide = stacked_df.unstack()
test_result_wide.columns = test_result_wide.columns.to_flat_index()

# Only the flag column is taken from the left side, so re-running this cell does not
# merge previously added test-result columns onto themselves (_x/_y duplicates).
feat_security_df = pd.merge(feat_security_df[["security_test_conducted"]],
                            test_result_wide,
                            on="endpoint_id",
                            how="outer")
feat_security_df

Unnamed: 0_level_0,security_test_conducted,"(test_result, Broken Authentication)","(test_result, Buffer Overflow)","(test_result, Cross-Site Scripting)","(test_result, Insecure Deserialization)","(test_result, SQL Injection)","(test_result, XML External Entities)"
endpoint_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2052,0,,,,,,
2410,1,,,0.0,,,
2416,0,,,,,,
2512,0,,,,,,
2513,0,,,,,,
...,...,...,...,...,...,...,...
2830,1,,0.0,,,,
2834,0,,,,,,
2836,1,,,,,,0.0
2837,0,,,,,,


In [16]:
# Missing test results get the sentinel -1; all values are then rounded to one decimal
feat_security_df = feat_security_df.fillna(-1).round(decimals=1)

In [17]:
# Final Security Test features: one row per endpoint_id, holding the
# security_test_conducted flag plus one test_result column per test category
# (-1 where no result is recorded for that category)
feat_security_df

Unnamed: 0_level_0,security_test_conducted,"(test_result, Broken Authentication)","(test_result, Buffer Overflow)","(test_result, Cross-Site Scripting)","(test_result, Insecure Deserialization)","(test_result, SQL Injection)","(test_result, XML External Entities)"
endpoint_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2052,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2410,1,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
2416,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2512,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2513,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...
2830,1,-1.0,0.0,-1.0,-1.0,-1.0,-1.0
2834,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2836,1,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
2837,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
