In [1]:
# Import dependencies
import pandas as pd
# openpyxl is not referenced directly in the cells shown; presumably imported so that a
# missing .xlsx reader fails here rather than at load time — TODO confirm
import openpyxl
import numpy as np
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas deprecation warnings; consider a narrower filter.
warnings.filterwarnings("ignore")

In [2]:
# Read the "Core_Endpoint" sheet (spreadsheet columns A through R) of the raw workbook
api_df = pd.read_excel(
    "../data/raw/RiskClassification_Data_Endpoints_V1.xlsx",
    sheet_name="Core_Endpoint",
    usecols="A:R",
)
api_df.head()

Unnamed: 0,api_endpoint_id,api_id,api_vendor_id,api_vendor,api,category,usage_base,sample_response,tagset,authentication,security_test_category,security_test_result (FALSE=Passed; TRUE=Failed),server_location,hosting_isp,server_name,response_metadata,hosting city,risk_label_Baljeet
0,2410,1045,361,OneLook,Datamuse Dictionary,Research & Education,free,,"tags,score,word,word,word",none,Broken Authentication,0.0,United States,"Amazon.com, Inc.",Apache/2.4.33 (Amazon) mod_wsgi/3.5 Python/2.7...,"{""Date"": ""Fri, 08 Nov 2019 14:43:17 GMT"", ""Var...",Ashburn,Low
1,2410,1045,361,OneLook,Datamuse Dictionary,Research & Education,free,,"tags,score,word,word,word",none,Broken Authentication,0.0,United States,"Amazon.com, Inc.",Unavailable/Obscured,"{""Date"": ""Fri, 08 Nov 2019 14:43:17 GMT"", ""Var...",Ashburn,No
2,2410,1045,361,OneLook,Datamuse Dictionary,Research & Education,free,,"tags,score,word,word,word",none,Cross-Site Scripting,0.0,United States,"Amazon.com, Inc.",Apache/2.4.33 (Amazon) mod_wsgi/3.5 Python/2.7...,"{""Date"": ""Fri, 08 Nov 2019 14:43:18 GMT"", ""Var...",Ashburn,Low
3,2410,1045,361,OneLook,Datamuse Dictionary,Research & Education,free,,"tags,score,word,word,word",none,Cross-Site Scripting,0.0,United States,"Amazon.com, Inc.",Apache/2.4.33 (Amazon) mod_wsgi/3.5 Python/2.7...,,Ashburn,Low
4,2410,1045,361,OneLook,Datamuse Dictionary,Research & Education,free,,"tags,score,word,word,word",none,Cross-Site Scripting,0.0,United States,"Amazon.com, Inc.",Unavailable/Obscured,"{""Date"": ""Fri, 08 Nov 2019 14:43:18 GMT"", ""Var...",Ashburn,No


In [3]:
# Keep only the endpoint id and the security-test columns, under shorter labels
security_test_df = (
    api_df
    .loc[:, [
        "api_endpoint_id",
        "security_test_category",
        "security_test_result (FALSE=Passed; TRUE=Failed)",
    ]]
    .rename(columns={
        "api_endpoint_id": "endpoint_id",
        "security_test_category": "test_category",
        "security_test_result (FALSE=Passed; TRUE=Failed)": "test_result",
    })
)
security_test_df.head()

Unnamed: 0,endpoint_id,test_category,test_result
0,2410,Broken Authentication,0.0
1,2410,Broken Authentication,0.0
2,2410,Cross-Site Scripting,0.0
3,2410,Cross-Site Scripting,0.0
4,2410,Cross-Site Scripting,0.0


In [4]:
# Distinct test categories found in the data (a missing category shows up as nan)
security_test_df["test_category"].unique().tolist()

['Broken Authentication',
 'Cross-Site Scripting',
 'Insecure Deserialization',
 'SQL Injection',
 'XML External Entities',
 'Buffer Overflow',
 nan]

In [5]:
# Show the rows whose test category is missing
security_test_df.loc[security_test_df["test_category"].isna()]

Unnamed: 0,endpoint_id,test_category,test_result
271,2677,,
295,2687,,
296,2687,,
297,2687,,
298,2687,,
...,...,...,...
416,2687,,
417,2687,,
418,2687,,
419,2687,,


In [6]:
# Rows per endpoint; size() counts every row, so rows with NaN values are included
security_test_df.groupby(by="endpoint_id").size()

endpoint_id
2410     16
2524      1
2532      1
2544     20
2545     60
2546      9
2548      2
2583      4
2585     34
2600     16
2601     13
2612     48
2628     30
2677     19
2681     22
2687    175
2699     10
2702    158
2730    229
2753      2
2754    284
2761      1
2795      5
2796      2
2812    574
2823    265
dtype: int64

In [7]:
# Count the non-null values in each column per endpoint; count() skips NaN
feat_security_df = security_test_df.groupby(by="endpoint_id").count()
feat_security_df

Unnamed: 0_level_0,test_category,test_result
endpoint_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2410,16,16
2524,1,1
2532,1,1
2544,20,20
2545,60,60
2546,9,9
2548,2,2
2583,4,4
2585,34,34
2600,16,16


In [8]:
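# Adding rows with no record of security tests
# NOTE: the draft below is disabled; `security_count_df` and `feat_security_test` are not
# defined in any cell shown here. TODO: finish this step or delete the cell.
#security_count_df["api_id"] = security_count_df.index
#feat_security_test = pd.merge(feat_security_test, security_count_df, on="api_id", how="outer")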

In [9]:
# Adding feature "security_test_conducted": 1 for tests-conducted, 0 for tests-not-conducted.
# An endpoint counts as tested when it has at least one non-null test category.
# The frame is rebuilt from security_test_df instead of adding and dropping columns on
# feat_security_df in place, so re-running this cell yields the same result rather than
# raising KeyError on the already-dropped "test_category" column.
feat_security_df = (
    security_test_df.groupby("endpoint_id")["test_category"]
    .count()           # non-null test categories per endpoint
    .gt(0)             # True where at least one test was recorded
    .astype("int64")   # 1 / 0 flag
    .to_frame("security_test_conducted")
)
feat_security_df.head()

Unnamed: 0_level_0,security_test_conducted
endpoint_id,Unnamed: 1_level_1
2410,1
2524,1
2532,1
2544,1
2545,1


In [10]:
# Per endpoint and security test category, average the test results:
# 1.0 means every test FAILED, 0.0 means every test PASSED,
# and anything in between is the fraction of failed tests
stacked_df = (
    security_test_df
    .groupby(["endpoint_id", "test_category"])[["test_result"]]
    .mean()
)
stacked_df

Unnamed: 0_level_0,Unnamed: 1_level_0,test_result
endpoint_id,test_category,Unnamed: 2_level_1
2410,Broken Authentication,0.0
2410,Cross-Site Scripting,0.0
2410,Insecure Deserialization,0.0
2410,SQL Injection,0.0
2410,XML External Entities,0.0
...,...,...
2796,Broken Authentication,1.0
2812,Insecure Deserialization,0.0
2812,SQL Injection,1.0
2823,Buffer Overflow,0.0


In [11]:
# Wide view: one row per endpoint, one column per test category
stacked_df.unstack(level="test_category")

Unnamed: 0_level_0,test_result,test_result,test_result,test_result,test_result,test_result
test_category,Broken Authentication,Buffer Overflow,Cross-Site Scripting,Insecure Deserialization,SQL Injection,XML External Entities
endpoint_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2410,0.0,,0.0,0.0,0.0,0.0
2524,0.0,,,,,
2532,1.0,,,,,
2544,,0.0,,,0.285714,
2545,0.0,,,,0.0,
2546,,,,,0.0,
2548,,,,,0.0,
2583,,,,0.0,,
2585,,,0.0,,0.533333,
2600,0.0,,0.0,0.0,0.0,0.0


In [12]:
# Combining Security_test_conducted and test_result features.
# stacked_df.unstack() has two column levels while feat_security_df has one. pandas 1.x
# only warned about merging such frames (hidden by the global warnings filter) and
# pandas 2.x raises MergeError. Flattening the right-hand columns to plain tuples first
# keeps the same ("test_result", <category>) labels the mixed-level merge used to produce.
test_result_wide_df = stacked_df.unstack()
test_result_wide_df.columns = test_result_wide_df.columns.to_flat_index()
# Only the flag column is taken from the left frame, so re-running this cell does not
# merge the test-result columns into feat_security_df a second time.
feat_security_df = pd.merge(feat_security_df[["security_test_conducted"]],
                            test_result_wide_df,
                            on="endpoint_id",
                            how="outer")
feat_security_df

Unnamed: 0_level_0,security_test_conducted,"(test_result, Broken Authentication)","(test_result, Buffer Overflow)","(test_result, Cross-Site Scripting)","(test_result, Insecure Deserialization)","(test_result, SQL Injection)","(test_result, XML External Entities)"
endpoint_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2410,1,0.0,,0.0,0.0,0.0,0.0
2524,1,0.0,,,,,
2532,1,1.0,,,,,
2544,1,,0.0,,,0.285714,
2545,1,0.0,,,,0.0,
2546,1,,,,,0.0,
2548,1,,,,,0.0,
2583,1,,,,0.0,,
2585,1,,,0.0,,0.533333,
2600,1,0.0,,0.0,0.0,0.0,0.0


In [13]:
# Impute missing values: where no test result is available, use -1 as the default
feat_security_df.fillna(value=-1, inplace=True)
feat_security_df.round(1)

Unnamed: 0_level_0,security_test_conducted,"(test_result, Broken Authentication)","(test_result, Buffer Overflow)","(test_result, Cross-Site Scripting)","(test_result, Insecure Deserialization)","(test_result, SQL Injection)","(test_result, XML External Entities)"
endpoint_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2410,1,0.0,-1.0,0.0,0.0,0.0,0.0
2524,1,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
2532,1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2544,1,-1.0,0.0,-1.0,-1.0,0.3,-1.0
2545,1,0.0,-1.0,-1.0,-1.0,0.0,-1.0
2546,1,-1.0,-1.0,-1.0,-1.0,0.0,-1.0
2548,1,-1.0,-1.0,-1.0,-1.0,0.0,-1.0
2583,1,-1.0,-1.0,-1.0,0.0,-1.0,-1.0
2585,1,-1.0,-1.0,0.0,-1.0,0.5,-1.0
2600,1,0.0,-1.0,0.0,0.0,0.0,0.0


In [14]:
# Final security-test feature table, displayed rounded to one decimal place
feat_security_df.round(1)

Unnamed: 0_level_0,security_test_conducted,"(test_result, Broken Authentication)","(test_result, Buffer Overflow)","(test_result, Cross-Site Scripting)","(test_result, Insecure Deserialization)","(test_result, SQL Injection)","(test_result, XML External Entities)"
endpoint_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2410,1,0.0,-1.0,0.0,0.0,0.0,0.0
2524,1,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
2532,1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2544,1,-1.0,0.0,-1.0,-1.0,0.3,-1.0
2545,1,0.0,-1.0,-1.0,-1.0,0.0,-1.0
2546,1,-1.0,-1.0,-1.0,-1.0,0.0,-1.0
2548,1,-1.0,-1.0,-1.0,-1.0,0.0,-1.0
2583,1,-1.0,-1.0,-1.0,0.0,-1.0,-1.0
2585,1,-1.0,-1.0,0.0,-1.0,0.5,-1.0
2600,1,0.0,-1.0,0.0,0.0,0.0,0.0
