In [1]:
# Import dependencies
import pandas as pd
import openpyxl
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data from excel
api_df = pd.read_excel("../data/raw/RiskClassification_Data_Endpoints_V2.xlsx", 
                       "Core_Endpoint",
                       usecols="A:R")
api_df.head()

Unnamed: 0,api_endpoint_id,api_id,api_vendor_id,api,request_id,method,category,parameters,usage_base,sample_response,tagset,authentication,security_test_category,security_test_result (FALSE=Passed; TRUE=Failed),server_location,hosting_isp,server_name,response_metadata
0,2513,1117,411,Tenor API,7629,get,News & Media,"{""q"": ""Running""}",free,"{\n ""weburl"": ""https://tenor.com/search/runni...",,,,,,,,
1,2578,1148,440,ANZ Products,8698,get,Finance & Banking,{},free,"{""data"":{""products"":[{""additionalInformation"":...","links,eligibilityUri,brand,next,overviewUri,da...",header,,,Singapore,Incapsula Inc,istio-envoy,"{""via"": ""kong/0.36-2-enterprise-edition"", ""x-v..."
2,2575,1147,439,NAB Open APIs,8542,get,Finance & Banking,"{""v"": ""1""}",free,<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</...,,header,,,United States,"Akamai Technologies, Inc.",AkamaiGHost,"{""Date"": ""Wed, 18 Mar 2020 07:27:41 GMT"", ""Ser..."
3,2516,1119,413,Translate Text,7733,get,AI & Data Science,"{""lang"": ""en-zh"", ""text"": ""GNE is a good schoo...",free,"{""code"":200,""lang"":""en-zh"",""text"":[""ç½‘å…³ç½‘å...","lang,code,text,text,text,text,text,text",query,,,Russia,Yandex enterprise network,nginx/1.6.2,"{""Date"": ""Mon, 23 Dec 2019 23:10:35 GMT"", ""Ser..."
4,2416,1050,365,Google Custom Search,24061,get,Software & Services,"{""q"": ""Dehri, Bihar, India""}",free,"{\n ""kind"": ""customsearch#search"",\n ""url"": ...","template,snippet,safe,type,url,items,cacheId,h...",query,,,United States,Google LLC,ESF,"{""Date"": ""Thu, 07 Oct 2021 19:14:31 GMT"", ""Var..."


In [3]:
# EDA for Security Test columns
api_df.shape

(113, 18)

In [4]:
api_df["api_endpoint_id"].nunique()

113

In [5]:
api_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113 entries, 0 to 112
Data columns (total 18 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   api_endpoint_id                                   113 non-null    int64  
 1   api_id                                            113 non-null    int64  
 2   api_vendor_id                                     113 non-null    int64  
 3   api                                               113 non-null    object 
 4   request_id                                        113 non-null    int64  
 5   method                                            113 non-null    object 
 6   category                                          113 non-null    object 
 7   parameters                                        113 non-null    object 
 8   usage_base                                        113 non-null    object 
 9   sample_response      

In [6]:
# Selecting relevant columns
security_test_df = api_df[["api_endpoint_id", "security_test_category", "security_test_result (FALSE=Passed; TRUE=Failed)"]]
security_test_df = security_test_df.rename(columns = {"security_test_result (FALSE=Passed; TRUE=Failed)":"security_test_result"})
security_test_df.head()

Unnamed: 0,api_endpoint_id,security_test_category,security_test_result
0,2513,,
1,2578,,
2,2575,,
3,2516,,
4,2416,,


In [7]:
security_test_df[~security_test_df["security_test_result"].isnull()]

Unnamed: 0,api_endpoint_id,security_test_category,security_test_result
13,2524,Broken Authentication,0.0
20,2544,Buffer Overflow,0.0
34,2583,Insecure Deserialization,0.0
43,2600,SQL Injection,1.0
44,2410,Cross-Site Scripting,0.0
46,2628,Cross-Site Scripting,1.0
90,2681,Broken Authentication,0.0
95,2730,SQL Injection,1.0
96,2761,Broken Authentication,1.0
99,2823,Buffer Overflow,0.0


In [8]:
# Define column types
categorical_features = ["security_test_category"]
ordinal_features = ["security_test_result"]
passthrough_features = ["api_endpoint_id"]

# Define levels for ordinal encoder
test_result_levels = [
    0.0,
    0.5,
    1.0
]

In [9]:
# Build preprocessor
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Missing"),
    OneHotEncoder(handle_unknown="ignore", sparse=False),
)

ordinal_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0.5),
    OrdinalEncoder(categories=[test_result_levels], dtype=int)
)

preprocessor = make_column_transformer(
    ("passthrough", passthrough_features),
    (ordinal_transformer, ordinal_features),
    (categorical_transformer, categorical_features),
)

In [10]:
preprocessor

In [11]:
# Transform data using preprocessor
transformed = preprocessor.fit_transform(security_test_df)

In [12]:
transformed.shape

(113, 9)

In [13]:
# Get column names
ohe_features = list(preprocessor.named_transformers_['pipeline-2'].named_steps['onehotencoder'].get_feature_names())

In [14]:
feature_names = passthrough_features + ordinal_features + ohe_features
feature_names

['api_endpoint_id',
 'security_test_result',
 'x0_Broken Authentication',
 'x0_Buffer Overflow',
 'x0_Cross-Site Scripting',
 'x0_Insecure Deserialization',
 'x0_Missing',
 'x0_SQL Injection',
 'x0_XML External Entities']

In [15]:
# Create dataframe for security test features
X_transformed = pd.DataFrame(transformed, columns=feature_names)

In [18]:
# Check transformed data
X_transformed[X_transformed['security_test_result'] == 2.0]

Unnamed: 0,api_endpoint_id,security_test_result,x0_Broken Authentication,x0_Buffer Overflow,x0_Cross-Site Scripting,x0_Insecure Deserialization,x0_Missing,x0_SQL Injection,x0_XML External Entities
43,2600.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
46,2628.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
95,2730.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
96,2761.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
