## Import libraries

In [2]:
import pandas as pd

## Read in excel file to pandas

In [3]:
rule_df = pd.read_excel(
    "../data/raw/RiskClassification_Data_Endpoints_V2.xlsx", sheet_name="RiskRules")

# drop the first coulmn
rule_df = rule_df.iloc[:, 1:]
# change server_location to Amaricas to Americas
rule_df["server_location"] = rule_df["server_location"].replace(
    "Amaricas", "Americas")
# fill the empty values with "None"
rule_df = rule_df.fillna("None")
# remove duplicates
rule_df = rule_df.drop_duplicates()
rule_df

Unnamed: 0,authentication,security_test_category,security_test_result,server_location,hosting_isp,PII,FII,Risk_Label
0,No Authentication,Injections,Fail,Anywhere,Anyone,,,Imminent
5,Some Authentication,Broken Authentication,Fail,Anywhere,Anyone,Yes,Yes,Imminent
10,No Authentication,No Test Performed/Available,,Anywhere,Anyone,Yes,Yes,High
15,Some Authentication,Broken Authentication,Fail,Anywhere,Anyone,No,No,High
20,Some Authentication,All Tests Performed/Available,Pass,Russia,Anyone,Yes,Yes,High
25,Some Authentication,All Tests Performed/Available,Pass,China,Anyone,Yes,Yes,High
30,Some Authentication,No Test Performed/Available,,Anywhere,Anyone,No,No,Medium
35,Some Authentication,Injections,Fail,Anywhere,Anyone,Yes,Yes,Medium
40,Some Authentication,Buffer Overflow,Fail,Anywhere,Anyone,Yes,Yes,Medium
45,Some Authentication,XML External Enteties,Fail,Anywhere,Anyone,Yes,Yes,Medium


In [8]:
api_df = pd.read_excel("../data/processed/df_pii.xlsx")
api_df = api_df[['api_endpoint_id', 'authentication', 'security_test_category',
                 'security_test_result', 'server_location', 'hosting_isp', 'is_pii', 'is_fii']]
api_df


Unnamed: 0,api_endpoint_id,authentication,security_test_category,security_test_result,server_location,hosting_isp,is_pii,is_fii
0,1,query,,,United States,"DigitalOcean, LLC",True,False
1,2,query,,,Canada,Google LLC,True,False
2,2,query,,,Canada,Google LLC,True,False
3,2,query,,,Canada,Google LLC,True,False
4,2,query,,,Canada,Google LLC,True,False
...,...,...,...,...,...,...,...,...
1880,2648,none,,,United States,Amazon Technologies Inc.,False,False
1881,2649,none,,,Ireland,BellSouth.net Inc.,True,False
1882,2650,none,,,Ireland,"Amazon.com, Inc.",True,False
1883,2651,none,,,Ireland,Amazon Technologies Inc.,True,False


In [13]:
# get row containing High
high_risk_rule_df = rule_df[rule_df["Risk_Label"] == "High"]
high_risk_rule_df


Unnamed: 0,authentication,security_test_category,security_test_result,server_location,hosting_isp,PII,FII,Risk_Label
10,No Authentication,No Test Performed/Available,,Anywhere,Anyone,Yes,Yes,High
15,Some Authentication,Broken Authentication,Fail,Anywhere,Anyone,No,No,High
20,Some Authentication,All Tests Performed/Available,Pass,Russia,Anyone,Yes,Yes,High
25,Some Authentication,All Tests Performed/Available,Pass,China,Anyone,Yes,Yes,High


In [14]:
# drop Risk_Label column
high_risk_rule_df = high_risk_rule_df.drop(columns=["Risk_Label"])
high_risk_rule_df


Unnamed: 0,authentication,security_test_category,security_test_result,server_location,hosting_isp,PII,FII
10,No Authentication,No Test Performed/Available,,Anywhere,Anyone,Yes,Yes
15,Some Authentication,Broken Authentication,Fail,Anywhere,Anyone,No,No
20,Some Authentication,All Tests Performed/Available,Pass,Russia,Anyone,Yes,Yes
25,Some Authentication,All Tests Performed/Available,Pass,China,Anyone,Yes,Yes


In [49]:
rules_dict = {}
rules_dict["Some Authentication"] = ["query", "OAuth2",
                                    "header", "BasicAuth", "body", "OAuth1", "path"]
rules_dict["No Authentication"] = ["None"]

rules_dict["All Tests Performed/Available"] = ['XML External Entities',
                                                      'Cross-Site Scripting', 'Insecure Deserialization', 'SQL Injection']
rules_dict["Broken Authentication"] = ['Broken Authentication']
rules_dict["No Test Performed/Available"] = ['None']

rules_dict["None"] = ["None"]
rules_dict["Fail"] = [0]
rules_dict["Pass"] = [1]

rules_dict["Russia"] = ["Russia"]
rules_dict["China"] = ["China"]
rules_dict["Anywhere"] = ['United States', 'Canada', 'Germany', 'Netherlands',
                             'United Kingdom', 'Ireland', 'Australia', 'India', 'Luxembourg',
                             'Sweden', 'France', 'Spain', 'Bangladesh', 'Singapore',
                             'Czechia']
rules_dict["Anyone"] = ['United States', 'Canada', 'Germany', 'Netherlands',
                        'United Kingdom', 'Ireland', 'Australia', 'India', 'Luxembourg',
                        'Sweden', 'France', 'Spain', 'Bangladesh', 'Singapore', 'Russia',
                        'Czechia']
rules_dict["Yes"] = ["Yes"]
rules_dict["No"] = ["No"]

                             


In [9]:
# get columns from api_df
api_df_cols = api_df.columns
api_df_cols


Index(['api_endpoint_id', 'authentication', 'security_test_category',
       'security_test_result', 'server_location', 'hosting_isp', 'is_pii',
       'is_fii'],
      dtype='object')

In [12]:
# create dataframe with columns from api_df_cols
new_df = pd.DataFrame(columns=  api_df_cols)
new_df


Unnamed: 0,api_endpoint_id,authentication,security_test_category,security_test_result,server_location,hosting_isp,is_pii,is_fii


In [54]:
import random
starting_id = 3000
# create 100 rows
for i in range(10):
    # generate random values for each column
    starting_id += 1
    new_df.loc[i, "api_endpoint_id"] = starting_id
    # pick a random row from high_risk_rule_df
    random_row = high_risk_rule_df.sample(1)

    #authentication
    auth_rule_pick = random_row['authentication'].to_string(index=False)
    auth_rule_value_list = rules_dict[auth_rule_pick]
    # pick random value from auth_rule_value_list
    auth_rule_value = random.choice(auth_rule_value_list)
    new_df.loc[i, "authentication"] = auth_rule_value

    #security_test_category
    security_test_category_rule_pick = random_row['security_test_category'].to_string(index=False)
    security_test_category_rule_value_list = rules_dict[security_test_category_rule_pick]
    security_test_category_rule_value = random.choice(security_test_category_rule_value_list)
    new_df.loc[i, "security_test_category"] = security_test_category_rule_value

    #security_test_result
    security_test_result_rule_pick = random_row['security_test_result'].to_string(index=False)
    security_test_result_rule_value_list = rules_dict[security_test_result_rule_pick]
    security_test_result_rule_value = random.choice(security_test_result_rule_value_list)
    new_df.loc[i, "security_test_result"] = security_test_result_rule_value

    #server_location
    server_location_rule_pick = random_row['server_location'].to_string(index=False)
    server_location_rule_value_list = rules_dict[server_location_rule_pick]
    server_location_rule_value = random.choice(server_location_rule_value_list)
    new_df.loc[i, "server_location"] = server_location_rule_value

    #hosting_isp
    hosting_isp_rule_pick = random_row['hosting_isp'].to_string(index=False)
    hosting_isp_rule_value_list = rules_dict[hosting_isp_rule_pick]
    hosting_isp_rule_value = random.choice(hosting_isp_rule_value_list)
    new_df.loc[i, "hosting_isp"] = hosting_isp_rule_value
    

    #is_pii
    is_pii_rule_pick = random_row['PII'].to_string(index=False)
    is_pii_rule_value_list = rules_dict[is_pii_rule_pick]
    is_pii_rule_value = random.choice(is_pii_rule_value_list)
    new_df.loc[i, "is_pii"] = is_pii_rule_value

    #is_fii
    is_fii_rule_pick = random_row['FII'].to_string(index=False)
    is_fii_rule_value_list = rules_dict[is_fii_rule_pick]
    is_fii_rule_value = random.choice(is_fii_rule_value_list)
    new_df.loc[i, "is_fii"] = is_fii_rule_value



new_df 


Unnamed: 0,api_endpoint_id,authentication,security_test_category,security_test_result,server_location,hosting_isp,is_pii,is_fii
0,3001,query,XML External Entities,1.0,China,Ireland,Yes,Yes
1,3002,,,,Canada,United Kingdom,Yes,Yes
2,3003,,,,France,Netherlands,Yes,Yes
3,3004,,,,Canada,United States,Yes,Yes
4,3005,body,Broken Authentication,0.0,France,United Kingdom,No,No
5,3006,path,SQL Injection,1.0,Russia,Canada,Yes,Yes
6,3007,OAuth1,Broken Authentication,0.0,United States,India,No,No
7,3008,OAuth1,SQL Injection,1.0,Russia,Canada,Yes,Yes
8,3009,BasicAuth,Broken Authentication,0.0,United States,Russia,No,No
9,3010,,,,Netherlands,Spain,Yes,Yes
