In [34]:
import pandas as pd
from enum import Enum

In [37]:
maestri_df = pd.read_excel("data/Exchanges-database Maestri.xlsx", sheet_name="MAESTRI", dtype=str)

# Replace NaN values with empty strings
maestri_df = maestri_df.fillna("")

# Remove carets and asterisks
maestri_df.replace([r"\^|\*|#"], "", regex=True, inplace=True)

# Industry clasification standards (ICSs)
class IndustryStandard(Enum):
    NACE = "NACE"
    ISIC = "ISIC"
    WZ = "WZ"
    SSIC = "SSIC"

# ICSs as a list
ind_stds = [std.value for std in IndustryStandard]

# Roles of companies in industrial symbiosis
company_types = (("Providing", "Provider"), ("Intermediate", "Intermediary"), ("Receiving", "Receiver"))

# Retrieve the name of the column based on ICS and company role
old_col_name = lambda std, type: f"{std} code - {type} industry"

# Aggregate relevant column names for data validation
cols_list = [[old_col_name(std, next(iter(type))) for std in ind_stds] for type in company_types]

# Obtain subsets within the original dataset for validation
test_dfs = [maestri_df[cols].copy() for cols in cols_list]

# Rename column name to be more concise
new_col_name = lambda std, type: f"{type} {std} code"

# Rename columns within subsets
for i in range(len(company_types)):
    old_role, new_role = company_types[i]
    col_dict = dict()
    
    for std in ind_stds:
        k = old_col_name(std, old_role)
        v = new_col_name(std, new_role)
        
        col_dict.update({k: v})
    
    test_dfs[i] = test_dfs[i].rename(columns=col_dict)

Unnamed: 0,Intermediary NACE code,Intermediary ISIC code,Intermediary WZ code,Intermediary SSIC code
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
300,,,,
301,,,,
302,,,,
303,3832,3830,38320,28225


In [42]:
df = test_dfs[0]
t = company_types[0][1]

# Ensure the strings are of comparable length
def comparable_strs(lst):
    min_len = min([len(code) for code in lst])
    return [str[:min_len] for str in lst]

# Assumption: If the difference of codes of the same level is within a given threshold, then they are equal
def similarity_score(nace_code_str, std_code_str):
    diff_threshold = 5
    
    nace_codes = nace_code_str.split(";")
    std_codes = std_code_str.split(";")
    
    if len(nace_codes) > 1:
        # if len(nace_codes) == len(std_codes):
        return ""
    else:
        nace_code = nace_codes[0]
        
        if nace_code != "":
            codes = [nace_code, *std_codes]
            comparable_codes = comparable_strs(codes)
            
            diffs = [abs(int(nace_code) - int(code)) for code in comparable_codes]
            results = [1 if diff < diff_threshold else 0 for diff in diffs]
            
            return str(sum(results) / len(results)) if std_code_str != "" else ""

        return ""

for df in test_dfs:
    for type in company_types:
        role = type[1]
    
        for std in ind_stds[1:]:
            nace_col_name = new_col_name(ind_stds[0], role)
            std_col_name = new_col_name(std, role)
            similarity_score_col_name = f"{std} code match score"
            
            df[similarity_score_col_name] = [similarity_score(*a) for a in zip(df[nace_col_name], df[std_col_name])]



KeyError: 'Intermediary NACE code'