In [1]:
# Import MAESTRI dataset

# Import correspondence tables

# Match ISIC, WZ and SSIC codes to NACE codes

## ISIC doesn't need NLP

## WZ and SSIC have multiple codes for a given NACE code

### Use an LLM to find the best match given extra info

### 0. Utilities

#### Imports

In [3]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

from utils.constants import EXPORTS_PATH
from utils.inference import read_inference
from utils.maestri import (MAESTRI_DESC_COL, MAESTRI_ROLES, NON_NACE_STDS,
                           export_maestri, get_maestri_code_col, read_maestri)
from utils.maestri.graphs import plot_scores_by_role, plot_scores_by_std
from utils.maestri.similarity import (calc_similarity, get_similarity_col,
                                      str_to_codes)
from utils.maestri.stats import get_similarity_matrix
from utils.maestri.validation import validate_maestri
from utils.types import IndustryCode, IndustryStandard

In [4]:
class NACECompany:
    """A company described by a NACE code and a free-text description.

    Attributes are set in ``__init__``:
      * ``code`` - an ``IndustryCode`` built from ``IndustryStandard.NACE``
        and the raw code string.
      * ``description`` - the description string, stored unchanged.
    """

    def __init__(self, code: str, desc: str) -> None:
        # Store the free-text description as given.
        self.description = desc
        # Wrap the raw code string together with the NACE standard.
        self.code = IndustryCode(IndustryStandard.NACE, code)

### 1. Reading the MAESTRI dataset

#### Importing the spreadsheet

In [5]:
# Load the inference data; the result is indexed by IndustryStandard
# (presumably one DataFrame per standard - confirm in utils.inference).
inference_dfs = read_inference()
# Keep a direct handle on the NACE entry.
NACE_df = inference_dfs[IndustryStandard.NACE]

In [6]:
# Read the MAESTRI dataset. The result is indexed by role position in the
# cells below (presumably one DataFrame per entry of MAESTRI_ROLES - confirm
# in utils.maestri).
maestri_dfs = read_maestri()

def simplify_nace_code_str(string):
    """Return only the first code of a multi-code NACE string.

    Codes may be separated by ',' or ';' (or a mix of both). Everything after
    the first separator is discarded and surrounding whitespace is removed.

    Args:
        string: Raw cell value such as '2351', '2351,2365' or '2351;2365'.

    Returns:
        The first code as a string, e.g. '2351'.
    """
    # Cut at the first comma, then at the first semicolon of what remains, so
    # that mixed input such as '2351;2365,2361' still yields a single code
    # (previously it returned '2351;2365').
    first_chunk = string.split(",", 1)[0]
    return first_chunk.split(";", 1)[0].strip()

# Reduce the NACE column of every role's DataFrame to one code per row.
# The column name is looked up from the standard alone, so it is the same for
# every role and only needs to be resolved once.
nace_col = get_maestri_code_col(IndustryStandard.NACE)

for i in range(len(MAESTRI_ROLES)):
    # The previous `role = MAESTRI_ROLES` bound the whole list (missing `[i]`)
    # and was never used, so it has been dropped.
    maestri_dfs[i][nace_col] = maestri_dfs[i][nace_col].map(simplify_nace_code_str)

In [7]:
# Preview the first rows of the first role's DataFrame after simplifying the NACE codes.
maestri_dfs[0].head()

Unnamed: 0,Company description,NACE code,ISIC code,WZ code,SSIC code
0,Refinery,1920,1920,19200,19201
1,Refinery,1920,1920,19200,19201
2,Steelworks,2410,2410,24520,24310
3,Steelworks,2410,2410,24520,24310
4,Cement production,2351,2394,23510;23650;23610,23940


In [11]:
# def get_match_prompt_from_common_parent(from_code, to_std):
#     common_parent = get_common_parent(from_code, to_std)
#     return get_match_prompt(from_code, get_children(common_parent))

# c = IndustryCode(IndustryStandard.SSIC, "01")
# print(get_match_prompt_from_common_parent(c, IndustryStandard.WZ))

### 2. Obtaining similarity scores for validation

In [13]:
# NACE column name, e.g. 'NACE code'. It is derived from the standard only,
# so it is identical for every role and is resolved once up front.
nace_col = get_maestri_code_col(IndustryStandard.NACE)

# Loop through all company roles (the entries of MAESTRI_ROLES)
for i, role in enumerate(MAESTRI_ROLES):
    print(role)

    # Every standard except NACE, which is the reference the others are matched from
    for std in NON_NACE_STDS:
        print(std.value)

        # Work on a copy so the role's DataFrame is only touched by the
        # final column assignment below.
        df = maestri_dfs[i].copy()

        # Standard column, e.g. 'ISIC code'
        code_col = get_maestri_code_col(std)

        # TODO: restrict the candidate codes to those sharing the level-1 parent
        # of the source code before matching. The draft below is kept for
        # reference only: it was tab-indented inside a space-indented block
        # (TabError) and uses names that are not defined in this notebook
        # (to_guesses_str, to_std, from_code, get_parent).
        # to_codes = [code for code in str_to_codes(to_guesses_str, to_std) if get_parent(code, level=1).value == get_parent(from_code, level=1).value]

        # NOTE(review): get_match is not imported in this notebook - it must be
        # brought into scope before this cell can run on a fresh kernel.
        # NOTE(review): str_to_codes is called as (std, string) here but as
        # (string, std) in the draft above - confirm the order against
        # utils.maestri.similarity.
        ### -> INTRODUCE SIMILAR CODES HERE
        df[code_col] = [
            get_match(NACECompany(nace_code, desc), str_to_codes(std, guesses)).value
            for nace_code, desc, guesses in zip(
                df[nace_col], df[MAESTRI_DESC_COL], df[code_col]
            )
        ]

        maestri_dfs[i][code_col] = df[code_col]

Providing
ISIC
WZ
SSIC
Intermediate
ISIC
WZ
SSIC
Receiving
ISIC
WZ
SSIC


In [16]:
from pathlib import Path

# Write each role's DataFrame to its own CSV file under EXPORTS_PATH
for i, role in enumerate(MAESTRI_ROLES):
    filename = f"MAESTRI_{role}.csv"
    filepath = Path(EXPORTS_PATH / filename)

    # Make sure the target directory exists before writing
    filepath.parent.mkdir(parents=True, exist_ok=True)
    maestri_dfs[i].to_csv(filepath)

In [36]:
# Debugging one problematic row in the 'Receiving' list.
# For this row the description 'Manufacture of plastic plates, sheets, tubes
# and profiles' showed up in place of the SSIC code, which prevented the
# similarity scores from being calculated for the entire list. Workaround used
# at the time: export the DataFrame as a CSV, correct the 'code' by hand, then
# run the similarity function again, after which it worked as expected.
#
# Row info:
# NACE: 2221; Guesses: ISIC: 2220, SSIC: 42201, WZ: 37001
# Company description: PVC sewer pipes production
#
# NOTE(review): get_match is not imported in the visible notebook, and it is
# called here as (company, standard, guesses string) whereas the matching loop
# calls it as (company, str_to_codes(...)) - confirm the current signature.

company = NACECompany("2221", "PVC sewer pipes production")
c = get_match(company, IndustryStandard.WZ, "37001")
inference_dfs[IndustryStandard.WZ].loc[c.value]

Prompt: Classify the company below given its NACE code and description to the most similar of the following WZ codes using their descriptions, examples and exclusions. When returning the result as JSON, the keys for the given code should be 'NACE' and for the matched code as 'WZ':

Company description: PVC sewer pipes production
NACE Code: 2221
Description: Manufacture of plastic plates, sheets, tubes and profiles
Examples: This class includes:- manufacture of semi-manufactures of plastic products:  . plastic plates, sheets, blocks, film, foil, strip etc. (whether self-adhesive or not)- manufacture of finished plastic products:  . plastic tubes, pipes and hoses; hose and pipe fittings  . cellophane film or sheet
Exclusions: This class excludes:- manufacture of plastics in primary forms, see 20.16- manufacture of articles of synthetic or natural rubber, see 22.1

WZ Code: 2221
Description: Manufacture of plastic plates, sheets, tubes and profiles

WZ Code: 2222
Description: Manufacture 

Level                                                          5
Parent                                                      2221
Description    Manufacture of plastic plates, sheets, tubes a...
Name: 22210, dtype: object

In [42]:
# Rebind maestri_dfs to the output of validate_maestri. Because the input name
# is overwritten, re-running this cell passes already-validated data back in.
maestri_dfs = validate_maestri(maestri_dfs)

### 3. Model performance statistics

#### Summary

In [46]:
# Similarity-score breakdown per standard, multiplied by 100 for display
# (presumably is_percent=True yields fractions in [0, 1] - confirm in
# utils.maestri.stats; each row of the rendered table sums to 100).
get_similarity_matrix(maestri_dfs, is_percent=True) * 100

Unnamed: 0,s = -1,s = 0,0 < s < 1,s = 1
ISIC,0.0,0.0,0.0,100.0
WZ,0.0,5.389222,0.0,94.610778
SSIC,0.0,10.02994,0.0,89.97006


#### Visualising similarity scores

##### By company role and industry classification standard

In [48]:
# Chart the similarity scores of the validated data (grouped by role and
# standard, per the helper's name and the section heading).
plot_scores_by_role(maestri_dfs)

##### By industry classification standard

In [50]:
# Chart the similarity scores of the validated data (grouped by standard,
# per the helper's name and the section heading).
plot_scores_by_std(maestri_dfs)

### 4. Exporting results to Excel

In [53]:
# Pass the validated data to the project export helper (an Excel export, per
# the section heading - output location is decided inside export_maestri).
export_maestri(maestri_dfs)