In [14]:
# Import MAESTRI dataset

# Import correspondence tables

# Match ISIC, WZ and SSIC codes to NACE codes

## ISIC doesn't need NLP

## WZ and SSIC have multiple codes for a given NACE code

### Use a LLM to find the best match given extra info



### 0. Utilities

#### Imports

In [15]:
import numpy as np
import pandas as pd
import re

In [16]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

from utils.constants import STANDARDS, DATA_PATH
from utils.types import IndustryStandard, IndustryCode
from utils.functions.files import read_inference, read_correspondence

#### Constants

In [17]:
# Non-NACE standards
NON_NACE_STDS = STANDARDS[1:]

# Standard code difference threshold, 't'
DIFF_THRESHOLD = 5

# Roles of companies in industrial symbiosis
old_roles = ("Providing", "Intermediate", "Receiving")

# New roles (for readability)
roles = ("Donor", "Intermediary", "Receiver")

inference_dfs = read_inference()

#### Helper functions

##### Obtaining column names

In [18]:
# Obtain column name based on the imported MAESTRI dataset, given ICS and company role
old_col = lambda std, role: f"{std.value} code - {role} industry"

# Obtain concise column name, given ICS and company role
new_col = lambda std, role: f"{role} {std.value} code"

# Obtain column name for similarity score, given ICS and company role
similarity_col = lambda std, role: f"{role} {std.value} code sim. score"

### 1. Reading the MAESTRI dataset

#### Importing the spreadsheet

In [19]:
# Read the MAESTRI dataset as a DataFrame
maestri_df = pd.read_excel(f"{DATA_PATH}/Exchanges-database Maestri.xlsx", sheet_name="MAESTRI", dtype=str)

# Replace NaN values with empty strings
maestri_df = maestri_df.fillna("")

# Remove carets, asterisks and hashes
maestri_df.replace([r"\^|\*|#"], "", regex=True, inplace=True)

#### Split the main dataset into DataFrames for each role (i.e., provider, intermediary, receiver)

In [20]:
# Aggregate relevant column names for data validation
cols_list = [[old_col(std, role) for std in STANDARDS] for role in old_roles]

# Obtain subsets within the original dataset for validation
test_dfs = [maestri_df[cols].copy() for cols in cols_list]

# Rename columns within subsets
for i in range(len(roles)):
    col_dict = dict()
    old_role = old_roles[i]
    new_role = roles[i]
    
    for std in STANDARDS:
        k = old_col(std, old_role)
        v = new_col(std, new_role)
        
        col_dict.update({k: v})
    
    test_dfs[i] = test_dfs[i].rename(columns=col_dict)
    
    # Drop rows with null values for the NACE code
    # Source: https://stackoverflow.com/questions/29314033/drop-rows-containing-empty-cells-from-a-pandas-dataframe
    std = STANDARDS[i + 1]
    test_dfs[i] = test_dfs[i][   test_dfs[i][new_col(std, new_role)].astype(bool)   ]

In [21]:
test_dfs[0].head()

Unnamed: 0,Donor NACE code,Donor ISIC code,Donor WZ code,Donor SSIC code
0,1920,1920,19200,19201
1,1920,1920,19200,19201
2,2410,2410,24520,24310
3,2410,2410,24520,24310
4,2351,2394,23510;23650;23610,23940


In [22]:
correspondence_dfs = read_correspondence()
inference_dfs = read_inference()

In [23]:
# std1 = IndustryStandard.NACE
# std2 = IndustryStandard.SSIC

# correspond_df = correspondence_dfs[std1]
# inference_df1 = inference_dfs[std1]
# inference_df2 = inference_dfs[std2]

In [24]:
def get_level(code) -> int:
    df = inference_dfs[code.std]
    
    if code.value in df.index:
        return df.loc[code.value, "Level"]

    return None

def info(code):
    df = inference_dfs[code.std].copy()
    
    if code.value in df.index:
        df = df[df["Level"] == get_level(code)]
        df = df[df.columns[~df.columns.isin(["Level", "Parent", "ISIC code"])]]
        
        string = ""
        
        for col in df.columns:
            text = df.loc[code.value, col]
            text = re.sub(r"\s*\(\d+\.\d+(?:\,\s*\d+\.\d+)*\)", "", text)
            
            string += f"{col}: {text}\n"
                        
        return string.strip()
        
    return None

c = IndustryCode(IndustryStandard.NACE, "03")
print(info(c))

Description: Fishing and aquaculture
Examples: This division includes capture fishery and aquaculture, covering the use of fishery resources from marine, brackish or freshwater environments, with the goal of capturing or gathering fish, crustaceans, molluscs and other marine organisms and products (e.g. aquatic plants, pearls, sponges etc). Also included are activities that are normally integrated in the process of production for own account (e.g. seeding oysters for pearl production). Service activities incidental to marine or freshwater fishery or aquaculture are included in the related fishing or aquaculture activities.
Exclusions: This division does not include building and repairing of ships and boats and sport or recreational fishing activities. Processing of fish, crustaceans or molluscs is excluded, whether at land-based plants or on factory ships.


### 2. Obtaining similarity scores for validation

In [25]:
# Loop through all company types
for i in range(len(roles)):
    # Role: either 'Donor', 'Intermediary' or 'Receiver'
    role = roles[i]
    
    # NACE column, example: 'Provider NACE code'
    nace_col = new_col(STANDARDS[0], role)
    
    # Iterate through all standards except NACE as it is to be compared with
    for std in NON_NACE_STDS:
        df = test_dfs[i]
        
        # Standard column, example for ISIC: 'Donor ISIC code'
        std_col = new_col(std, role)
        
        # Zip the NACE and standard columns to iterate through
        tuples = zip(df[nace_col], df[std_col])
        
        # Append the similarity score column of a certain standard to the DataFrame for a given role
        # df[similarity_col(std, role)] = [similarity_score(*a, std) for a in tuples]
        
        # Convert the similarity score column data type to 'float'
        # df[similarity_col(std, role)] = df[similarity_col(std, role)].astype(float)
    
    # List containing new order of columns for readability
    cols = [nace_col] + [f(std, role) for std in NON_NACE_STDS for f in (new_col,)]
    
    # Reorder columns for readability
    test_dfs[i] = test_dfs[i][cols]

In [26]:
test_dfs[1].head()

Unnamed: 0,Intermediary NACE code,Intermediary ISIC code,Intermediary WZ code,Intermediary SSIC code
28,150,150,1500,01412
30,382,382,382,382
31,610,610,6100,19201;09001
50,610,610,6100,19201;09001
56,1081,1072,10810,10720


In [27]:
role_index = 0
role = roles[role_index]

df = test_dfs[role_index]
cols = [new_col(IndustryStandard.NACE, role), new_col(IndustryStandard.WZ, role)]

df[cols]

Unnamed: 0,Donor NACE code,Donor WZ code
0,1920,19200
1,1920,19200
2,2410,24520
3,2410,24520
4,2351,23510;23650;23610
...,...,...
300,1081,10810;01140
301,1081,10810;01140
302,1081,10810;01140
303,261,2051;2052;2053;2059


In [28]:
import transformers
import torch

model_id = "QuantFactory/Meta-Llama-3-8B-GGUF-v2"

pipeline = transformers.pipeline(
    "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto"
)

pipeline("Hey how are you doing today?")



OSError: QuantFactory/Meta-Llama-3-8B-GGUF-v2 does not appear to have a file named config.json. Checkout 'https://huggingface.co/QuantFactory/Meta-Llama-3-8B-GGUF-v2/main' for available files.