In [1]:
# import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
import os
import sys

# Put the parent of the current working directory on the import path;
# presumably this is what lets the `utils` imports below resolve.
module_path = os.path.abspath(os.pardir)

if module_path not in sys.path:
    sys.path.append(module_path)

from utils.types import IndustryStandard, IndustryCode
from utils.inference import read_inference

In [3]:
# Mapping of lookup tables used by every helper below. It is indexed by
# standard (e.g. inference_dfs[IndustryStandard.SSIC]) and each table is read
# through its index and its "Level", "Parent" and "Description" columns.
inference_dfs = read_inference()

In [4]:
import spacy
from spacy import displacy
from spacy.language import Language

# Shared spaCy pipeline: match_nace() calls it on code descriptions and
# compares the resulting docs with Doc.similarity.
nlp = spacy.load("en_core_web_md")

# Add the function to the pipeline
# config = {"punct_chars": ["•", "\n", "-"]}
# nlp.add_pipe("sentencizer", config=config, before="parser")

In [5]:
# from sentence_transformers import SentenceTransformer

# Load a pretrained Sentence Transformer model
# model = SentenceTransformer("all-MiniLM-L6-v2")

In [6]:
def get_level(code) -> int:
    """Return the hierarchy level stored for ``code``.

    The code is looked up in the table of its own standard
    (``inference_dfs[code.std]``) and the "Level" column is read.

    Returns ``None`` when ``code.value`` is not present in the table's index.
    """
    table = inference_dfs[code.std]

    # Unknown code: nothing to report
    if code.value not in table.index:
        return None

    return table.loc[code.value, "Level"]

def get_description(code) -> str:
    """Return the text in the "Description" column for ``code``.

    The lookup is done in the table of the code's own standard
    (``inference_dfs[code.std]``).

    Returns ``None`` when ``code.value`` is not present in the table's index.
    """
    table = inference_dfs[code.std]

    # Unknown code: no description available
    if code.value not in table.index:
        return None

    return table.loc[code.value, "Description"]

def get_children(code):
    """List the direct children of ``code`` within its own standard.

    A child is any row whose "Parent" column equals ``code.value``. The empty
    string is accepted as a code value even though it is not in the index, so
    rows whose parent is "" can be listed as well.

    Returns a list of ``IndustryCode`` objects (possibly empty), or ``None``
    when ``code.value`` is neither "" nor present in the table's index.
    """
    table = inference_dfs[code.std]

    is_known = code.value == "" or code.value in table.index
    if not is_known:
        return None

    child_values = table.loc[table["Parent"] == code.value].index
    return [IndustryCode(code.std, child) for child in child_values]

def get_parent(code, level=-1):
    """Return the parent (or a higher ancestor) of ``code``.

    Parameters
    ----------
    code
        Code whose ancestor is wanted; looked up in ``inference_dfs[code.std]``.
    level
        ``-1`` (default) returns the immediate parent. Any other value keeps
        climbing while the current ancestor's "Level" is greater than
        ``level``, i.e. it returns the first ancestor at or above that level.

    Returns
    -------
    An ``IndustryCode`` of the same standard, or ``None`` when ``code.value``
    is not in the table's index. When the climb reaches a parent value that
    is itself not in the index (the root of the hierarchy), that value is
    returned as is instead of raising ``KeyError``.
    """
    df = inference_dfs[code.std]

    if code.value not in df.index:
        return None

    ancestor = df.loc[code.value, "Parent"]

    # The ancestor's level is only needed when climbing; looking it up for the
    # default case would fail for top-level codes, whose parent has no row.
    if level != -1:
        while ancestor in df.index and df.loc[ancestor, "Level"] > level:
            ancestor = df.loc[ancestor, "Parent"]

    return IndustryCode(code.std, ancestor)

# Evaluate the highest common level (HCL)
def get_common_parent(code, std):
    """Find the closest ancestor of ``code`` that also exists in ``std``.

    Starting from the immediate parent of ``code`` (in the code's own
    standard), ancestors are climbed until one is found whose value is
    present in the index of ``inference_dfs[std]``.

    Parameters
    ----------
    code
        Source code; its own standard is ``code.std``.
    std
        Target standard in which the ancestor must exist.

    Returns
    -------
    ``IndustryCode(std, value)`` for the shared ancestor, or ``None`` when
    ``code`` is unknown, is a level-1 code, or no ancestor up to level 1 is
    present in the target standard.
    """
    to_df = inference_dfs[std]
    level = get_level(code)

    # Unknown codes and top-level codes have no ancestor to share
    if level is None or level == 1:
        return None

    parent = get_parent(code)

    while parent is not None and parent.value not in to_df.index:
        # Never climb past level 1: there is nothing above it to look up
        parent_level = get_level(parent)
        if parent_level is None or parent_level == 1:
            return None

        parent = get_parent(parent)

    if parent is None:
        return None

    return IndustryCode(std, parent.value)

In [7]:
# Sanity check of the hierarchy helpers on a sample SSIC code.
# c2 is defined here but not referenced again in this cell.
c1 = IndustryCode(IndustryStandard.SSIC, "0200")
c2 = IndustryCode(IndustryStandard.SSIC, "020")

# Immediate SSIC parent of c1, and the closest ancestor of c1 whose value
# also exists in the NACE table
parent = get_parent(c1)
common_parent = get_common_parent(c1, IndustryStandard.NACE)

print(f"SSIC parent: {parent.value}, NACE common parent: {common_parent.value}")
# [ch.value for ch in get_children(parent)]

SSIC parent: 020, NACE common parent: 02


In [8]:
# Short aliases for the two tables used by the matching cells below.
# These are the same objects stored in inference_dfs, not copies.
SSIC_df, NACE_df = (
    inference_dfs[IndustryStandard.SSIC],
    inference_dfs[IndustryStandard.NACE],
)

In [9]:
# SSIC code value -> matched NACE code value; filled by the loop at the end
# of this cell and read back by match_nace() for deepest-level codes.
NACE_codes = {}

def match_nace(ssic_code):
    """Pick the NACE code value that best corresponds to ``ssic_code``.

    Strategy, in order:

    1. Level-1 codes are returned unchanged (they are the same across all
       standards). The code value is also printed.
    2. Codes on the deepest SSIC level inherit the match of their SSIC
       parent. The parent's match is read from ``NACE_codes`` when already
       stored, otherwise it is computed on the spot.
    3. Otherwise the closest ancestor that also exists in NACE is located
       and one of its NACE children is chosen: the only child if there is
       exactly one, the child with the identical code value if present, or
       else the child whose description is most similar to the SSIC
       description (spaCy ``Doc.similarity`` on lower-cased text).

    Returns
    -------
    The matched NACE code value. ``None`` when the SSIC code is unknown or
    has no common ancestor in NACE. When the common ancestor has no NACE
    children, the ancestor's own value is returned.
    """
    level = get_level(ssic_code)

    # Unknown code: nothing to match against
    if level is None:
        return None

    # Note: Level 1 codes are the same across all standards
    if level == 1:
        print(ssic_code.value)
        return ssic_code.value

    # If the SSIC code is on the deepest level, reuse the NACE match of its
    # parent (Series.max avoids a pure-Python scan of the column per call)
    if level == SSIC_df["Level"].max():
        ssic_parent = get_parent(ssic_code)

        if ssic_parent.value in NACE_codes:
            return NACE_codes[ssic_parent.value]

        # Parent not matched yet (e.g. called out of index order)
        return match_nace(ssic_parent)

    # Retrieve the closest ancestor shared with NACE
    common_parent = get_common_parent(ssic_code, IndustryStandard.NACE)

    if common_parent is None:
        return None

    # Obtain all children of the NACE parent code
    nace_children = get_children(common_parent)

    # A NACE leaf has no children to choose from: the ancestor is the match
    if not nace_children:
        return common_parent.value

    if len(nace_children) == 1:
        return nace_children[0].value

    # Prefer an identical code value over any text comparison
    for nace_child in nace_children:
        if nace_child.value == ssic_code.value:
            return nace_child.value

    # Create a spaCy doc from the description of the SSIC code
    ssic_desc_doc = nlp(get_description(ssic_code).lower())

    # Similarity between the SSIC description and the description of each
    # child of the NACE parent code
    similarities = np.array([
        ssic_desc_doc.similarity(nlp(get_description(nace_child).lower()))
        for nace_child in nace_children
    ])

    return nace_children[np.argmax(similarities)].value

# Match every SSIC code in index order. Results are stored one at a time
# (rather than built in a single expression) because match_nace() looks up
# the already-stored match of a code's parent in NACE_codes.
for v in SSIC_df.index:
    c = IndustryCode(IndustryStandard.SSIC, v)
    NACE_codes[v] = match_nace(c)

# c = IndustryCode(IndustryStandard.SSIC, "0200")
# match_nace(c)

A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U


In [10]:
# import ollama

# stream = ollama.chat(
#     model='llama3',
#     messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
#     stream=True,
# )

# for chunk in stream:
#   print(chunk['message']['content'], end='', flush=True)

In [11]:
# Attach the matches as a new column; codes without an entry in NACE_codes
# get None. This modifies SSIC_df in place.
SSIC_df["NACE code"] = SSIC_df.index.map(NACE_codes.get)

In [12]:
# Spot-check the matches: label slice of the code index from "02" to "03"
SSIC_df.loc["02": "03"]

Unnamed: 0_level_0,Level,Parent,Description,Definition,Examples,NACE code
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,2,A,"FORESTRY, LOGGING AND RELATED SERVICE ACTIVITIES",,,2
20,3,02,"FORESTRY, LOGGING AND RELATED SERVICE ACTIVITIES",,,21
200,4,020,"Forestry, Logging and Related Service Activities",,,21
2000,5,0200,"Forestry, logging and related service activities","This Sub-class includes the extraction, gather...",,21
3,2,A,"FISHING, OPERATION OF FISH HATCHERIES AND FISH...",,,3


In [13]:
# Write only the matched NACE column (the SSIC code index is written
# alongside it), creating the export directory if it does not exist yet
filepath = Path("../exports/SSIC_new.csv")
filepath.parent.mkdir(parents=True, exist_ok=True)

SSIC_df.loc[:, ["NACE code"]].to_csv(filepath)

In [14]:
# Calculate embeddings by calling model.encode()
# embeddings = model.encode(sentences)
# print(embeddings.shape)
# [3, 384]

# from sklearn.metrics.pairwise import cosine_similarity

# similarities = cosine_similarity(embeddings, embeddings)
# print(similarities)

In [15]:
# def match_codes(std, other):
    # Break up paragraphs
    
    
    # Sentence segmentation
    
    
    # Tokenisation
    
    
    # Word embeddings
    
    
    # Cosine similarity
    

In [16]:
# for std, df in std_dfs.items():
    # Create new DataFrame with columns that are the same as the ones in the standard's columns
    # inference_df = pd.DataFrame(columns=df.columns)
    
    # for other in stds:
        # if other != std:
            # inference_df[f"{other.value} code"] = match_codes(std, other)
    
    # Add the DataFrame to the inference DataFrame dictionary
    # inference_dfs[std] = inference_df