In [None]:
import pandas as pd
import numpy as np

In [2]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

from utils import *

In [3]:
inference_dfs = read_exports()

In [4]:
import spacy
from spacy import displacy
from spacy.language import Language

nlp = spacy.load("en_core_web_md")

# Add the function to the pipeline
# config = {"punct_chars": ["•", "\n", "-"]}
# nlp.add_pipe("sentencizer", config=config, before="parser")

In [5]:
# from sentence_transformers import SentenceTransformer

# Load a pretrained Sentence Transformer model
# model = SentenceTransformer("all-MiniLM-L6-v2")

In [31]:
def get_level(code) -> int:
    df = inference_dfs[code.std]
    
    if code.value in df.index:
        return df.loc[code.value, "Level"]

    return None

def get_description(code) -> str:
    df = inference_dfs[code.std]
    
    if code.value in df.index:
        return df.loc[code.value, "Description"]

    return None

def get_children(code):
    df = inference_dfs[code.std]
    
    if code.value == "" or code.value in df.index:
        return [IndustryCode(code.std, value) for value in df[df["Parent"] == code.value].index]

    return None

def get_parent(code, level=-1):
    df = inference_dfs[code.std]
    
    if code.value in df.index:        
        v = df.loc[code.value, "Parent"]
        l = df.loc[code.value, "Level"]
        
        # Level == -1 (immediate parent)
        while level != -1 and l > level:
            v = df.loc[v, "Parent"]
            l = df.loc[v, "Level"]
        
        return IndustryCode(code.std, v)

    return None

# Evaluate the highest common level (HCL)
def get_hcl_code(code, std):
    c = code
    df = inference_dfs[std]
    
    while c.value not in df.index:
        print(std.value + ": " + c.value)
        c = get_parent(c)
    
    print(std.value + ": " + c.value)
    
    return IndustryCode(std, c.value)

In [7]:
SSIC_df = inference_dfs[IndustryStandard.SSIC]
NACE_df = inference_dfs[IndustryStandard.NACE]

In [33]:
NACE_codes = dict()

def match_nace(ssic_code):
    # If the SSIC code is of the highest level,
    if get_level(ssic_code) == max(SSIC_df["Level"]):
        # Return the NACE equivalent code of its parent
        ssic_parent = get_parent(ssic_code)
        return NACE_codes[ssic_parent.value]
    
    # Retrieve the highest common level and the corresponding NACE code
    nace_code = get_hcl_code(ssic_code, IndustryStandard.NACE)
    hcl = get_level(nace_code)
    
    print("NACE HCL code: " + nace_code.value)
    
    # If the HCL is 1, return the code as it is
    # Note: Level 1 codes are the same across all standards
    if hcl == 1:
        print(nace_code.value)
        return nace_code.value
    
    # Obtain the NACE parent code for the same level
    nace_parent = get_parent(nace_code, hcl)
    
    # Obtain all children of the NACE parent code
    nace_children = get_children(nace_parent)
    
    for nace_child in nace_children:
        print(nace_child.value)
        
        if nace_child.value == ssic_code.value:
            return nace_child.value
    
    # Create a spaCy doc from the description of the SSIC code
    ssic_desc_doc = nlp(get_description(ssic_code).lower())
    
    # Calculate the cosine similarities between the SSIC code description
    # and that of the NACE parent code's children
    similarities = np.array([ssic_desc_doc.similarity(
        nlp(get_description(nace_child).lower())
    ) for nace_child in nace_children])
    
    # for i, score in enumerate(similarities):
        # nace_child = nace_children[i]
        # print(f"SSIC {ssic_code.value}: {get_description(ssic_code)} ; NACE {nace_child.value}: {get_description(nace_child)}; score={score}")
    
    return nace_children[np.argmax(similarities)].value

# for v in SSIC_df.index:
#     c = IndustryCode(IndustryStandard.SSIC, v)
#     NACE_codes[v] = match_nace(c)

c = IndustryCode(IndustryStandard.SSIC, "170")
match_nace(c)

NACE: 170
NACE: 17
NACE HCL code: 17
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


'17'

In [9]:
NACE_codes

{'A': 'A',
 '01': '01',
 '011': '011',
 '0111': '0111',
 '01111': '0111',
 '01112': '0111',
 '01113': '0111',
 '01119': '0111',
 '0112': '0112',
 '01120': '0112',
 '0113': '0113',
 '01130': '0113',
 '0114': '0114',
 '01141': '0114',
 '01142': '0114',
 '01149': '0114',
 '0119': '0119',
 '01190': '0119',
 '014': '014',
 '0141': '0141',
 '01411': '0141',
 '01412': '0141',
 '01413': '0141',
 '0142': '0142',
 '01421': '0142',
 '01422': '0142',
 '01423': '0142',
 '01424': '0142',
 '0149': '0149',
 '01491': '0149',
 '01492': '0149',
 '01493': '0149',
 '01494': '0149',
 '01499': '0149',
 '015': '015',
 '0150': '0150',
 '01500': '0150',
 '016': '016',
 '0160': '017',
 '01600': '017',
 '02': '02',
 '020': '01',
 '0200': '01',
 '02000': '01',
 '03': '03',
 '031': '031',
 '0310': '031',
 '03101': '031',
 '03102': '031',
 '03103': '031',
 '03109': '031',
 '032': '032',
 '0320': '031',
 '03201': '031',
 '03202': '031',
 '03209': '031',
 'B': 'B',
 '08': '08',
 '081': '081',
 '0810': '081',
 '08101':

In [15]:
SSIC_df[SSIC_df["Parent"] == "17"]

Unnamed: 0_level_0,Level,Parent,Description,Definition,Examples
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
170,3,17,MANUFACTURE OF PAPER AND PAPER PRODUCTS,,


In [10]:
# import ollama

# stream = ollama.chat(
#     model='llama3',
#     messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
#     stream=True,
# )

# for chunk in stream:
#   print(chunk['message']['content'], end='', flush=True)

In [11]:
SSIC_df["NACE code"] = SSIC_df.index.apply(lambda v: NACE_codes.get(v))

SSIC_df.head()

AttributeError: 'Index' object has no attribute 'apply'

In [None]:
filepath = Path("exports/SSIC_new.csv")  
filepath.parent.mkdir(parents=True, exist_ok=True)  

SSIC_df.to_csv(filepath)

In [None]:
# Calculate embeddings by calling model.encode()
# embeddings = model.encode(sentences)
# print(embeddings.shape)
# [3, 384]

# from sklearn.metrics.pairwise import cosine_similarity

# similarities = cosine_similarity(embeddings, embeddings)
# print(similarities)

In [None]:
# def match_codes(std, other):
    # Break up paragraphs
    
    
    # Sentence segmentation
    
    
    # Tokenisation
    
    
    # Word embeddings
    
    
    # Cosine similarity
    

In [None]:
# for std, df in std_dfs.items():
    # Create new DataFrame with columns that are the same as the ones in the standard's columns
    # inference_df = pd.DataFrame(columns=df.columns)
    
    # for other in stds:
        # if other != std:
            # inference_df[f"{other.value} code"] = match_codes(std, other)
    
    # Add the DataFrame to the inference DataFrame dictionary
    # inference_dfs[std] = inference_df