## Package imports

In [1]:
from enum import Enum
import pandas as pd

## Text cleaning

- Cleared the print area to remove warnings produced after reading from spreadsheets

### Helper functions

In [2]:
# Insert the list of levels
def insert_levels(df):
    """Prepend a "Level" column holding the string length of each index label.

    The dataframe is modified in place and nothing is returned. The index
    must contain strings, because the `.str` accessor is used on it.
    """
    df.insert(loc=0, column="Level", value=df.index.str.len())

# Insert the list of parents given an industry classification
def insert_parents(df):
    """Insert a "Parent" column deduced from each row's "Level".

    Rows are expected in hierarchical order: a code is followed by all of
    its descendants before its next sibling appears. The parent of a row is
    the closest preceding row whose level is strictly smaller; a row with no
    such ancestor (for example a level-1 row) gets an empty string.

    Args:
        df: Dataframe indexed by classification code and containing a
            "Level" column. It is modified in place: the "Parent" column is
            inserted at column position 1. Nothing is returned.
    """
    parents = []
    # Stack of (level, code) pairs for the ancestors of the current row
    ancestors = []

    # Iterate through the dataframe to deduce the parent
    for current_code, current_level in zip(df.index, df["Level"]):
        # Drop the previous sibling and anything deeper, so that the top of
        # the stack is always the nearest shallower row. Doing this for
        # every row (including rows on the same level as the previous one)
        # keeps the stack in sync with the row that was just visited.
        while ancestors and ancestors[-1][0] >= current_level:
            ancestors.pop()

        parents.append(ancestors[-1][1] if ancestors else "")
        ancestors.append((current_level, current_code))

    # Insert the list as a column
    df.insert(1, "Parent", parents)

# Usual string cleaning techniques
def default_clean(df):
    # Normalisation (lowercase strings)
    
    lower = lambda x: x.lower() if isinstance(x, str) else x
    
    ## If the dataframe has ISIC code data
    if "ISIC code" in df:
        if "Parent" in df:
            df.iloc[:, 2:-1] = df.iloc[:, 2:-1].map(lower)
        else:
            df.iloc[:, 1:-1] = df.iloc[:, 1:-1].map(lower)
        
    ## If the dataframe doesnt' have ISIC code data
    else:
        df.loc[:, "Description":] = df.loc[:, "Description":].map(lower)
    
    # Replace punctuation with empty strings
    # df.loc[:, "Description":] = df.loc[:, "Description":].replace(r"[^\w\s]+", " ", regex=True)
    
    # Replace newlines and multiple spaces with whitespaces
    df.replace([r"\n", r" +"], " ", regex=True, inplace=True)
    
    # Strip leading and trailing whitespaces
    df.loc[:, "Description":] = df.loc[:, "Description":].map(lambda x: x.strip() if isinstance(x, str) else x)
    
    # Replace 'and or' with 'or'
    df.replace(r"and/or", "or", regex=True, inplace=True)

### Cleaning ISIC data

In [3]:
# Load the ISIC table from CSV and use the 'Code' column as the index
ISIC_df = pd.read_csv("data/ISIC Rev. 4.csv").set_index("Code")

# Add the hierarchy columns ('Level' first, because 'Parent' is derived from it)
insert_levels(ISIC_df)
insert_parents(ISIC_df)

# Extra cleaning techniques
default_clean(ISIC_df)

ISIC_df.head()

Unnamed: 0_level_0,Level,Parent,Description
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,,"agriculture, forestry and fishing"
01,2,A,"crop and animal production, hunting and relate..."
011,3,01,growing of non-perennial crops
0111,4,011,"growing of cereals (except rice), leguminous c..."
0112,4,011,growing of rice


### Cleaning NACE data

In [4]:
# Load the NACE sheet, blank out NaN values, index by 'Code' and
# shorten the long column names
NACE_df = (
    pd.read_excel("data/NACE Rev. 2.xlsx")
    .fillna("")
    .set_index("Code")
    .rename(
        columns={
            "This item includes": "Examples",
            "This item excludes": "Exclusions",
            "Reference to ISIC Rev. 4": "ISIC code",
        }
    )
)

# Append the "This item also includes" text to "Examples" (space separated)
NACE_df["Examples"] += " " + NACE_df["This item also includes"]

# Drop the columns that are no longer needed
NACE_df = NACE_df.drop(columns=["Order", "Rulings", "This item also includes"])

# Extra cleaning techniques
default_clean(NACE_df)

NACE_df.head()

Unnamed: 0_level_0,Level,Parent,Description,Examples,Exclusions,ISIC code
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,1,,"agriculture, forestry and fishing",this section includes the exploitation of vege...,,A
01,2,A,"crop and animal production, hunting and relate...","this division includes two basic activities, n...",agricultural activities exclude any subsequent...,01
01.1,3,01,growing of non-perennial crops,this group includes the growing of non-perenni...,,011
01.11,4,01.1,"growing of cereals (except rice), leguminous c...",this class includes all forms of growing of ce...,"this class excludes: - growing of rice, see 01...",0111
01.12,4,01.1,growing of rice,this class includes: - growing of rice (includ...,,0112


### Cleaning WZ data

In [5]:
# Load the 'Content' sheet, rename the columns to the common names and
# use the 'Code' column as the index
WZ_df = (
    pd.read_excel("data/WZ Issue 2008.xls", sheet_name="Content")
    .rename(columns={"Code WZ 2008": "Code", "Title": "Description"})
    .set_index("Code")
)

# Add a 'Parent' column (relies on a 'Level' column being present in the sheet)
insert_parents(WZ_df)

# Remove the column that is not needed
WZ_df = WZ_df.drop(columns="Unit(s) of measure")

# Extra cleaning techniques
default_clean(WZ_df)

WZ_df.head()

Unnamed: 0_level_0,Level,Parent,Description
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,,"agriculture, forestry and fishing"
01,2,A,"crop and animal production, hunting and relate..."
01.1,3,01,growing of non-perennial crops
01.11,4,01.1,"growing of cereals (except rice), leguminous c..."
01.11.0,5,01.11,"growing of cereals (except rice), leguminous c..."


### Cleaning SSIC v1 data

In [6]:
# Load the SSIC v1 sheet, drop the unused ISIC columns, rename the
# remaining columns to the common names and index by 'Code'
SSIC_v1_df = (
    pd.read_excel("data/SSIC 2020 v1.xlsx")
    .drop(columns=["ISIC Rev. 4 Part", "ISIC Rev. 4 Title"])
    .rename(
        columns={
            "SSIC 2020 Title": "Description",
            "SSIC 2020": "Code",
            "ISIC Rev. 4": "ISIC code",
        }
    )
    .set_index("Code")
)

# Add a 'Level' column
insert_levels(SSIC_v1_df)

# Extra cleaning techniques
default_clean(SSIC_v1_df)

SSIC_v1_df.head()

Unnamed: 0_level_0,Level,Description,ISIC code
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1111,5,growing of leafy and fruit vegetables,113
1112,5,growing of mushrooms,113
1113,5,growing of root crops,113
1119,5,growing of food crops (non-hydroponics) n.e.c.,111
1119,5,growing of food crops (non-hydroponics) n.e.c.,112


### Cleaning SSIC v2 data

In [7]:
# Load the SSIC v2 sheet, drop the unused columns, turn '<Blank>' cells into
# empty strings, rename the columns to the common names and index by 'Code'
SSIC_v2_df = (
    pd.read_excel("data/SSIC 2020 v2.xlsx")
    .drop(columns=["Cross References", "Groups Classified Under this Code"])
    .replace(to_replace="<Blank>", value="")
    .rename(
        columns={
            "SSIC 2020 Title": "Description",
            "SSIC 2020": "Code",
            "Detailed Definitions": "Definition",
            "Examples of Activities Classified Under this Code": "Examples",
        }
    )
    .set_index("Code")
)

# Add the hierarchy columns ('Level' first, because 'Parent' is derived from it)
insert_levels(SSIC_v2_df)
insert_parents(SSIC_v2_df)

# Extra cleaning techniques
default_clean(SSIC_v2_df)

SSIC_v2_df.head()

Unnamed: 0_level_0,Level,Parent,Description,Definition,Examples
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,,agriculture and fishing,,
01,2,A,agriculture and related service activities,,
011,3,01,"growing of crops, market gardening and horticu...",,
0111,4,011,growing of food crops (non-hydroponics),,
01111,5,0111,growing of leafy and fruit vegetables,this sub-class includes the cultivation of lea...,


In [8]:
from pathlib import Path

# Make sure the export directory exists, then write the cleaned
# SSIC v2 table to CSV
filepath = Path("data/exports/SSIC 2020 v2.csv")
filepath.parent.mkdir(exist_ok=True, parents=True)

SSIC_v2_df.to_csv(filepath)

## Pre-processing

### Using spaCy

In [9]:
# import spacy
# from spacy import displacy
# from spacy.language import Language

# nlp = spacy.load("en_core_web_sm")

# # Add the function to the pipeline
# config = {"punct_chars": ['•', '\n']}
# nlp.add_pipe("sentencizer", config=config, before="parser")

In [10]:
# # TODO: Apply to range of columns - Everything after 'Parent' not including numerical data

# text = """This Sub-class includes: 
# •production, collection and distribution of steam and hot water for heating, power and other purposes
# •production and distribution of cooled air supply (e.g. centralised / district cooling)
# •production and distribution of chilled water for cooling purposes"
# """

# def token_filter(token):
#     return not (token.is_stop or token.is_punct)

# def doc_filter(doc):
#     for token in doc:
#         if token.text == "includes":
#             return False
    
#     return True

# def token_list(text):
#     doc = nlp(text)

#     sent_strs = [sent.text.strip().replace("•", "") for sent in doc.sents]
#     docs = [nlp(string) for string in sent_strs]
    
#     tokens = [[token for token in doc if token_filter(token)] for doc in docs if doc_filter(doc)]
#     return tokens

# def tokenise_series(df, column):
#     return pd.Series([token_list(text) for text in df[column]], index=df.index)

# def extract_noun_phrases(text):
#     doc = nlp(text)
    
#     noun_phrases = []
#     for sent in doc.sents:
#         for chunk in sent.noun_chunks:
#             # Filter out common stopwords and short phrases
#             if len(chunk) > 1 and not chunk.text.lower() in ["this", "these", "such"]:
#                 noun_phrases.append(chunk.text)
#     return noun_phrases

In [11]:
# class IndustryCodeType(Enum):
#     ISIC, NACE, WZ, SSIC = range(4)

# class IndustryCode:
#     def __init__(self, type, value):
#         self.type = type
#         self.value = value

# class Company:
#     def __init__(self, name, desc, code):
#         self.name = name
#         self.desc = desc
#         self.code = code