### 0. Utilities

#### Package imports

In [1]:
import os
import sys

# Absolute path of the parent of the current working directory; the
# `utils` imports in this cell are resolved after it is put on sys.path
module_path = os.path.abspath(os.pardir)

# Register the path only when it is missing, so re-running this cell
# does not append duplicates
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

from utils.industry import IndustryStandard
from utils.inference.industry.helpers import default_clean, insert_levels, insert_parents
from utils.inference.industry.io import export_inference_to_csv, load_raw_inference

### 1. Text cleaning

- The cell output below was cleared manually to remove the warnings produced while reading the spreadsheets.

In [2]:
# Load the raw tables. The cleaning cells below index the result first by
# IndustryStandard member and then by position (e.g. [0], [1]).
dfs = load_raw_inference()

#### Cleaning ISIC data

In [3]:
# Take the first raw ISIC table and key its rows by classification code
ISIC_df = dfs[IndustryStandard.ISIC][0].set_index("Code")

# Add the hierarchy columns: 'Level' first, then 'Parent'.
# Both helpers are called for their effect on the frame; their return
# values are not used.
insert_levels(ISIC_df)
insert_parents(ISIC_df)

# Apply the shared project clean-up helper
default_clean(ISIC_df)

# Replace the raw entry with the cleaned frame.
# NOTE(review): after this assignment `dfs[IndustryStandard.ISIC][0]` no
# longer refers to the raw table, so re-running this cell without first
# reloading `dfs` will likely fail - confirm, and consider keeping cleaned
# frames in a separate dict.
dfs[IndustryStandard.ISIC] = ISIC_df

In [4]:
# Preview the first rows of the cleaned ISIC frame
ISIC_df.head()

Unnamed: 0_level_0,Level,Parent,Description
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,,"Agriculture, forestry and fishing"
01,2,A,"Crop and animal production, hunting and relate..."
011,3,01,Growing of non-perennial crops
0111,4,011,"Growing of cereals (except rice), leguminous c..."
0112,4,011,Growing of rice


#### Cleaning NACE data

In [5]:
# Start from the first raw NACE table: turn NaN into empty strings, key the
# rows by 'Code', and shorten the verbose column headers
NACE_df = (
    dfs[IndustryStandard.NACE][0]
    .fillna("")
    .set_index("Code")
    .rename(columns={
        "This item includes": "Examples",
        "This item excludes": "Exclusions",
        "Reference to ISIC Rev. 4": "ISIC code",
    })
)

# Append "This item also includes" to 'Examples' (the renamed
# "This item includes" column), separated by a single space.
# NOTE(review): the separator is added even when the appended text is empty,
# which leaves a trailing space unless default_clean strips it - confirm.
NACE_df["Examples"] = NACE_df["Examples"] + (" " + NACE_df["This item also includes"])

# Discard the columns that are not kept, including the one just merged
NACE_df = NACE_df.drop(columns=["Order", "Rulings", "This item also includes"])

# Apply the shared project clean-up helper
default_clean(NACE_df)

# Replace the raw entry with the cleaned frame (re-running this cell
# afterwards requires reloading `dfs` first)
dfs[IndustryStandard.NACE] = NACE_df

In [6]:
# Preview the first rows of the cleaned NACE frame
NACE_df.head()

Unnamed: 0_level_0,Level,Parent,Description,Examples,Exclusions,ISIC code
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,1,,"AGRICULTURE, FORESTRY AND FISHING",This section includes the exploitation of vege...,,A
01,2,A,"Crop and animal production, hunting and relate...","This division includes two basic activities, n...",Agricultural activities exclude any subsequent...,01
01.1,3,01,Growing of non-perennial crops,This group includes the growing of non-perenni...,,011
01.11,4,01.1,"Growing of cereals (except rice), leguminous c...",This class includes all forms of growing of ce...,"This class excludes:\n- growing of rice, see 0...",0111
01.12,4,01.1,Growing of rice,This class includes:\n- growing of rice (inclu...,,0112


#### Cleaning WZ data

In [7]:
# Take the first raw WZ table, normalise the header names used by the other
# standards, and key the rows by 'Code'
WZ_df = (
    dfs[IndustryStandard.WZ][0]
    .rename(columns={"Code WZ 2008": "Code", "Title": "Description"})
    .set_index("Code")
)

# Add the 'Parent' column. No 'Level' column is computed in this cell; the
# preview shows one, so presumably the raw table already carries it.
insert_parents(WZ_df)

# Discard the column that is not kept
WZ_df = WZ_df.drop(columns=["Unit(s) of measure"])

# Apply the shared project clean-up helper
default_clean(WZ_df)

# Replace the raw entry with the cleaned frame (re-running this cell
# afterwards requires reloading `dfs` first)
dfs[IndustryStandard.WZ] = WZ_df

In [8]:
# Preview the first rows of the cleaned WZ frame
WZ_df.head()

Unnamed: 0_level_0,Level,Parent,Description
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,,"Agriculture, forestry and fishing"
01,2,A,"Crop and animal production, hunting and relate..."
01.1,3,01,Growing of non-perennial crops
01.11,4,01.1,"Growing of cereals (except rice), leguminous c..."
01.11.0,5,01.11,"Growing of cereals (except rice), leguminous c..."


#### Cleaning SSIC v1 data

In [9]:
# Take the first raw SSIC table, discard the ISIC part/title columns,
# normalise the remaining headers, and key the rows by 'Code'
SSIC_v1_df = (
    dfs[IndustryStandard.SSIC][0]
    .drop(columns=["ISIC Rev. 4 Part", "ISIC Rev. 4 Title"])
    .rename(columns={
        "SSIC 2020 Title": "Description",
        "SSIC 2020": "Code",
        "ISIC Rev. 4": "ISIC code",
    })
    .set_index("Code")
)

# Add the 'Level' column (no 'Parent' column is added for this table)
insert_levels(SSIC_v1_df)

# Apply the shared project clean-up helper.
# NOTE(review): this frame is not written back into `dfs` by this cell.
default_clean(SSIC_v1_df)

In [10]:
# Preview the first rows of the cleaned SSIC v1 frame
SSIC_v1_df.head()

Unnamed: 0_level_0,Level,Description,ISIC code
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1111,5,Growing of leafy and fruit vegetables,113
1112,5,Growing of mushrooms,113
1113,5,Growing of root crops,113
1119,5,Growing of food crops (non-hydroponics) n.e.c.,111
1119,5,Growing of food crops (non-hydroponics) n.e.c.,112


#### Cleaning SSIC v2 data

In [11]:
# Take the second raw SSIC table, discard the cross-reference columns, turn
# the '<Blank>' placeholder into an empty string, normalise the headers, and
# key the rows by 'Code'
SSIC_v2_df = (
    dfs[IndustryStandard.SSIC][1]
    .drop(columns=["Cross References", "Groups Classified Under this Code"])
    .replace("<Blank>", "")
    .rename(columns={
        "SSIC 2020 Title": "Description",
        "SSIC 2020": "Code",
        "Detailed Definitions": "Definition",
        "Examples of Activities Classified Under this Code": "Examples",
    })
    .set_index("Code")
)

# Add the hierarchy columns: 'Level' first, then 'Parent'
insert_levels(SSIC_v2_df)
insert_parents(SSIC_v2_df)

# Apply the shared project clean-up helper
default_clean(SSIC_v2_df)

# Replace the raw SSIC entry with the cleaned v2 frame. After this the
# positional lookups [0] / [1] used by the SSIC cells no longer reach the raw
# tables, so re-running them requires reloading `dfs` first.
dfs[IndustryStandard.SSIC] = SSIC_v2_df

In [12]:
# Preview the first rows of the cleaned SSIC v2 frame
SSIC_v2_df.head()

Unnamed: 0_level_0,Level,Parent,Description,Definition,Examples
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1,,AGRICULTURE AND FISHING,,
01,2,A,AGRICULTURE AND RELATED SERVICE ACTIVITIES,,
011,3,01,"GROWING OF CROPS, MARKET GARDENING AND HORTICU...",,
0111,4,011,Growing of Food Crops (Non-Hydroponics),,
01111,5,0111,Growing of leafy and fruit vegetables,This sub-class includes the cultivation of lea...,


### 2. Exporting to CSV

In [13]:
# Hand the mapping to the export helper. By this point the ISIC, NACE, WZ and
# SSIC entries hold the cleaned frames assigned in the cells above.
# NOTE(review): output location and file naming are decided inside the
# helper - confirm in utils.inference.industry.io.
export_inference_to_csv(dfs)