# One-off operations to run when the SSIC Code Excel file changes

## README

Run this notebook only when the SSIC Code Excel file or the configuration changes.

### Data Preparation

All data should be stored in Google Cloud Storage (GCS).

- data/ssic2020-detailed-definitions.xlsx
  - Please modify config files for the following:
      - Which row the headers are located
      - The SSIC Code column name
      - The SSIC Description column name
      - The SSIC Definitions column name
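  - The data layout must stay the same, i.e. all SSIC codes are in one column regardless of their level. The code below treats 4-character codes as level 2 (L2) and 5-character codes as level 3 (L3); both must be present, and each L2 row must appear before its L3 rows.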

- Check the `config` variable below, then save it by running all the cells
  - project_id: GCP project ID
  - bucket name
  - model names
  
- Config is saved as config.yaml in the root folder of the bucket in GCS

In [1]:
!pip install transformers langchain

Collecting langchain
  Downloading langchain-0.1.0-py3-none-any.whl (797 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.0/798.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.9 (from langchain)
  Downloading langchain_community-0.0.11-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2,>=0.1.7 (from langchain)
  Downloading langchain_core-0.1.9-py3-none-any.whl (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.5/216.5 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.1.0,>=0.0.77 (from langchain)
  Downloading langsmith-0

# Embedding Function

In [2]:
import warnings
# Silences every Python warning for the rest of the session (keeps cell output short,
# but also hides deprecation and pandas warnings raised by the cells below).
warnings.filterwarnings("ignore")

In [3]:
from google.colab import auth
auth.authenticate_user()  # Authenticate with Google Cloud

from google.cloud import storage
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm
import os
import tempfile
import math

class TextEmbedder:
    """Create text embeddings and persist models/embeddings in Google Cloud Storage.

    Two back ends are supported:
      * ``"textembedding-gecko"``: the Vertex AI ``TextEmbeddingModel``.
      * any other ``model_name``: a Hugging Face ``AutoModel``/``AutoTokenizer`` pair,
        either pulled from the hub (and then mirrored to GCS) or restored from GCS.

    After ``encode_text`` the result is available as ``self.embeddings``.
    """

    # Request size used for the Vertex AI model; ``encode_text`` ignores the
    # caller's ``batch_size`` for that model and always uses this value.
    GECKO_BATCH_SIZE = 3

    def __init__(self, model_name, project_id, bucket_name, from_local = False, model_location = None, tokenizer_location = None):
        """
        args:
            model_name (str): "textembedding-gecko" or a Hugging Face model identifier
            project_id (str): GCP project used for the storage client
            bucket_name (str): GCS bucket holding model files and embeddings
            from_local (bool): if True, restore model/tokenizer from GCS instead of the hub
            model_location (str): GCS prefix of the model files
            tokenizer_location (str): GCS prefix of the tokenizer files
        """
        self.project_id = project_id
        self.model_location = model_location
        self.tokenizer_location = tokenizer_location
        self.bucket_name = bucket_name
        self.model_name = model_name

        if model_name == "textembedding-gecko":
            from vertexai.preview.language_models import TextEmbeddingModel
            self.model = TextEmbeddingModel.from_pretrained(model_name)
        else:
            # Both branches below talk to GCS, so the client must exist before
            # either runs (previously it was only created when from_local=False).
            self.client = storage.Client(project=self.project_id)

            if from_local:
                self.download_directory_from_gcs(model_location, './model')
                self.download_directory_from_gcs(tokenizer_location, './tokenizer')
                self.model = AutoModel.from_pretrained('./model')
                self.tokenizer = AutoTokenizer.from_pretrained('./tokenizer')
            else:
                # Load model
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModel.from_pretrained(self.model_name)

                # Save the model and tokenizer locally, then mirror them to GCS
                self.model.save_pretrained('./model')
                self.tokenizer.save_pretrained('./tokenizer')
                self.upload_directory_to_gcs('./model', model_location)
                self.upload_directory_to_gcs('./tokenizer', tokenizer_location)

    def _gcs_client(self):
        """Return the storage client, creating and caching it on first use."""
        client = getattr(self, 'client', None)
        if client is None:
            client = storage.Client(project=self.project_id)
            self.client = client
        return client

    def download_directory_from_gcs(self, source_directory_name, destination_directory_name):
        """Downloads every blob under a GCS prefix into a local directory.

        File names are kept relative to the prefix (``model/config.json`` ->
        ``<destination>/config.json``), which mirrors the layout written by
        ``upload_directory_to_gcs``.
        """
        bucket = self._gcs_client().get_bucket(self.bucket_name)
        # A trailing slash stops the prefix "model" from also matching "model_old/...".
        prefix = source_directory_name.rstrip('/') + '/'
        os.makedirs(destination_directory_name, exist_ok=True)

        for blob in bucket.list_blobs(prefix=prefix):
            relative_name = blob.name[len(prefix):]
            if not relative_name or relative_name.endswith('/'):
                # Directory placeholder object; nothing to download.
                continue
            local_path = os.path.join(destination_directory_name, *relative_name.split('/'))
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            blob.download_to_filename(local_path)

    def upload_directory_to_gcs(self, source_directory_name, destination_blob_name):
        """Uploads a local directory (including sub-directories) under a GCS prefix."""
        bucket = self._gcs_client().get_bucket(self.bucket_name)
        prefix = destination_blob_name.rstrip('/')
        for root, _dirs, files in os.walk(source_directory_name):
            relative_root = os.path.relpath(root, source_directory_name)
            sub_parts = [] if relative_root == os.curdir else relative_root.split(os.sep)
            for filename in files:
                # Blob names always use "/" regardless of the local path separator.
                blob_name = '/'.join(part for part in [prefix] + sub_parts + [filename] if part)
                bucket.blob(blob_name).upload_from_filename(os.path.join(root, filename))

    def save_model(self):
        """Save the current model and tokenizer to GCS.

        Files are written to ``model_location`` and ``tokenizer_location``, i.e. the
        same prefixes ``__init__`` reads from when ``from_local=True``. Requires a
        model and tokenizer that provide ``save_pretrained``.
        """
        # Stage the files in a temporary directory that is removed afterwards
        with tempfile.TemporaryDirectory() as temp_dir:
            local_model_path = os.path.join(temp_dir, "model")
            local_tokenizer_path = os.path.join(temp_dir, "tokenizer")

            # save_pretrained writes a directory of files, so upload file by file
            self.model.save_pretrained(local_model_path)
            self.tokenizer.save_pretrained(local_tokenizer_path)

            self.upload_directory_to_gcs(local_model_path, self.model_location)
            self.upload_directory_to_gcs(local_tokenizer_path, self.tokenizer_location)

    def _average_pool(self, last_hidden_states, attention_mask):
        """Mean of the hidden states over positions where ``attention_mask`` is set."""
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def encode_text(self, input_texts, batch_size):
        """Embed ``input_texts`` and store the result in ``self.embeddings``.

        args:
            input_texts (list): texts to embed
            batch_size (int): texts per forward pass; ignored for the Vertex AI
                model, which always uses ``GECKO_BATCH_SIZE``
        """
        if self.model_name == "textembedding-gecko":
            # Note only 250 items, and limited tokens
            batch_size = self.GECKO_BATCH_SIZE
            batches = []
            for start in range(0, len(input_texts), batch_size):
                response = self.model.get_embeddings(input_texts[start:start + batch_size])
                batches.append(torch.stack([torch.tensor(item.values) for item in response]))
            # Concatenate once at the end instead of re-allocating on every batch
            self.embeddings = torch.cat(batches, dim=0) if batches else torch.Tensor([])

        else:
            total_texts = len(input_texts)
            chunks = []
            # Never ask the tokenizer for more tokens than it reports as its maximum.
            max_length = min(1024, getattr(self.tokenizer, 'model_max_length', 1024))

            with tqdm(total=total_texts, desc="Embedding Progress") as pbar:
                with torch.no_grad():
                    for start in range(0, total_texts, batch_size):
                        batch_texts = input_texts[start:start + batch_size]
                        batch_dict = self.tokenizer(batch_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
                        input_chunk = batch_dict['input_ids']
                        mask_chunk = batch_dict['attention_mask']
                        outputs = self.model(input_ids=input_chunk, attention_mask=mask_chunk)
                        chunks.append(self._average_pool(outputs.last_hidden_state, mask_chunk))
                        # The last batch may be shorter than batch_size
                        pbar.update(len(batch_texts))

            # Concatenate embeddings from all batches (empty input -> empty tensor)
            self.embeddings = torch.cat(chunks, dim=0) if chunks else torch.Tensor([])

    def save_embeddings(self, bucket_name, file_name):
        """Save the embeddings to a Google Cloud Storage bucket.

        The payload is the array's raw bytes (``ndarray.tobytes``), not the ``.npy``
        container format, so a reader has to supply dtype and shape itself.
        """
        # Convert the embeddings to a NumPy array
        embeddings_np = self.embeddings.detach().cpu().numpy()

        # Convert the NumPy array to bytes
        embeddings_bytes = embeddings_np.tobytes()

        # Get the bucket
        bucket = self._gcs_client().get_bucket(bucket_name)

        # Create a new blob and upload the embeddings
        blob = storage.Blob(file_name, bucket)
        blob.upload_from_string(embeddings_bytes)

        print(f'Embeddings ({embeddings_np.shape}) uploaded to gs://{bucket_name}/{file_name}')

# Fileloading Function

Only for connecting to Google Cloud Storage!

In [4]:
from google.colab import auth
auth.authenticate_user()  # Authenticate with Google Cloud

from google.cloud import storage
import pandas as pd
import io
import json
import yaml

## File loading
class FileLoader:
    """Read and write JSON, Excel, CSV and YAML objects in a single GCS bucket."""

    def __init__(self, bucket_name):
        """
        args:
            bucket_name (str): name of the GCS bucket all paths are relative to
        """
        # Authenticate with Google Cloud
        auth.authenticate_user()

        self.bucket_name = bucket_name

        # Initialize a client to access Google Cloud Storage
        storage_client = storage.Client()

        # Keep a handle to the bucket; blobs are created per call
        self.bucket = storage_client.bucket(bucket_name)

    def save_json_in_gcs(self, data, file_path):
        """Serialise ``data`` with ``json.dumps`` and upload it to ``file_path``."""
        blob = self.bucket.blob(file_path)

        # Convert the data to a JSON string
        json_data = json.dumps(data)

        # Upload the JSON data to the blob
        blob.upload_from_string(json_data)

    def load_json_from_gcs(self, file_path):
        """Download ``file_path`` and return the parsed JSON content."""
        blob = self.bucket.blob(file_path)

        # Download the JSON data from the blob
        json_data = blob.download_as_text()

        # Parse the JSON data into a Python object
        return json.loads(json_data)

    def load_df_from_gcs(self, file_path, header = 0):
        """Download an Excel file from ``file_path`` and return it as a DataFrame.

        args:
            file_path (str): blob path of the Excel file
            header (int): row index of the header, passed to ``pd.read_excel``
        """
        blob = self.bucket.blob(file_path)

        # Excel files are binary, so fetch the raw bytes directly instead of
        # decoding them to text and encoding them back.
        content_bytes = blob.download_as_bytes()

        # Create a DataFrame from the Excel content
        return pd.read_excel(io.BytesIO(content_bytes), header = header)

    def save_df_from_gcs(self, df, file_path):
        """Upload ``df`` as CSV (without the index) to ``file_path``."""
        # Convert DataFrame to CSV string
        csv_string = df.to_csv(index=False)

        # The CSV is held in memory, so upload it as a string; upload_from_filename
        # would treat the CSV text as a path on disk.
        blob = self.bucket.blob(file_path)
        blob.upload_from_string(csv_string, content_type='text/csv')

    def save_dict_to_yaml_gcs(self, file_name, data):
        """Dump ``data`` as block-style YAML and upload it to ``file_name``."""
        # Convert dictionary to YAML format
        yaml_data = yaml.dump(data, default_flow_style=False)

        # Create blob (file) in the bucket
        blob = self.bucket.blob(file_name)

        # Upload YAML data to the blob
        blob.upload_from_string(yaml_data)

    def read_yaml_from_gcs(self, file_name):
        """Download ``file_name`` and return the YAML content parsed with ``yaml.safe_load``."""
        blob = self.bucket.blob(file_name)

        # Download the YAML content as bytes
        yaml_bytes = blob.download_as_bytes()

        # Load the YAML content as a dictionary
        return yaml.safe_load(yaml_bytes)

# Full Pipeline

In [5]:
import re

class SSICProcessor:
    def __init__(self, text_embedder, fileloader, config):
        """
        Class to create all artifacts for SSIC
        args:
            text_embedder (object): TextEmbedder class
            fileloader (object): FileLoader class
            config (dict): Configuration dictionary
        """
        self.text_embedder = text_embedder
        self.fileloader = fileloader
        self.config = config

        self.ssic_df = self.fileloader.load_df_from_gcs(config['dir']['ssic_df_fp'], config['columns']['header_row'])

        self.l2_l3_mapper()

        # Single prompt text that is embedded and stored as the "activity" embedding
        self.activities = [
        """Primary activities, segments, revenue stream
        Main operations, subsidiaries
        Principal Sectors
        """
        ]

    @staticmethod
    def _clean_codes(codes):
        """Return the code column as strings with spaces removed (missing values become '')."""
        return codes.fillna('').astype(str).str.replace(' ', '', regex=False)

    def remove_digits_and_bullet(self, text):
        """
        Remove digits and bullet from text
        args:
            text (str): text to process; missing values (None/NaN) yield ''
        """
        # Empty Excel cells arrive as None/NaN, which re.sub cannot handle
        if text is None or (isinstance(text, float) and text != text):
            return ''
        # Match digits and the '•' character and replace them with an empty string
        return re.sub(r'[0-9•]', '', str(text))

    def l2_l3_mapper(self):
        """Creates mapping from L2 titles (4-character codes) to their L3 titles (5-character codes).

        Relies on row order: every L3 row is attached to the closest preceding L2 row.
        """
        ssic_code_col = self.config['columns']['ssic_code']
        ssic_title_col = self.config['columns']['ssic_desc']

        l2_l3_mapping = {}
        current_key = None
        # Use the same space-stripped codes as construct_ssic_lists so both agree
        codes = self._clean_codes(self.ssic_df[ssic_code_col])

        for key, value in zip(codes, self.ssic_df[ssic_title_col]):
            if len(key) == 4:  # L2
                l2_l3_mapping[value] = []
                current_key = value
            elif len(key) == 5:  # L3
                if current_key is None:
                    raise ValueError(f"5-digit SSIC code {key!r} appears before any 4-digit code.")
                l2_l3_mapping[current_key].append(value)
        self.l2_l3_mapping = l2_l3_mapping

    def map_l2_l3(self, rel_l2):
        """Get the L3 titles that belong to the given L2 titles.

        args:
            rel_l2 (list): L2 titles (keys of ``self.l2_l3_mapping``)
        returns:
            list of L3 titles; empty when ``rel_l2`` is empty or None
        """
        rel_l3 = []
        for l2 in rel_l2 or []:
            rel_l3.extend(self.l2_l3_mapping[l2])
        return rel_l3

    def construct_ssic_lists(self):
        """Construct all json artifacts"""
        # Get columns
        ssic_code_col = self.config['columns']['ssic_code']
        ssic_title_col = self.config['columns']['ssic_desc']

        self.ssic_df[ssic_code_col] = self._clean_codes(self.ssic_df[ssic_code_col])
        df = self.ssic_df.copy()

        # Others filter condition. Literal matching: as a regex, the dots in
        # 'n.e.c' would match any character (e.g. "finance companies").
        titles_lower = df[ssic_title_col].str.lower()
        filter_cond = (
            titles_lower.str.contains('n.e.c', regex=False, na=False)
            | titles_lower.str.contains('other', regex=False, na=False)
        )

        # Get sections for L2, L3 and others
        code_length = df[ssic_code_col].str.len()
        is_l2 = code_length == 4
        is_l3 = code_length == 5
        ssic_l2_other = df[is_l2 & filter_cond].reset_index(drop=True)
        ssic_l3 = df[is_l3 & ~filter_cond].reset_index(drop=True)
        ssic_l3_other = df[is_l3 & filter_cond].reset_index(drop=True)

        # For L3 others, it includes both L2 others and L3 others.
        # dict.fromkeys de-duplicates while keeping a reproducible order.
        l3_others = list(dict.fromkeys(
            self.map_l2_l3(ssic_l2_other[ssic_title_col].tolist()) + ssic_l3_other[ssic_title_col].tolist()
        ))
        ssic_l3 = ssic_l3[~ssic_l3[ssic_title_col].isin(l3_others)]

        # Form lists
        self.l3_list = ssic_l3[ssic_title_col].tolist()
        self.l3_list_others = l3_others

        print("Total number of 5-digit SSIC Codes (non-others): ", len(self.l3_list))
        print("Total number of 5-digit SSIC Codes (others): ",len(self.l3_list_others))

        # Save lists (paths come from this instance's config, not a notebook global)
        self.fileloader.save_json_in_gcs(self.l3_list, self.config["dir"]["l3_list_fp"])
        self.fileloader.save_json_in_gcs(self.l3_list_others, self.config["dir"]["l3_list_others_fp"])

        print("Constructed SSIC Artifacts.")

    def generate_df_from_list(self, df, filter_list, column):
        """Return the rows of ``df`` whose ``column`` value is in ``filter_list``, in list order.

        args:
            df (DataFrame): source rows
            filter_list (list): wanted values; a value's first position defines the order
            column (str): column of ``df`` to match against ``filter_list``
        """
        # Position of the first occurrence of each value (same as list.index, but O(1) per lookup)
        positions = {}
        for position, item in enumerate(filter_list):
            positions.setdefault(item, position)

        # Filter DataFrame based on the list
        filtered_df = df[df[column].isin(filter_list)]

        # assign() works on a copy, so no write happens on a slice of the caller's frame
        return (
            filtered_df
            .assign(Order=filtered_df[column].map(positions))
            .sort_values(by='Order', kind='stable')
            .drop('Order', axis=1)
        )

    def construct_ssic_embeddings(self):
        """Construct all embedding artifacts.

        Requires ``construct_ssic_lists`` to have run (uses ``l3_list`` and ``l3_list_others``).
        """
        # Generate activity embeddings
        self.text_embedder.encode_text(self.activities, 1)
        self.text_embedder.save_embeddings(self.config['bucket_name'], self.config["dir"]["activity_embedding_fp"])

        # Generate L3 others by combining description, definitions and level 2
        df = self.ssic_df.copy()
        ssic_definitions_col = self.config['columns']['ssic_definitions']
        ssic_title_col = self.config['columns']['ssic_desc']

        # Clean df
        df[ssic_definitions_col] = df[ssic_definitions_col].apply(self.remove_digits_and_bullet)

        ## Construct l2_l3_df
        l2_l3_df = pd.DataFrame(list(self.l2_l3_mapping.items()), columns=['l2', 'l3']).explode('l3').reset_index(drop=True)
        l2_l3_df['l2_l3'] = l2_l3_df['l3'] + ', ' + l2_l3_df['l2']
        l2_l3_df = l2_l3_df.drop_duplicates('l3').reset_index(drop=True)

        # Merge dataframes to get L2, L3, Descriptions per entry
        # NOTE(review): the merge is on title only, so a non-L3 row with an identical
        # title can supply the definition - confirm this is acceptable for the data.
        ssic_df_l3 = df.merge(l2_l3_df, left_on = ssic_title_col, right_on='l3', how='right').drop_duplicates('l3').reset_index(drop=True)
        # "<l3>, <l2>, <definitions>, <title>", built column-wise by label
        ssic_df_l3['l3_desc'] = (
            ssic_df_l3['l2_l3'].astype(str)
            + ', ' + ssic_df_l3[ssic_definitions_col].astype(str)
            + ", " + ssic_df_l3[ssic_title_col].astype(str)
        )
        ssic_df_l3_others = self.generate_df_from_list(ssic_df_l3, self.l3_list_others, ssic_title_col)
        ssic_df_l3_nonothers = self.generate_df_from_list(ssic_df_l3, self.l3_list, ssic_title_col)

        # Embedding rows must line up one-to-one with the saved title lists
        assert len(ssic_df_l3_nonothers) == len(self.l3_list), f"Generate dataframe {len(ssic_df_l3_nonothers)} not of correct length {len(self.l3_list)}."
        assert len(ssic_df_l3_others) == len(self.l3_list_others), f"Generate dataframe (others) {len(ssic_df_l3_others)} not of correct length {len(self.l3_list_others)}"

        # Construct Embeddings
        print("Generating Embeddings for non-others descriptors...")
        self.text_embedder.encode_text(ssic_df_l3_nonothers['l3_desc'].tolist(), self.config["batch_size"])
        self.text_embedder.save_embeddings(self.config['bucket_name'], self.config["dir"]["l2_l3_embeddings_fp"])

        print("Generating Embeddings for Others descriptors...")
        self.text_embedder.encode_text(ssic_df_l3_others['l3_desc'].tolist(), self.config["batch_size"])
        self.text_embedder.save_embeddings(self.config['bucket_name'], self.config["dir"]["l2_l3_embeddings_others_fp"])

        print("Constructed Embeddings.")

In [6]:
# Pipeline configuration; the next cell stores it as config.yaml in the bucket.
config = dict(
    project_id="double-insight-395609",
    bucket_name="acra-ssic-classification",
    google_model_name="text-bison",
    # Alternative classifier: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    classifier_model_name="text-bison",
    summarizer_model_name="text-bison",
    # Alternative embedding model: "thenlper/gte-large"
    text_embedding_model_name="textembedding-gecko",
    evaluation_mode=True,
    batch_size=3,
    # Layout of the SSIC Excel sheet
    columns=dict(
        ssic_code="SSIC 2020",
        ssic_desc="SSIC 2020 Title",
        ssic_definitions="Detailed Definitions",
        header_row=4,
    ),
    # Object paths inside the bucket
    dir=dict(
        model_location="model",
        tokenizer_location="tokenizer",
        ssic_df_fp="data/ssic2020-detailed-definitions.xlsx",
        l3_list_fp="data/l3_list.json",
        l3_list_others_fp="data/l3_list_others.json",
        l2_l3_embeddings_fp="embeddings/l2_l3_embeddings.npy",
        l2_l3_embeddings_others_fp="embeddings/l2_l3_embeddings_others.npy",
        activity_embedding_fp="embeddings/activity_embedding.npy",
        fs_dir="data/fs",
        output_fp="data/output/output.csv",
        label_fp="data/Company SSIC Code.csv",
        evaluator_fp="data/output/evaluator.csv",
    ),
)

In [7]:
# Take the bucket name from config so the two values cannot drift apart
bucket_name = config["bucket_name"]
fileloader = FileLoader(bucket_name)

# Persist the configuration, then read it back so the rest of the notebook
# works with exactly what is stored in the bucket
fileloader.save_dict_to_yaml_gcs("config.yaml",config)
config = fileloader.read_yaml_from_gcs('config.yaml')
config

{'batch_size': 3,
 'bucket_name': 'acra-ssic-classification',
 'classifier_model_name': 'text-bison',
 'columns': {'header_row': 4,
  'ssic_code': 'SSIC 2020',
  'ssic_definitions': 'Detailed Definitions',
  'ssic_desc': 'SSIC 2020 Title'},
 'dir': {'activity_embedding_fp': 'embeddings/activity_embedding.npy',
  'evaluator_fp': 'data/output/evaluator.csv',
  'fs_dir': 'data/fs',
  'l2_l3_embeddings_fp': 'embeddings/l2_l3_embeddings.npy',
  'l2_l3_embeddings_others_fp': 'embeddings/l2_l3_embeddings_others.npy',
  'l3_list_fp': 'data/l3_list.json',
  'l3_list_others_fp': 'data/l3_list_others.json',
  'label_fp': 'data/Company SSIC Code.csv',
  'model_location': 'model',
  'output_fp': 'data/output/output.csv',
  'ssic_df_fp': 'data/ssic2020-detailed-definitions.xlsx',
  'tokenizer_location': 'tokenizer'},
 'evaluation_mode': True,
 'google_model_name': 'text-bison',
 'project_id': 'double-insight-395609',
 'summarizer_model_name': 'text-bison',
 'text_embedding_model_name': 'textembeddin

In [8]:
# Build the embedder from the stored configuration; from_local=False means the
# model is not restored from the bucket.
text_embedder = TextEmbedder(
    model_name=config["text_embedding_model_name"],
    project_id=config["project_id"],
    bucket_name=config["bucket_name"],
    from_local=False,
    model_location=config["dir"]["model_location"],
    tokenizer_location=config["dir"]["tokenizer_location"],
)

In [9]:
# Loads the SSIC Excel sheet and builds the L2 -> L3 title mapping
ssic = SSICProcessor(text_embedder, fileloader, config)
# Must run first: creates l3_list / l3_list_others, which the embedding step reads
ssic.construct_ssic_lists()
# Embeds the activity prompt and both L3 description lists and uploads them
ssic.construct_ssic_embeddings()

Total number of 5-digit SSIC Codes (non-others):  624
Total number of 5-digit SSIC Codes (others):  399
Constructed SSIC Artifacts.
Embeddings ((1, 768)) uploaded to gs://acra-ssic-classification/embeddings/activity_embedding.npy
Generating Embeddings for non-others descriptors...
Embeddings ((624, 768)) uploaded to gs://acra-ssic-classification/embeddings/l2_l3_embeddings.npy
Generating Embeddings for Others descriptors...
Embeddings ((399, 768)) uploaded to gs://acra-ssic-classification/embeddings/l2_l3_embeddings_others.npy
Constructed Embeddings.
