In [9]:
!pip uninstall tensorflow-probability -y
!pip install fastapi==0.100.1
!pip install gradio==4.5.0
!pip install PyMuPDF transformers[sentencepiece] langchain
!pip install typing-extensions==4.6.0

Collecting typing-extensions>=4.5.0 (from fastapi==0.100.1)
  Using cached typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.5.0
    Uninstalling typing_extensions-4.5.0:
      Successfully uninstalled typing_extensions-4.5.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires kaleido, which is not installed.[0m[31m
[0mSuccessfully installed typing-extensions-4.9.0
Collecting typing-extensions==4.6.0
  Downloading typing_extensions-4.6.0-py3-none-any.whl (30 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.9.0
    Uninstalling typing_extensions-4.9.0:
      Successfully uninstalled typing_extensions

In [16]:
import gradio

In [3]:
import warnings
warnings.filterwarnings("ignore")

# Function: Fileload

In [3]:
def keep_text_until_pattern(text, pattern):
    # Case-insensitive search for the pattern
    index = text.lower().find(pattern.lower())

    # Check if the pattern is found
    if index != -1:
        # Keep the text until the pattern
        kept_text = text[:index]
        return kept_text
    else:
        # Return the original text if the pattern is not found
        return text

In [4]:
from google.colab import auth
auth.authenticate_user()  # Authenticate with Google Cloud

from google.cloud import storage
import pandas as pd
import io
import json
import yaml
import fitz
import codecs
import re

## File loading
class FileLoader:
    def __init__(self, bucket_name):
        # Authenticate with Google Cloud
        auth.authenticate_user()

        self.bucket_name = bucket_name

        # Initialize a client to access Google Cloud Storage
        self.storage_client = storage.Client()

        # Get the bucket and blob (object)
        self.bucket = self.storage_client.bucket(bucket_name)

    def save_json_in_gcs(self, data, file_path):

        blob = self.bucket.blob(file_path)

        # Convert the list to a JSON string
        json_data = json.dumps(data)

        # Upload the JSON data to the blob
        blob.upload_from_string(json_data)


    def load_json_from_gcs(self, file_path):

        blob = self.bucket.blob(file_path)

        # Download the JSON data from the blob
        json_data = blob.download_as_text()

        # Parse the JSON data into a Python list
        data_list = json.loads(json_data)
        return data_list

    def load_df_from_gcs(self, file_path, header = 0):

        blob = self.bucket.blob(file_path)
        content = blob.download_as_text(encoding='latin1')

        # Convert the string content to bytes
        content_bytes = content.encode('latin1')

        # Create a DataFrame from the Excel content
        df = pd.read_excel(io.BytesIO(content_bytes), header = header)
        return df

    def load_csv_from_gcs(self, file_path):

        blob = self.bucket.blob(file_path)
        content = blob.download_as_text(encoding='latin1')

        # Convert the string content to bytes
        content_bytes = content.encode('latin1')

        # Create a DataFrame from the Excel content
        df = pd.read_csv(io.BytesIO(content_bytes))
        return df

    def save_df_to_gcs(self, df, file_path):

        # Convert DataFrame to CSV string
        csv_string = df.to_csv(index=False)

        # Upload the CSV string to the blob
        blob = self.bucket.blob(file_path)
        blob.upload_from_string(csv_string, 'text/csv')

    def read_pdf_from_gcs(self, file_name, exclude_strings):
        print(f"Processing: {file_name}")
        blob = self.bucket.blob(file_name)
        pdf_content = blob.download_as_bytes()

        doc = fitz.open(stream=pdf_content, filetype="pdf")
        concatenated_text = []
        important_text = ""
        length = 0
        num_pages = 0
        for page_num in range(doc.page_count):
            if page_num >=2:
                page = doc.load_page(page_num)
                page_text = page.get_text()
                num_pages += 1
                # Check if any of the exclude_strings are present on the page
                if (re.search(re.compile(r'note(s)?\s*to\s*(the)?\s*financial\s*statement(s)?', re.IGNORECASE), page_text)) and ("activit" in page_text.lower()):
                    if not re.compile(r'^content\(s\)[ \t\n\r]+$').findall(page_text):
                        page_text = keep_text_until_pattern(page_text, "significant accounting")
                        important_text = important_text + "/n/n/n"+  page_text
                        #length+=len(page_text)
                        continue
                if any(exclude_str in page_text.lower() for exclude_str in exclude_strings):
                    if "activit" not in page_text.lower():
                        continue  # Skip this page if any exclude_string is found
                concatenated_text.append(page_text)
        #print(f"Average length of a page is : {length/num_pages}")
        doc.close()
        if len(important_text) >= 8000:
            important_text = important_text[:8000]
        return '\n'.join(concatenated_text), important_text

    def read_yaml_from_gcs(self, file_name):
        blob = self.bucket.blob(file_name)

        # Download the YAML content as bytes
        yaml_bytes = blob.download_as_bytes()

        # Load the YAML content as a dictionary
        yaml_dict = yaml.safe_load(yaml_bytes)

        return yaml_dict

    def save_dict_to_yaml_gcs(self, file_name, data):
        # Convert dictionary to YAML format
        yaml_data = yaml.dump(data, default_flow_style=False)

        # Create blob (file) in the bucket
        blob = self.bucket.blob(file_name)

        # Upload YAML data to the blob
        blob.upload_from_string(yaml_data)

    def load_embeddings(self, source_blob_name, model):
        # Get the blob and download the embeddings
        blob = storage.Blob(source_blob_name, self.storage_client.get_bucket(self.bucket_name))
        embeddings_bytes = blob.download_as_bytes()

        # Convert the bytes back to a NumPy array
        embeddings_np = np.frombuffer(embeddings_bytes, dtype=np.float32)

        # Assuming that embeddings are in the float32 format, adjust the data type accordingly
        # embeddings_np = np.frombuffer(embeddings_bytes, dtype=np.float64)  # for float64

        # Convert the NumPy array to a PyTorch tensor
        tensor = torch.from_numpy(embeddings_np)
        if (len(tensor.shape) == 1) and (model == "textembedding-gecko"):
            tensor = tensor.reshape(-1, 768)
        elif len(tensor.shape) == 1:
            tensor = tensor.reshape((-1,1024))

        return tensor

    def list_files(self,directory_path):
        blobs = self.storage_client.list_blobs(self.bucket_name, prefix = directory_path)
        files = [blob.name for blob in blobs]
        return files

# Function: Encoder

In [5]:
from google.colab import auth
auth.authenticate_user()  # Authenticate with Google Cloud

from google.cloud import storage
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm
import os
import tempfile
import math
import time

class TextEmbedder:
    def __init__(self, model_name, project_id, bucket_name, from_local = False, model_location = None, tokenizer_location = None):
        self.project_id = project_id
        self.model_location = model_location
        self.tokenizer_location = tokenizer_location
        self.bucket_name = bucket_name
        self.client = storage.Client(project=self.project_id)
        self.model_name = model_name

        if model_name == "textembedding-gecko":
            from vertexai.preview.language_models import TextEmbeddingModel
            self.model = TextEmbeddingModel.from_pretrained(model_name)
        else:
            if from_local:
                self.download_directory_from_gcs(model_location, './model')
                self.download_directory_from_gcs(tokenizer_location, './tokenizer')
                self.model = AutoModel.from_pretrained('./model')
                self.tokenizer = AutoTokenizer.from_pretrained('./tokenizer')
                print("Embedding Model Loaded")
            else:
                # Load model

                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModel.from_pretrained(self.model_name)

                # Save the model and tokenizer locally
                self.model.save_pretrained('./model')
                self.tokenizer.save_pretrained('./tokenizer')
                self.upload_directory_to_gcs('./model', model_location)
                self.upload_directory_to_gcs('./tokenizer', tokenizer_location)

    def download_directory_from_gcs(self, source_directory_name, destination_directory_name):
        """Downloads a directory from the bucket."""
        # Get the GCS bucket
        bucket = self.client.get_bucket(self.bucket_name)

        # List all files in the specified folder
        blobs = bucket.list_blobs(prefix=source_directory_name)

        # Create a temporary folder in Colab
        os.makedirs(destination_directory_name, exist_ok=True)

        # Copy files from GCS to the temporary folder in Colab
        for blob in blobs:
            if not blob.name.endswith('/'):
                destination_blob_pathname = os.path.join(destination_directory_name, os.path.basename(blob.name))
                blob.download_to_filename(destination_blob_pathname)

    def upload_directory_to_gcs(self, source_directory_name, destination_blob_name):
        """Uploads a directory to the bucket."""
        bucket = self.client.get_bucket(self.bucket_name)
        for root, dirs, files in os.walk(source_directory_name):
            for filename in files:
                blob = bucket.blob(os.path.join(destination_blob_name, filename))
                blob.upload_from_filename(os.path.join(root, filename))

    def _average_pool(self, last_hidden_states, attention_mask):
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def encode_text(self, input_texts, batch_size):
        if self.model_name == "textembedding-gecko":
            # Note only 5 items, and limited tokens
            batch_size = 3
            self.embeddings = torch.Tensor([])
            num_batches = math.ceil(len(input_texts) / batch_size)
            for i in range(num_batches):
                batch_texts = input_texts[i*batch_size:min((i+1)*batch_size, len(input_texts))]
                embeddings = self.model.get_embeddings(batch_texts)
                self.embeddings = torch.cat((self.embeddings, torch.stack([torch.tensor(inner_list.values) for inner_list in embeddings])))
                time.sleep(2.0 + np.random.uniform(0.1, 1.00))
            return self.embeddings
        else:
            # Tokenize the input texts
            embeddings = []

            # Get the total number of input texts
            total_texts = len(input_texts)

            with tqdm(total=total_texts, desc="Embedding Progress") as pbar:
                with torch.no_grad():
                    for i in range(0, len(input_texts), batch_size):
                        batch_dict = self.tokenizer(input_texts[i:i+batch_size], max_length=1024, padding=True, truncation=True, return_tensors='pt')
                        input_chunk = batch_dict['input_ids']
                        mask_chunk = batch_dict['attention_mask']
                        outputs = self.model(input_ids=input_chunk, attention_mask=mask_chunk)
                        chunk_embeddings = self._average_pool(outputs.last_hidden_state, mask_chunk)
                        embeddings.append(chunk_embeddings)
                        pbar.update(batch_size)

            # Concatenate embeddings from all batches
            embeddings = torch.cat(embeddings, dim=0)
            self.embeddings = embeddings
            return embeddings

    def save_embeddings(self, bucket_name, file_name):
        """Save the embeddings to a Google Cloud Storage bucket."""

        # Convert the embeddings to a NumPy array
        embeddings_np = self.embeddings.detach().numpy()

        # Convert the NumPy array to bytes
        embeddings_bytes = embeddings_np.tobytes()

        # Initialize the Google Cloud Storage client
        storage_client = storage.Client(project=self.project_id)

        # Get the bucket
        bucket = storage_client.get_bucket(bucket_name)

        # Create a new blob and upload the embeddings
        blob = storage.Blob(file_name, bucket)
        blob.upload_from_string(embeddings_bytes)

        print(f'Embeddings uploaded to gs://{bucket_name}/{file_name}')

# Function: Sim Search

In [6]:
## Similarity search class
import numpy as np

class SimSearch:
    """Search for similar documents in a corpus."""
    def __init__(self, elbow_method = False, k=5, min_k = 3, max_k = 7):
        self.elbow_method = elbow_method
        self.k = k
        self.min_k = min_k
        self.max_k = max_k

    def sim_score(self, vec1, vec2):
        if len(vec1.shape) == 1:
            # If vec1 is a single vector, reshape it to (1, 1024)
            vec1 = vec1.reshape(1, -1)
        if len(vec2.shape) == 1:
            # If vec2 is a single vector, reshape it to (1, 1024)
            vec2 = vec2.reshape(1, -1)

        x, _ = vec1.shape
        y, _ = vec2.shape

        similarity_matrix = np.zeros((x, y))

        for i in range(x):
            for j in range(y):
                dot_product = np.dot(vec1[i], vec2[j])
                norm_vec1 = np.linalg.norm(vec1[i])
                norm_vec2 = np.linalg.norm(vec2[j])

                similarity_matrix[i][j] = dot_product / (norm_vec1 * norm_vec2)
        return similarity_matrix

    def best_k(self, sorted_scores):
        # Calculate differences between consecutive elements
        differences = [sorted_scores[i] - sorted_scores[i + 1] for i in range(len(sorted_scores) - 1)]

        # Find the index of the maximum difference
        sharp_drop_index = differences.index(max(differences))

        # Corresponding k value
        k_value = sharp_drop_index + 1
        return k_value

    def search_k_sim(self, vec1, vec2):
        # Calculate similarity scores
        similarity_matrix = self.sim_score(vec1, vec2).reshape(-1)
        sorted_scores = np.sort(-similarity_matrix)*-1.0
        if self.elbow_method:
            k_value = self.best_k(sorted_scores)
            k_value = max(min(self.max_k, k_value), self.min_k)
        else:
            k_value = self.k

        # Sort the similarity scores in descending order
        sorted_indices = np.argsort(-similarity_matrix)[:k_value]
        sorted_scores = sorted_scores[:k_value]

        return sorted_indices, sorted_scores

# Function: FS Summarizer

In [7]:
# User LLM for Summarization

import time
import vertexai
#from vertexai.preview.language_models import TextGenerationModel
from vertexai.preview.generative_models import GenerativeModel, ChatSession

from vertexai.preview.generative_models import (
    HarmCategory,
    HarmBlockThreshold )
from google.cloud.aiplatform_v1beta1.types.content import SafetySetting

from langchain.text_splitter import RecursiveCharacterTextSplitter
import numpy as np

class FsSummarizer:
    def __init__(self,
                 project_id,
                 model_name,
                 max_decode_steps,
                 temperature,
                 top_p,
                 top_k,
                 chunk_size,
                 chunk_overlap,
                 text_embedder,
                 activity_embedding,
                 k_sim,
                 location= "asia-southeast1",
                 ):
        self.project_id = project_id
        self.model_name = model_name
        self.temperature = temperature
        self.max_decode_steps = max_decode_steps
        self.top_p = top_p
        self.top_k = top_k
        vertexai.init(project=project_id, location=location)

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len
            )
        self.text_embedder = text_embedder
        self.sim_searcher = SimSearch(elbow_method = False, k = k_sim)
        self.activity_embedding = activity_embedding


    def chunk(self, text):
        chunks = self.text_splitter.split_text(text=text)
        return chunks

    def summarize(self, desc) :
        model = GenerativeModel("gemini-pro")
        #chat = model.start_chat()
        """Summarize using a Large Language Model."""

        prompt = f'''
        The description of the whole group of the main or parent company, the Company, extracted from its annual statement, which may include its subsidiaries and operations is as follows:
        ```
        {desc}
        ```

        Extract the main activity of the main or parent company (not the whole group, nor its subsidiaries), excluding any discontinued operations, joint ventures and subsidiaries, from the above description of the company.
        During the extraction, you must step by step think and strictly follow the following rules:
        - Parent / main company is the company which holds all the subsidiaries, joint ventures and operations.
        - Ignore all activities in the subsidiaries, joint ventures and discontinued operations. Do not output these activities.
        - If the main or parent company's primary activity is solely holding  or 'investment holding', the primary activity is a holding or 'investment holding'.
        - If the main or parent company's primary activity is holding company or 'investment holding' with provision of management services, the primary activity is a holding company/investment holding.
        - If there are other primary activities besides holding or 'investment holding' (other than provision of management services) for the main or parent company ( excluding all its subsidiaries and joint ventures), identify the other activities as primary activities
        - If multiple primary activities are stated, you must strictly choose only one major activity of the main/parent company (not its subsidiaries, nor the whole group). For example, for 'financing business and provision of corporate advisory services' is considered as two activities; Choose the dominant activity.
        - If the primary activity identified above is not holding or 'investment holding', explain succinctly this primary activity, with examples of goods and services if possible, in two short sentences.
        - If the primary activity identified above is holding or 'investment holding', do not provide examples or explanation.

        For example, if the primary activity of the main or parent company is both 'investment holding' and 'hotel management', only output the 'hotel management' portion.
        If the primary activity of the main or parent company is 'holding' and its subsidiary is 'mining', output 'holding company'.
        If the primary activity of the main or parent company is 'holding' and its joint venture is 'manufacturing', output 'holding company'.

        You must then look through all the activities extracted, and output the main primary activity (excluding subsidiaries and joint ventures) according to the description.
        Output concisely with elaboration. Your output must consider but not elaborate on the rules above.
        '''
        #model = TextGenerationModel.from_pretrained(self.model_name)
        """
        response = model.predict(
            prompt,
            temperature=self.temperature,
            max_output_tokens=self.max_decode_steps,
            top_k=self.top_k,
            top_p=self.top_p
            )
        """
        response = model.generate_content(
            prompt,
            generation_config={
                "max_output_tokens": 2048,
                "temperature": self.temperature,
                "top_p": self.top_p,
            }
        )
        safety_settings = [
            SafetySetting(
                category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                threshold=HarmBlockThreshold.BLOCK_NONE,
            )
        ]
        time.sleep(2.0 + np.random.uniform(0.1, 0.2))
        if response:
            try:
                return response.text.replace('main or parent', '').replace('main', '').replace('parent', '').replace('  ', ' ').strip()
            except:
                return desc

    def run_summarizer(self, text, important_text, batch_size):
        chunks = self.chunk(text)

        if len(important_text) < 2000:
            start_time = time.time()
            print('Embed Chunks...')
            chunk_embedding = self.text_embedder.encode_text(chunks, batch_size)
            print(f'...in {np.round(time.time()-start_time,3)}s')

            print("Searching relevant sections...")
            sorted_indices, _ = self.sim_searcher.search_k_sim(self.activity_embedding, chunk_embedding)

            print("Chunking financial statements...")
            relevant_chunks_list = [chunks[i] for i in sorted_indices]
            relevant_chunks = ' '.join(relevant_chunks_list)
            if len(important_text)>0:
                relevant_chunks = important_text + relevant_chunks
        else:
            relevant_chunks = important_text
        print("Calling LLM to summarize pdf...")
        summarized_desc = self.summarize(relevant_chunks)

        return relevant_chunks, summarized_desc

    def explain(self, desc, summary):
        model = GenerativeModel("gemini-pro")
        chat = model.start_chat()
        """Perform explanaiton of summary using a Large Language Model."""
        prompt = f'''
        The description of the company is as follows:
        ```
        {desc}
        ```

        The primary activity extracted is as follows: {summary}

        Extract the relevant sentences or paragraphs from the description of the company which is relevant to the primary activity extracted.
        Pick at least 3 sentences or paragraphs and as many as possible.
        '''

        """
        model = TextGenerationModel.from_pretrained(self.model_name)

        response = model.predict(
            prompt,
            temperature=self.temperature,
            max_output_tokens=self.max_decode_steps,
            top_k=self.top_k,
            top_p=self.top_p
            )
        """
        try:
            response = chat.send_message(prompt)
            time.sleep(2.0 + np.random.uniform(0.1, 0.2))
            return response.text
        except:
            return ''

# Function: Classification Using Prompt

In [8]:
## Using Palm2 for classification
import vertexai
#from vertexai.preview.language_models import TextGenerationModel
from vertexai.preview.generative_models import GenerativeModel, ChatSession

import re
from transformers import pipeline
import torch

class Classifier:
    def __init__(self, project_id, model_name, max_decode_steps, temperature, top_p, top_k, location= "asia-southeast1"):
        self.project_id = project_id
        self.model_name = model_name

        if self.model_name in ("text-bison", "text-bison-001"):
            self.temperature = temperature
            self.max_decode_steps = max_decode_steps
            self.top_p = top_p
            self.top_k = top_k
            vertexai.init(project=project_id, location=location)

        else:
            self.model = pipeline("zero-shot-classification", model=self.model_name)

    def identify_holding(self, description):
        model = GenerativeModel("gemini-pro")
        chat = model.start_chat()
        prompt = f'''
        This is a description of the company:
        ```
        {description}
        ```

        Is the primary, principle or main activities of the company is solely a holding or 'investment holding', without any other activities in non-subsidiaries operations (except provision of management services)? Evaluate based on all the details (not just one sentence) in the description.
        You must consider the following when answering the question:
        - Ignore all activities in the subsidiaries and discontinued operations.
        - Holding or 'investment holding' companies of a group of subsidiary corporations and whose principal activity is owning the group. The holding companies in this Sub-class generally do not provide any other service to the businesses in which the equity is held, i.e. they do not administer or manage other units.
        - Holding or 'investment holding' companies own other companies. Those owning or investing in other assets like properties are not holding companies.
        - The company is a holding only if the description involves holding or 'investment holding', and does not contain any other activities (except management services and investment trading).
        - If the main/parent company's description depicts solely a holding or 'investment holding' company with no other activities (except provision of management services and investment trading), the primary activity is a holding company/investment holding.
        - If there are other types of activities besides holding or 'investment holding' (other than provision of management services), the company is strictly not a holding company.
        - The following companies (not exhaustive list) are not holding or 'investment holding' companies even if one of their primary activities is also holding/holding investment companies (this list is not exhaustive):
            - Hotel or hotel management
            - Real estate
            - Bank
            - Funds
            - Finance companies
            - Property investment
        - For example, if the company's description is "The company is an investment holding company. The company is a manufacturing company", the company is not a holding company.

        Do not explain.
        '''
        """
        model = TextGenerationModel.from_pretrained(self.model_name)
        response = model.predict(
            prompt,
            temperature=self.temperature,
            max_output_tokens=self.max_decode_steps,
            top_k=self.top_k,
            top_p=self.top_p,)
        """
        response = chat.send_message(prompt)
        response = response.text.strip()
        response = response[:min(len(response), 3)].lower()

        if response=="yes":
            time.sleep(8.0 + np.random.uniform(0.1, 1.00))
            return "Other holding companies"

    def classify(self, description, categories, categories_desc= '', others_condition = '') :
        """Predict using a Large Language Model."""
        categories = [item for item in categories if 'holding' not in item.lower()]
        model = GenerativeModel("gemini-pro")
        #chat = model.start_chat()
        if self.model_name in ("text-bison", "text-bison-001", "gemini-pro"):
            all_cat = "\n".join(categories)

            prompt = f'''
            This is a description of the company:
            ```
            {description}
            ```
            Classify the primary activity of the company into one of the following `category`, each separated by a line break:
            ```
            {all_cat}
            ```

            You must consider the definition of the category when making the classification:
            ```
            {categories_desc}
            ```

            You must follow these rules when making the classification:
            - You must pick the most dominant primary activity of the company for classification and rely on the definition of the category to make the classification
            - Ignore all information about the company's subsidiaries and discontinued operations in the description.
            - If the primary activity of the company focuses on tradinig or sales, and not focused on manufactuing, then the output `category` must not be related to manufacturing or milling.
            - If the company's primary activity is not only a holding company or not only an investment holding, strictly do not choose any options regarding holding companies, or Bank/Financial holding companies.
            {others_condition}
            - Double check to ensure that the output is in the `category` above.
            Strictly output one `category` from above, without any explanation or preambles.
            '''
            """
            model = TextGenerationModel.from_pretrained(self.model_name)
            response = model.predict(
                prompt,
                temperature=self.temperature,
                max_output_tokens=self.max_decode_steps,
                top_k=self.top_k,
                top_p=self.top_p
                )
            """
            #response = chat.send_message(prompt)
            try:
                response = model.generate_content(
                    prompt,
                    generation_config={
                        "temperature": self.temperature,
                        "top_p": self.top_p,
                    }
                )
                safety_settings = [
                    SafetySetting(
                        category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                        threshold=HarmBlockThreshold.BLOCK_NONE,
                    )
                ]
                time.sleep(10.0 + np.random.uniform(0.1, 1.00))
                return response.text.strip()
            except:
                return ""
        else:

            output = self.model(description, categories, multi_label=False)
            return output['labels'][np.argmax(output['scores'])], np.max(output['scores'])


    def compare(self, description, pred, pred_others, cat_desc):
        """Predict using a Large Language Model."""
        if self.model_name in ("text-bison", "text-bison-001", "gemini-pro"):
            if ('n.e.c' in pred_others.lower()) or ('not elsewhere classified' in pred_others.lower()):
                prompt = f'''This is a description of the company:
                ```
                {description}
                ```

                Determine if the following is a good label for the primary activity of the company: {pred}
                You must follow these rules when seleting the choices:
                - If the company's primary activity is not only a holding company or not only an investment holding, the the labels regarding holding companies, or bank/holding company is not suitable.

                Output Yes or No only, without preamble or explanation.
                '''
                model = GenerativeModel("gemini-pro")
                chat = model.start_chat()
                response = chat.send_message(prompt)

                """
                model = TextGenerationModel.from_pretrained(self.model_name)
                response = model.predict(
                    prompt,
                    temperature=self.temperature,
                    max_output_tokens=self.max_decode_steps,
                    top_k=self.top_k,
                    top_p=self.top_p,)
                """
                response = response.text.strip()
                if response=="Yes":
                    return pred
                elif response == "No":
                    return pred_others

            prompt = f'''This is a description of the company:
            ```
            {description}
            ```

            Select from one of the choices below to label the primary activity of the company using the description above.
            Choice 1: ``` {pred}```

            Choice 2: ```{pred_others}```

            While selecting the choices, consider the description of each choice:
            ```{cat_desc}```

            You must follow these rules when seleting the choices:
            - If the company's is primarily a financing company, then select the choice 'Finance companies (with deposit taking functions)'.
            Strictly output without any explanation or preambles.
            '''
            model = GenerativeModel("gemini-pro")
            chat = model.start_chat()
            response = chat.send_message(prompt)
            """
            model = TextGenerationModel.from_pretrained(self.model_name)
            response = model.predict(
                prompt,
                temperature=self.temperature,
                max_output_tokens=self.max_decode_steps,
                top_k=self.top_k,
                top_p=self.top_p,)
            """

            cleaned_text = re.sub(r'\d\.', '', response.text).strip()
            cleaned_text = re.sub(r'Choice \d\.', '', cleaned_text).strip()
            return cleaned_text
        else:
            output = self.model(description, [pred, pred_others], multi_label=False)
            return output['labels'][np.argmax(output['scores'])]

# Function: Evaluator

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

class Evaluator:
    def __init__(self, label, predicted):
        """IMPORTANT: there must be the same number of files in the label csv and number of pdf files"""
        self.evaluate_df = pd.concat([predicted, label], axis=1)
        #self.evaluate_df["SSIC Code"] = self.evaluate_df["SSIC Code"].astype(str)
        #self.evaluate_df["SSIC Code (Predicted)"] = self.evaluate_df["SSIC Code (Predicted)"].astype(str)
        all_classes = set(self.evaluate_df["SSIC Code"].values).union(set(self.evaluate_df["SSIC Code (Predicted)"].values))
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(list(all_classes))

    def evaluate(self):

        ground_truth = self.label_encoder.transform(self.evaluate_df["SSIC Code"]).tolist()
        predicted_labels = self.label_encoder.transform(self.evaluate_df["SSIC Code (Predicted)"]).tolist()
        accuracy = accuracy_score(ground_truth, predicted_labels)
        precision = precision_score(ground_truth, predicted_labels, average='micro')
        recall = recall_score(ground_truth, predicted_labels, average='micro')
        f1 = f1_score(ground_truth, predicted_labels, average='micro')

        return self.evaluate_df, accuracy, precision, recall, f1


# Full Pipeline

In [10]:
# Define Full Pipeline for mulitple files

def classify_one_company(fileloader,
                         file_name,
                         summarizer,
                         batch_size,
                         classifier,
                         activity_embedding):
    # Summarize FS
    ## Read FS
    exclude_strings = [
        'auditor',
        "director",
        "accounting policies",
        "corporate governance",
        "lease liabilities",
        "sustainability",
        "consolidated statement of cash flows",
        "financial risk",
        "dividend"
        ]

    fs, important_text = fileloader.read_pdf_from_gcs(file_name, exclude_strings)
    ## Perform Summary
    relevant_chunks, summarized_desc = summarizer.run_summarizer(fs, important_text, batch_size)
    summary_explanation = summarizer.explain(summarized_desc, relevant_chunks)

    print("Summary extracted: ", summarized_desc)
    ## Perform Classification
    ssic_code, ssic_label, model_score = classifier.classify(summarized_desc)
    return ssic_code, ssic_label, summarized_desc, summary_explanation, model_score

In [11]:
def classify_from_gcs(bucket_name):
    # Load configs

    fileloader = FileLoader(bucket_name)
    config = fileloader.read_yaml_from_gcs("config.yaml")

    project_id = config["project_id"]
    #others_score_threshold = config["others_score_threshold"]
    text_embedding_model_name =  config["text_embedding_model_name"]
    classifier_model_name = config["classifier_model_name"]
    summarizer_model_name = config["summarizer_model_name"]
    google_model_name = config["google_model_name"]

    batch_size = config["batch_size"]

    # Load directories
    model_location = config["dir"]["model_location"]
    tokenizer_location = config["dir"]["tokenizer_location"]
    ssic_df_fp = config["dir"]["ssic_df_fp"]
    l3_list_fp = config["dir"]["l3_list_fp"]
    l3_list_others_fp = config["dir"]["l3_list_others_fp"]
    activity_embedding_fp = config["dir"]["activity_embedding_fp"]
    l2_l3_embeddings_fp = config["dir"]["l2_l3_embeddings_fp"]
    l2_l3_embeddings_others_fp = config["dir"]["l2_l3_embeddings_others_fp"]
    fs_dir = config["dir"]["fs_dir"]
    output_fp = config["dir"]["output_fp"]
    label_fp = config["dir"]["label_fp"]
    evaluator_fp = config["dir"]["evaluator_fp"]

    # Load artifacts
    ssic_df = fileloader.load_df_from_gcs(ssic_df_fp, header = config['columns']["header_row"])
    l3_desc_list = fileloader.load_json_from_gcs(l3_list_fp)
    l3_desc_list_others = fileloader.load_json_from_gcs(l3_list_others_fp)
    l2_l3_embeddings = fileloader.load_embeddings(l2_l3_embeddings_fp, text_embedding_model_name)
    l2_l3_embeddings_others = fileloader.load_embeddings(l2_l3_embeddings_others_fp, text_embedding_model_name)
    activity_embedding = fileloader.load_embeddings(activity_embedding_fp, text_embedding_model_name)
    label = fileloader.load_csv_from_gcs(label_fp)

    assert len(l3_desc_list) == len(l2_l3_embeddings), print(f"Embeddings (non-others) {len(l2_l3_embeddings)} are not the same length as the list {len(l3_desc_list)}")
    assert len(l3_desc_list_others) == len(l2_l3_embeddings_others), print(f"Embeddings {len(l2_l3_embeddings_others)} are not the same length as the list {len(l3_desc_list_others)}")

    # Load classes
    text_embedder = TextEmbedder(text_embedding_model_name,
                                  from_local = True,
                                  project_id=project_id,
                                  bucket_name=bucket_name,
                                  model_location = model_location,
                                  tokenizer_location = tokenizer_location)

    summarizer = FsSummarizer(
        project_id,
        summarizer_model_name,
        max_decode_steps = 256,
        temperature = 0.0,
        top_p = 1.0,
        top_k = 10,
        chunk_size = 1000,
        chunk_overlap = 50,
        text_embedder= text_embedder,
        activity_embedding= activity_embedding,
        k_sim = 5,
        location= "asia-southeast1"
        )

    classifier_google = Classifier(project_id,
                            google_model_name,
                            max_decode_steps = 256,
                            temperature =0.0,
                            top_p = 1.0,
                            top_k = 10,
                            location= "asia-southeast1"
                            )

    classifier = Classifier(project_id,
                            classifier_model_name,
                            max_decode_steps = 256,
                            temperature =0.0,
                            top_p = 1.0,
                            top_k = 10,
                            location= "asia-southeast1"
                            )

    ssic_classifier = SSICClassifier(
        classifier_google,
        classifier,
        ssic_df,
        text_embedder,
        #others_score_threshold,
        l3_desc_list,
        l3_desc_list_others,
        l2_l3_embeddings,
        l2_l3_embeddings_others,
        config
        )
    # Loops through all files in directory
    all_fs = fileloader.list_files(fs_dir)

    assert (len(all_fs)-1) == len(label), print(f"You must have same number of companies in the fs folder ({len(all_fs)-1}) as in the  Company SSIC Code.csv ({len(label)})")

    ssic_labels = pd.DataFrame()

    for file_name in all_fs:
        if file_name.endswith(".pdf"):

            ssic_code, ssic_label, summarized_desc, summary_explanation, model_score = classify_one_company(
                fileloader,
                file_name,
                summarizer,
                batch_size,
                ssic_classifier,
                activity_embedding
                )
            company = file_name.split(".")[0].split('/')[-1]
            new_label =  pd.DataFrame(
                {
                    "Company": [company],
                    "SSIC Title (Predicted)": [ssic_label],
                    "SSIC Code (Predicted)": [ssic_code],
                    "Explanation": [summary_explanation],
                    "Model Score": [model_score]
                    }
                )

            ssic_labels = pd.concat([ssic_labels, new_label])

    ssic_labels = ssic_labels.reset_index(drop=True)
    fileloader.save_df_to_gcs(ssic_labels, output_fp)

    if config["evaluation_mode"] is not None:
        # Calculate metrics
        evaluator = Evaluator(ssic_labels, label)

        summary_df, acc, pre, recall, f1 = evaluator.evaluate()
        print(f"Accuracy: {acc}")

        # Save concat dataframe
        fileloader.save_df_to_gcs(summary_df, evaluator_fp)
        return summary_df
    else:
        return ssic_labels


In [12]:
import time

class SSICClassifier:
    def __init__(self,
                 classifier_google,
                 classifier,
                 ssic_df,
                 text_embedder,
                 l3_desc_list,
                 l3_desc_list_others,
                 l2_l3_embeddings,
                 l2_l3_embeddings_others,
                 config):
        print("Intializing SSIC Classifier.")
        # Intialize Search
        #self.searcher = SimSearch(elbow_method = True, min_k = 4, max_k = 7)
        self.searcher = SimSearch(elbow_method = False, k=10)
        self.classifier_google = classifier_google
        self.classifier = classifier

        # DataFrame
        self.ssic_df = ssic_df

        # L3 List
        self.l3_desc_list = l3_desc_list
        self.l3_desc_list_others = l3_desc_list_others

        # Embeddings
        self.text_embedder = text_embedder
        self.l2_l3_embeddings = l2_l3_embeddings
        self.l2_l3_embeddings_others = l2_l3_embeddings_others

        # Description
        self.l3_desc_list = l3_desc_list
        self.l3_desc_list_others = l3_desc_list_others

        # Others
        self.config = config

    def search(self, encoded_desc, non_others=True):
        if non_others:
            embeddings = self.l2_l3_embeddings_others
            desc_list = self.l3_desc_list_others
        else:
            embeddings = self.l2_l3_embeddings
            desc_list = self.l3_desc_list

        sorted_indices, scores = self.searcher.search_k_sim(encoded_desc, embeddings)

        rel_l3 = [desc_list[i] for i in sorted_indices]
        return rel_l3

    def produce_desc(self, rel_l):
        cat_dict = self.ssic_df[self.ssic_df['SSIC 2020 Title'].isin(rel_l)].set_index('SSIC 2020 Title')['Detailed Definitions'].to_dict()
        cat_desc = "\n\n\n".join(f"{key}: {value}" for key, value in cat_dict.items())
        return cat_desc

    def get_codes(self, df_cat, l3_cat_final):
        label_line = df_cat[(df_cat[self.config['columns']['ssic_desc']].str.contains(l3_cat_final))|(df_cat[self.config['columns']['ssic_desc']]==l3_cat_final)]
        label_cat = label_line[self.config['columns']['ssic_desc']]
        #print(label_line, label_cat, l3_cat_final)
        if type(label_cat) != str:
            if len(label_cat)>0:
                label_cat = label_cat.iloc[0]
        label_code = label_line[self.config['columns']['ssic_code']]
        if type(label_code) != str:
            if len(label_code)>0:
                label_code = label_code.iloc[0]
        return label_code, label_cat

    def classify(self, summaried_desc):
        """Full pipeline for classification"""

        # If holding company, return holding
        label_holdings = self.classifier_google.identify_holding(summaried_desc)

        if label_holdings:
            label_code, label_cat =  self.get_codes(self.ssic_df, label_holdings)
            return label_code, label_cat, None

        # Encode description
        # For non-holding companies
        l3_exist = False
        l3_exist_others = False

        encoded_desc = self.text_embedder.encode_text([summaried_desc], 1)

        # Search for L2+L3 SSIC
        rel_l3 = self.search(encoded_desc, False)
        print("Initial search: ", rel_l3)

        # Get Description for l3
        cat_desc = self.produce_desc(rel_l3)

        # Perform classification with Palm2
        l3_cat = self.classifier.classify(summaried_desc, rel_l3, cat_desc)
        if type(l3_cat) == tuple:
            l3_cat, l3_score = l3_cat
        else:
            l3_score = None

        # Perform Search for Others
        rel_l3_others = self.search(encoded_desc, True)
        print("Initial search others: ", rel_l3_others)

        # Produce Description
        cat_desc_others = self.produce_desc(rel_l3_others)
        # Classify with Palm 2

        l3_cat_others = self.classifier.classify(summaried_desc, rel_l3_others, cat_desc_others)
        if type(l3_cat_others) == tuple:
            l3_cat_others, l3_score_others = l3_cat_others
        else:
            l3_score_others = None

        # Check existence
        if l3_cat in self.l3_desc_list:
            l3_exist = True
        if l3_cat_others in self.l3_desc_list_others:
            l3_exist_others = True

        # Perform comparison
        desc= self.produce_desc([l3_cat,l3_cat_others])
        l3_cat_final = self.classifier.compare(summaried_desc, l3_cat, l3_cat_others, desc)

        # Double Check LLM's response
        if (l3_cat_final == l3_cat) and l3_exist:
            df_cat = self.ssic_df[self.ssic_df[self.config['columns']['ssic_desc']].isin(self.l3_desc_list)]
            l3_score_final = l3_score
        elif (l3_cat_final == l3_cat_others) and l3_exist_others:
            df_cat = self.ssic_df[self.ssic_df[self.config['columns']['ssic_desc']].isin(self.l3_desc_list_others)]
            l3_score_final = l3_score_others
        elif l3_exist:
            l3_cat_final = l3_cat
            l3_score_final = l3_score
            df_cat = self.ssic_df[self.ssic_df[self.config['columns']['ssic_desc']].isin(self.l3_desc_list)]
        elif l3_exist_others:
            l3_cat_final = l3_cat_others
            l3_score_final = l3_score_others
            df_cat = self.ssic_df[self.ssic_df[self.config['columns']['ssic_desc']].isin(self.l3_desc_list_others)]

        print(f"Final label: {l3_cat_final}, selected from {l3_cat}, {l3_cat_others}")

        if l3_exist_others or l3_exist:
            label_code, label_cat =  self.get_codes(df_cat, l3_cat_final)
            return label_code, label_cat, l3_score_final
        else:
            return None, None, None


In [None]:
bucket_name = "acra-ssic-classification"
classify_from_gcs(bucket_name)

Intializing SSIC Classifier.
Processing: data/fs/ABR HOLDINGS LIMITED 2022.pdf
Calling LLM to summarize pdf...
Summary extracted:  - Manufacture of ice cream
- Operation of Swensenâ€™s ice cream parlours cum restaurants
- Operation of other specialty restaurants
Initial search:  ['Manufacture of ice-cream', 'Wholesale of ice-cream', 'Manufacture of condensed and evaporated milk (including pasteurising and bottling of fluid milk)', 'Retail sale of confectionery and bakery products (not manufactured on site)', 'Manufacture of chocolate and chocolate products', 'Wholesale of confectionery and bakery products', 'Manufacture of bread, cakes and confectionery (excluding frozen bakery products)', 'Refrigerated warehousing and storage', 'Manufacture of biscuits (including wafers and cones)', 'Manufacture of non-chocolate confectionery (e.g. sweets, toffees, crystallised fruits, chewing gums)']
Initial search others:  ['Manufacture of ice except dry ice', 'Manufacture and repair of refrigeratin

Unnamed: 0,Company,SSIC Code,Company.1,SSIC Title (Predicted),SSIC Code (Predicted),Explanation,Model Score
0,ABR HOLDINGS LIMITED,47219,ABR HOLDINGS LIMITED 2022,Manufacture of ice-cream,10503,- The principal activities of the Company are ...,
1,ABUNDANCE INTERNATIONAL LIMITED,46649,ABUNDANCE INTERNATIONAL LIMITED 2022,Commercial printing (e.g. printing of brochure...,18113,- The principal activities of the Company are ...,
2,ABUNDANTE LIMITED,23940,ABUNDANTE LIMITED 2023,Other holding companies,64202,- The principal activities of the Company are ...,
3,ACCRELIST LTD.,46900,ACCRELIST LTD,Other holding companies,64202,- The principal activities of the Company is i...,
4,ACESIAN PARTNERS LIMITED,41009,ACESIAN PARTNERS LIMITED 2022,"Manufacture and repair of refrigerating, air-c...",28191,- The primary activities of the Group consist ...,
...,...,...,...,...,...,...,...
95,TT INTERNATIONAL LIMITED,46900,TT INTERNATIONAL LIMITED 2023,Retail sale of electrical household appliances...,47539,- The primary activities of the Company are th...,
96,UNITED OVERSEAS INSURANCE LIMITED,65124,UNITED OVERSEAS INSURANCE LIMITED 2022,"General insurance (except marine and import, e...",65124,- The Companyâ€™s principal activities are the u...,
97,VENTURE CORPORATION LIMITED,26201,VENTURE CORPORATION LIMITED 2022,Manufacture of printed circuit boards with ele...,26123,- The principal activities of the Company are ...,
98,VICOM LTD,95303,VICOM LTD 2022,Other holding companies,64202,- The company's primary activity is solely hol...,


# Gradio Application

In [1]:
def load_pdf(file_content):
    #file_content = file_content.read()
    doc = fitz.open("pdf", file_content)
    concatenated_text = []
    important_text = ""
    length = 0
    num_pages = 0
    exclude_strings = [
        'auditor',
        "director",
        "accounting policies",
        "corporate governance",
        "lease liabilities",
        "sustainability",
        "consolidated statement of cash flows",
        "financial risk",
        "dividend"
        ]
    for page_num in range(doc.page_count):
        if page_num >=2:
            page = doc.load_page(page_num)
            page_text = page.get_text()
            num_pages += 1
            # Check if any of the exclude_strings are present on the page
            if (re.search(re.compile(r'note(s)?\s*to\s*(the)?\s*financial\s*statement(s)?', re.IGNORECASE), page_text)) and ("activit" in page_text.lower()):
                important_text += page_text
                continue
            if any(exclude_str in page_text.lower() for exclude_str in exclude_strings):
                if "activit" not in page_text.lower():
                    continue  # Skip this page if any exclude_string is found
            concatenated_text.append(page_text)
    if len(important_text) >= 8000:
        important_text = important_text[:8000]
    return '\n'.join(concatenated_text), important_text


def gradio_pipeline(file_path):
    bucket_name = "acra-ssic-classification"
    fileloader = FileLoader(bucket_name)
    config = fileloader.read_yaml_from_gcs("config.yaml")

    project_id = config["project_id"]
    #others_score_threshold = config["others_score_threshold"]
    text_embedding_model_name =  config["text_embedding_model_name"]
    classifier_model_name = config["classifier_model_name"]
    summarizer_model_name = config["summarizer_model_name"]
    google_model_name = config["google_model_name"]

    batch_size = config["batch_size"]

    # Load directories
    model_location = config["dir"]["model_location"]
    tokenizer_location = config["dir"]["tokenizer_location"]
    ssic_df_fp = config["dir"]["ssic_df_fp"]
    l3_list_fp = config["dir"]["l3_list_fp"]
    l3_list_others_fp = config["dir"]["l3_list_others_fp"]
    activity_embedding_fp = config["dir"]["activity_embedding_fp"]
    l2_l3_embeddings_fp = config["dir"]["l2_l3_embeddings_fp"]
    l2_l3_embeddings_others_fp = config["dir"]["l2_l3_embeddings_others_fp"]
    fs_dir = config["dir"]["fs_dir"]
    output_fp = config["dir"]["output_fp"]
    label_fp = config["dir"]["label_fp"]
    evaluator_fp = config["dir"]["evaluator_fp"]

    # Load artifacts
    ssic_df = fileloader.load_df_from_gcs(ssic_df_fp, header = config['columns']["header_row"])
    l3_desc_list = fileloader.load_json_from_gcs(l3_list_fp)
    l3_desc_list_others = fileloader.load_json_from_gcs(l3_list_others_fp)
    l2_l3_embeddings = fileloader.load_embeddings(l2_l3_embeddings_fp, text_embedding_model_name)
    l2_l3_embeddings_others = fileloader.load_embeddings(l2_l3_embeddings_others_fp, text_embedding_model_name)
    activity_embedding = fileloader.load_embeddings(activity_embedding_fp, text_embedding_model_name)
    label = fileloader.load_csv_from_gcs(label_fp)

    # Load artifacts
    text_embedder = TextEmbedder(text_embedding_model_name,
                                      from_local = True,
                                      project_id=project_id,
                                      bucket_name=bucket_name,
                                      model_location = model_location,
                                      tokenizer_location = tokenizer_location)

    fileloader = FileLoader(bucket_name)
    config = fileloader.read_yaml_from_gcs("config.yaml")

    project_id = config["project_id"]
    #others_score_threshold = config["others_score_threshold"]
    text_embedding_model_name =  config["text_embedding_model_name"]
    classifier_model_name = config["classifier_model_name"]
    summarizer_model_name = config["summarizer_model_name"]
    google_model_name = config["google_model_name"]

    batch_size = config["batch_size"]

    # Load directories
    model_location = config["dir"]["model_location"]
    tokenizer_location = config["dir"]["tokenizer_location"]
    ssic_df_fp = config["dir"]["ssic_df_fp"]
    l3_list_fp = config["dir"]["l3_list_fp"]
    l3_list_others_fp = config["dir"]["l3_list_others_fp"]
    activity_embedding_fp = config["dir"]["activity_embedding_fp"]
    l2_l3_embeddings_fp = config["dir"]["l2_l3_embeddings_fp"]
    l2_l3_embeddings_others_fp = config["dir"]["l2_l3_embeddings_others_fp"]
    fs_dir = config["dir"]["fs_dir"]
    output_fp = config["dir"]["output_fp"]
    label_fp = config["dir"]["label_fp"]
    evaluator_fp = config["dir"]["evaluator_fp"]


    ssic_df = fileloader.load_df_from_gcs(ssic_df_fp, header = config['columns']["header_row"])
    l3_desc_list = fileloader.load_json_from_gcs(l3_list_fp)
    l3_desc_list_others = fileloader.load_json_from_gcs(l3_list_others_fp)
    l2_l3_embeddings = fileloader.load_embeddings(l2_l3_embeddings_fp, text_embedding_model_name)
    l2_l3_embeddings_others = fileloader.load_embeddings(l2_l3_embeddings_others_fp, text_embedding_model_name)
    activity_embedding = fileloader.load_embeddings(activity_embedding_fp, text_embedding_model_name)
    label = fileloader.load_csv_from_gcs(label_fp)

    summarizer = FsSummarizer(
        project_id,
        summarizer_model_name,
        max_decode_steps = 256,
        temperature = 0.0,
        top_p = 1.0,
        top_k = 10,
        chunk_size = 1000,
        chunk_overlap = 50,
        text_embedder= text_embedder,
        activity_embedding= activity_embedding,
        k_sim = 5,
        location= "asia-southeast1"
        )

    classifier_google = Classifier(project_id,
                            google_model_name,
                            max_decode_steps = 256,
                            temperature =0.0,
                            top_p = 1.0,
                            top_k = 10,
                            location= "asia-southeast1"
                            )

    classifier = Classifier(project_id,
                            classifier_model_name,
                            max_decode_steps = 256,
                            temperature =0.0,
                            top_p = 1.0,
                            top_k = 10,
                            location= "asia-southeast1"
                            )

    ssic_classifier = SSICClassifier(
        classifier_google,
        classifier,
        ssic_df,
        text_embedder,
        #others_score_threshold,
        l3_desc_list,
        l3_desc_list_others,
        l2_l3_embeddings,
        l2_l3_embeddings_others,
        config
        )
    fs, important_text = load_pdf(file_path)

    relevant_chunks, summarized_desc = summarizer.run_summarizer(fs, important_text, 10)
    summary_explanation = summarizer.explain(summarized_desc, relevant_chunks)

    print("Summary extracted: ", summarized_desc)
    ## Perform Classification
    ssic_code, ssic_label, model_score = ssic_classifier.classify(summarized_desc)
    if model_score:
        return str(ssic_code) + " - " + ssic_label + ". Score: "+ str(np.round(model_score,2)) +"\nExplanation: " + summary_explanation
    else:
        return str(ssic_code) + " - " + ssic_label +". Explanation: " + summary_explanation

In [15]:
# Gradio interface
import typing_extensions
import gradio as gr
iface = gr.Interface(
    fn=gradio_pipeline,
    inputs=gr.File(type="binary", label="Upload One PDF File"),
    outputs="text",
    live=True,
    title="SSIC Code Classification",
    description=""
)

# Launch the Gradio interface
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://85c2b42a9215485747.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


