<a href="https://colab.research.google.com/github/vpkrishna/nlp/blob/main/Project_sec_qa_industry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


## Set-up

In [None]:
!pip install dspy-ai



In [None]:
#from datasets import load_dataset
import openai
import os
import dspy


Save the API keys in a `.env` file in the local root directory as follows. Then, `load_dotenv()` will make them available to the notebook:

In [None]:
lm = dspy.OpenAI(model='gpt-3.5-turbo', api_key='jaisrikrishna')
colbert_server = 'http://index.contextual.ai:8893/api/search'
rm = dspy.ColBERTv2(url=colbert_server)
dspy.settings.configure(lm=lm, rm=rm)

In [None]:
import dspy

class GenerateThemeIndustry(dspy.Signature):
    """
     Your task is to Generate  seven to fifteen sub-industries that give
     exposure to companies associated with the investment theme provided in context.
     Please use the context to  generate the  sub-industries"""
    context = dspy.InputField()
    question=dspy.InputField()
    industries = dspy.OutputField(desc="A comma separated list of seven to fifteen Sub-industries associated with context")

class GenerateThemekeywords(dspy.Signature):
    """
     Your task is to Generate five to ten  keyword associated with the industry given the investment theme in the context.
    The number of keywords must not exceed 10"""
    context = dspy.InputField()
    industry=dspy.InputField()
    keyword = dspy.OutputField(desc="five to ten semantic keywords associated with the each sub-industry")

def convert_to_list(s):
    # Check if the string contains numbered list items
    if '\n' in s and s[1] == '.':
        # Split the string into lines and remove the first three characters from each line
        return [item[3:] for item in s.split('\n') if item[3:] != '']
    else:
        # Split the string by comma
        return [item.strip() for item in s.split(',')]

class ThemeToIndustry(dspy.Module):
    def __init__(self):
        super().__init__()
        # Define the predictor to implement the GenerateThemeIndustry signature.
        self.generate_industry = dspy.ChainOfThought(GenerateThemeIndustry)
        self.generate_keywords = dspy.ChainOfThought(GenerateThemekeywords)

    def forward(self, context, question):
        # Generate sub-industries and their associated bag of key-words
        sub_industries = self.generate_industry(context=context,question=question)
        subindustries=convert_to_list(sub_industries.industries)
        keywords_list=[]
        for sub_industry in subindustries:
            new_keywords = self.generate_keywords(context=context,industry=sub_industry)
            keywordslist=convert_to_list(new_keywords.keyword)
            keywords_list.extend(keywordslist)
        return dspy.Prediction(industries=subindustries,keywords=keywords_list)


In [None]:
my_question = "Generate seven to fifteen sub-industries that give exposure\
                          to companies associated with the investment theme provided in context"
context='Aerospace & Defence'
theme_pred = ThemeToIndustry()(context=context,question=my_question)
print(theme_pred.industries)
print(theme_pred.keywords)

['Aircraft manufacturing', 'Defense electronics', 'Aerospace components', 'Military vehicles', 'Space exploration', 'Defense contractors', 'Cybersecurity for defense', 'Satellite technology', 'Unmanned aerial vehicles (drones)', ' Air traffic control systems', ' Defense logistics', ' Missile defense systems', ' Military training and simulation', ' Defense consulting services', ' Aerospace engineering and design']
['Aircraft design', 'Composite materials', 'Aviation regulations', 'Market demand', 'Supply chain management', 'Engine technology', 'Defense contracts', 'Maintenance, repair, and overhaul (MRO)', 'Aerospace engineering', ' Innovation in aviation', 'Radar systems', 'Communication systems', 'Electronic warfare', 'Surveillance technology', 'Missile guidance systems', 'Cybersecurity', 'Defense contractors', 'Military electronics', 'Signal processing', ' Avionics', 'Aircraft engines', 'Avionics systems', 'Composite materials', 'Precision machining', 'Aerospace fasteners', 'Hydrauli

In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/2Stanford/2nlp/Hybrid_Mkt_GPT_4_Theme_Industry_Keywords.csv')
trainsets = []
df_grouped = df.groupby('Theme_Name').agg({
    'Sub_Industry': ', '.join,
    'Keywords': ', '.join
}).reset_index()
for _, row in df_grouped.iterrows():
    context = f"Given the context as {row['Theme_Name']}"
    question = "Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context"

    industries = row['Sub_Industry']
    keyword = row['Keywords']
    example = dspy.Example(context=context, question=question, industries=industries, keyword=keyword).with_inputs('context','question')
    trainsets.append(example)

In [None]:
trainset=trainsets[0:19]
ootset=trainsets[19:]

In [None]:
len(ootset)

6

In [None]:
df_grouped

Unnamed: 0,Theme_Name,Sub_Industry,Keywords
0,Aerospace & Defence,"Commercial Aircraft Manufacturing, Military Ai...","airliners, passenger jets, cargo planes, aeros..."
1,Aging Population,"Senior Healthcare Services, Assisted Living Fa...","geriatric care, home healthcare, telehealth se..."
2,Autonomous Technology & Robotics,"Autonomous Transportation, Robotics and Automa...","self-driving cars, autonomous vehicles, electr..."
3,Biotechnology,"Pharmaceutical Biotechnology, Agricultural Bio...","drug discovery, therapeutic proteins, monoclon..."
4,Blockchain and Tech,"Cryptocurrency Exchanges, Blockchain Developme...","cryptocurrency trading, exchange platform, dig..."
5,Clean Energy,"Solar Power Generation, Wind Power Generation,...","photovoltaic systems, solar panels, solar farm..."
6,Cloud Computing,"Infrastructure-as-a-Service (IaaS), Platform-a...","virtual servers, cloud storage, network infras..."
7,Cybersecurity,"Network Security, Endpoint Security, Identity ...","intrusion detection systems, firewalls, virtua..."
8,Data Center REITs & Digital Infrastructure,Data Center Real Estate Investment Trusts (REI...,"data center leasing, colocation services, data..."
9,Fintech Innovation,"Transaction Innovations, Blockchain Technology...","payment processing, digital wallets, contactle..."


In [None]:
import json

data = []

# Iterate over the DataFrame
for i, row in df_grouped.iterrows():
    formatted_data = {
        'messages': [
            {'role': 'system', 'content': 'Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context.The sub-industries\
                                         generated must not be more than 10. Also generate keywords for the generated sub-industries'},
            {'role': 'user', 'content': f'The chosen context is {row["Theme_Name"]}.'},
            {'role': 'assistant', 'content': f'The sub-industries are {row["Sub_Industry"]}. The keywords are {row["Keywords"]}.'}
        ]
    }

    # Add the formatted data to the list
    data.append(formatted_data)

# Split the data into training and validation sets
training_data = data[:10]
validation_data = data[10:15]
oot_data=data[15:]

In [None]:
import json

data = []

# Iterate over the DataFrame
for i, row in df_grouped.iterrows():
    formatted_data = {
        'messages': [
            {'role': 'system', 'content': 'Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context.The sub-industries\
                                         generated must not be more than 10. Also generate keywords for the generated sub-industries. I want the answer \
                to be written as The sub-industries are and then sub_industries.Likewise for keywords, I want the answer as The keywords are and then keywords.'},
            {'role': 'user', 'content': f'The chosen context is {row["Theme_Name"]}.'},
            {'role': 'assistant', 'content': f'The sub-industries are {row["Sub_Industry"]}. The keywords are {row["Keywords"]}.'}
        ]
    }

    # Add the formatted data to the list
    data.append(formatted_data)

oot_data_untrained=data[19:]

In [None]:
import json

training_file_name = "training_data.jsonl"
validation_file_name = "validation_data.jsonl"
oot_file_name="oot_data.jsonl"
oot_untrained_file_name="oot_data_untrained.jsonl"

def prepare_data(dictionary_data, final_file_name):
    with open(final_file_name, 'w') as outfile:
        for entry in dictionary_data:
        	json.dump(entry, outfile)
        	outfile.write('\n')

prepare_data(training_data, "training_data.jsonl")
prepare_data(validation_data, "validation_data.jsonl")
prepare_data(oot_data, "oot_data.jsonl")
prepare_data(oot_data_untrained, "oot_data_untrained.jsonl")

In [None]:
def load_jsonl(input_path):
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

# Load the data
training_data = load_jsonl("training_data.jsonl")

# Now, you can print the data
for entry in training_data:
    print(entry)

{'messages': [{'role': 'system', 'content': 'Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context.The sub-industries                                         generated must not be more than 10. Also generate keywords for the generated sub-industries'}, {'role': 'user', 'content': 'The chosen context is Aerospace & Defence.'}, {'role': 'assistant', 'content': 'The sub-industries are Commercial Aircraft Manufacturing, Military Aircraft and Systems, Space Systems and Exploration, Defense Equipment and Armaments, Aerospace Components and Systems, Unmanned Aerial Vehicles (UAVs), Aerospace Maintenance and Services. The keywords are airliners, passenger jets, cargo planes, aerospace engineering, aviation technology, assembly lines, fighter jets, military drones, surveillance aircraft, missile systems, defense electronics, tactical aircraft, satellites, space probes, launch vehicles, space stations, lunar exploration,

In [None]:
import os
from openai import OpenAI

openai.api_key='jaisrikrishna'
client = OpenAI(api_key='Harisarvothama')

In [None]:

from pathlib import Path
from openai import OpenAI

# Define the paths to your training and validation data files
training_file_path = Path('/content/training_data.jsonl')
validation_file_path = Path('/content/validation_data.jsonl')

training_response = client.files.create(
    file=training_file_path,
    purpose='fine-tune',
)

# Get the file ID from the response
training_file_id = training_response.id

# Upload the validation data
validation_response = client.files.create(
    file=validation_file_path,
    purpose='fine-tune',
)

# Get the file ID from the response
validation_file_id = validation_response.id

# Print the file IDs
print('Training file id:', training_file_id)
print('Validation file id:', validation_file_id)

Training file id: file-Rvo6yDcKZCby71udM7G9OdDP
Validation file id: file-sq3o1xClFf2MXdmkgExZTtpS


In [None]:
training_response

FileObject(id='file-Rvo6yDcKZCby71udM7G9OdDP', bytes=14617, created_at=1713249064, filename='training_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
untrained_model="gpt-3.5-turbo"

In [None]:
response = client.fine_tuning.jobs.create(
  training_file=training_file_id,
  validation_file=validation_file_id,
  model="gpt-3.5-turbo",
)
job_id = response.id
status = response.status

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ftjob-oHoakXA5i7MibqG4ijiIPvLy.
Training Response: FineTuningJob(id='ftjob-oHoakXA5i7MibqG4ijiIPvLy', created_at=1713249074, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-nOf04gzKve7yBkytwqAAom71', result_files=[], seed=1013993064, status='validating_files', trained_tokens=None, training_file='file-Rvo6yDcKZCby71udM7G9OdDP', validation_file='file-sq3o1xClFf2MXdmkgExZTtpS', integrations=[], user_provided_suffix=None)
Training Status: validating_files


In [None]:
import time

status = client.fine_tuning.jobs.retrieve(job_id).status
if status not in ["succeeded", "failed"]:
    print(f"Job not in terminal status: {status}. Waiting.")
    while status not in ["succeeded", "failed"]:
        time.sleep(2)
        status = client.fine_tuning.jobs.retrieve(job_id).status
        print(f"Status: {status}")
else:
    print(f"Finetune job {job_id} finished with status: {status}")
print("Checking other finetune jobs in the subscription.")
result = client.fine_tuning.jobs.list()
print(f"Found {len(result.data)} finetune jobs.")

Job not in terminal status: validating_files. Waiting.
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: queued
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: ru

In [None]:
# Retrieve the finetuned model
fine_tuned_model = result.data[0].fine_tuned_model
print(fine_tuned_model)

ft:gpt-3.5-turbo-0125:personal::9EWmoAX1


In [None]:
# fine_tuned_model="ft:gpt-3.5-turbo-0125:personal::9ETXqOlY"

In [None]:
completion = client.chat.completions.create(
  model=fine_tuned_model,
  messages=[
    {'role': 'system',
    'content': 'Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context.'},
   {'role': 'user', 'content': 'The chosen context is Aerospace & Defence.'}
  ],temperature=0, max_tokens=500
)
print(completion.choices[0].message)

ChatCompletionMessage(content='The sub-industries are Commercial Aircraft Manufacturing, Military Aircraft and Systems, Space Systems and Exploration, Defense Equipment and Armaments, Aerospace Components and Systems, Unmanned Aerial Vehicles (UAVs), Aerospace Maintenance and Services.', role='assistant', function_call=None, tool_calls=None)


In [None]:
test_messages=  messages=[
    {'role': 'system','content': 'Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context.'},
   {'role': 'user', 'content': 'The chosen context is Large Language Models.'}
  ]

In [None]:
response = client.chat.completions.create(
    model=fine_tuned_model,
    messages=test_messages,
    temperature=0, max_tokens=500
)
print(response.choices[0].message)

ChatCompletionMessage(content='The sub-industries are Natural Language Processing (NLP), Machine Learning, Deep Learning, Data Labeling and Annotation, Text-to-Speech, Speech Recognition, Language Generation Models, Conversational AI.', role='assistant', function_call=None, tool_calls=None)


In [None]:
model="gpt-3.5-turbo"
completion = client.chat.completions.create(
  model=model,
  messages=[
    {'role': 'system',
    'content': 'Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context. I want the answer \
                to be written as The sub-industries are and then sub_industries.Likewise for keywords, I want the answer as The keywords are and then keywords'},
   {'role': 'user', 'content': 'The chosen context is Aerospace & Defence.'}
  ]
)
print(completion.choices[0].message)

ChatCompletionMessage(content='The sub-industries are:\n1. Commercial Aerospace\n2. Military Aerospace\n3. Aerospace Components\n4. Defense Contractors\n5. Cybersecurity for Defense\n6. Space Exploration\n7. Unmanned Aerial Vehicles (UAVs)\n8. Aerospace Maintenance, Repair, and Overhaul (MRO)\n  \nThe keywords are:\n1. Aerospace industry\n2. Defense sector\n3. Fighter jets\n4. Space technology\n5. Military contracts', role='assistant', function_call=None, tool_calls=None)


In [None]:
def load_jsonl(input_path):
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

# Load the data
oot_data_untrained = load_jsonl("oot_data_untrained.jsonl")

# Now, you can print the data
for entry in oot_data_untrained:
    print(entry)

{'messages': [{'role': 'system', 'content': 'Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context. I want the answer                 to be written as The sub-industries are and then sub_industries.Likewise for keywords, I want the answer as The keywords are and then keywords.'}, {'role': 'user', 'content': 'The chosen context is Semiconductor.'}, {'role': 'assistant', 'content': 'The sub-industries are Semiconductor Fabrication, Semiconductor Design, Semiconductor Equipment and Materials, Microprocessors and Chipsets, Memory and Storage Technology, Semiconductor Distribution, Advanced Semiconductor Technologies. The keywords are foundry services, chip manufacturing, wafer processing, photolithography, cleanroom technology, semiconductor fabrication plants, integrated circuit design, CAD for electronics, microarchitecture, system-on-chip (SoC), electronic design automation (EDA), semiconductor machinery, fabric

In [None]:
fine_tuned_model

'ft:gpt-3.5-turbo-0125:personal::9E9QxsgD'

In [None]:
import json
import openai

# Function to load data from a .jsonl file
def load_jsonl(input_path):
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

# Load the data
oot_data = load_jsonl("oot_data.jsonl")
# Combine training and validation data
all_data = oot_data

list_sub_industries=[]
list_keywords=[]
# Iterate over all data
for conversation in all_data:
    # Extract the messages from the conversation
    messages = conversation['messages']
    messages = [message for message in messages if message['role'] != 'assistant']
    response = client.chat.completions.create(
    model=fine_tuned_model,
    messages=messages,
    temperature=0, max_tokens=500)
    response_text = response.choices[0].message.content
    print(response_text)
    # Split the response_text into two parts: sub_industries and keywords
    sub_industries_part, keywords_part = response_text.split(". The keywords are ")
    # Remove the leading phrase from sub_industries_part and split it into a list
    sub_industries_list = sub_industries_part.replace("The sub-industries are ", "").split(", ")
    # Split keywords_part into a list
    keywords_list = keywords_part.split(", ")
    list_sub_industries.append(sub_industries_list)
    list_keywords.append(keywords_list)



The sub-industries are Semiconductor Manufacturing, Semiconductor Equipment and Materials, Integrated Circuit Design, Semiconductor Foundries, LED Semiconductor Devices, Power Semiconductors, Memory Semiconductors, Semiconductor IP and Design Services. The keywords are wafer fabrication, semiconductor devices, integrated circuits, microchips, electronic components, silicon wafers, gallium arsenide, semiconductor packaging, assembly and testing, semiconductor process technology, lithography, etching, deposition, wafer cleaning, thin film deposition, photomask, semiconductor materials, silicon, compound semiconductors, semiconductor nanoparticles, quantum dots, LED chips, solid state lighting, LED displays, optoelectronic devices, semiconductor lasers, photovoltaic cells, power diodes, MOSFETs, IGBTs, power modules, voltage regulators, power management ICs, superconducting power devices, memory chips, DRAM, SRAM, flash memory, non-volatile memory, memory controllers, semiconductor IP cor

In [None]:
list_sub_industries

[['Semiconductor Manufacturing',
  'Semiconductor Equipment and Materials',
  'Integrated Circuit Design',
  'Semiconductor Foundries',
  'LED Semiconductor Devices',
  'Power Semiconductors',
  'Memory Semiconductors',
  'Semiconductor IP and Design Services'],
 ['Solar Power Generation',
  'Solar Panel Manufacturing',
  'Solar Farm Development',
  'Concentrated Solar Power',
  'Residential Solar Solutions',
  'Commercial and Industrial Solar Solutions',
  'Solar Technology and Innovation'],
 ['Launch Services',
  'Satellite Manufacturing',
  'Space Exploration',
  'Space Mining',
  'Space Systems and Components',
  'Space Transportation',
  'Space Stations',
  'Lunar Exploration',
  'Mars Exploration'],
 ['Telehealth Services',
  'Remote Patient Monitoring',
  'Digital Health Devices',
  'Medical Robotics',
  'Healthcare Data Analytics',
  'Telemedicine Software Platforms',
  'Chronic Disease Management',
  'Wearable Health Technology',
  'Telehealth Infrastructure'],
 ['Video Game D

In [None]:
import json
import openai

# Function to load data from a .jsonl file
def load_jsonl(input_path):
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

# Load the data
oot_data_untrained = load_jsonl("oot_data_untrained.jsonl")


# Combine training and validation data
all_data =oot_data_untrained

list_sub_industries_untrained=[]
list_keywords_untrained=[]
# Iterate over all data
for conversation in all_data:
    # Extract the messages from the conversation
    messages = conversation['messages']
    messages = [message for message in messages if message['role'] != 'assistant']
    response = client.chat.completions.create(
    model=untrained_model,
    messages=messages,
    temperature=0, max_tokens=500)
    response_text = response.choices[0].message.content
    parts = response_text.split('The keywords are:')
    sub_industries_part = parts[0]
    keywords_part = parts[1]
    sub_industries_list=sub_industries_part.replace('The sub-industries are:', '').split('\n')[1:]
    keywords_list = keywords_part.split('\n')[1:]
    sub_industries_list=[item.strip()[3:] for item in sub_industries_list if item.strip()]
    keywords_list= [item.strip()[3:] for item in keywords_list if item.strip()]
    list_sub_industries_untrained.append(sub_industries_list)
    list_keywords_untrained.append(keywords_list)

In [None]:
df_grouped_copy=df_grouped[19:].copy()
theme_list=df_grouped_copy['Theme_Name'].values.tolist()
df_grouped_copy['Sub_Industry'] = df_grouped_copy['Sub_Industry'].str.split(',').apply(lambda x: [i.strip() for i in x])
industry_list_golden = df_grouped_copy['Sub_Industry'].tolist()
df_grouped_copy['Keywords'] =df_grouped_copy['Keywords'].str.split(',').apply(lambda x: [i.strip() for i in x])
Keywords_list_golden = df_grouped_copy['Keywords'].tolist()

In [None]:
print(len(list_sub_industries), len(industry_list_golden))

6 6


In [None]:
def create_bigrams(lst):
    bigrams = set()
    for sublist in lst:
        for string in sublist:
            words = string.split()
            words=[word.lower().strip() for word in words]
            for i in range(len(words)-1):
                bigrams.add((words[i], words[i+1]))

    return [bigram for bigram in bigrams if 'and' not in bigram and 'for' not in bigram and  'or' not in bigram and 'in' not in bigram]



def compare_bigrams(industry_list, industry_list_golden):
    bigrams_industry = create_bigrams(industry_list)
    bigrams_golden = create_bigrams(industry_list_golden)

    tp = sum(1 for bigram in bigrams_industry if bigram in bigrams_golden)
    fp = sum(1 for bigram in bigrams_industry if bigram not in bigrams_golden)
    fn = sum(1 for bigram in bigrams_golden if bigram not in bigrams_industry)
    return tp, fp, fn , tp/(tp+fp) , tp/(tp+fn)

## OOT results

In [None]:
print(len(list_sub_industries),len(industry_list_golden))
tp, fp, fn ,precision, recall = compare_bigrams(list_sub_industries, industry_list_golden)
print(f"TP: {tp}, FP: {fp}, FN: {fn}, Precision: {precision}, Recall : {recall}")

6 6
TP: 12, FP: 52, FN: 46, Precision: 0.1875, Recall : 0.20689655172413793


## Untrained model results

In [None]:
list_sub_industries=list_sub_industries_untrained
industry_list_golden=list_keywords_untrained
print(len(list_sub_industries),len(industry_list_golden))
tp, fp, fn ,precision, recall = compare_bigrams(list_sub_industries, industry_list_golden)
print(f"TP: {tp}, FP: {fp}, FN: {fn}, Precision: {precision}, Recall : {recall}")

6 6
TP: 3, FP: 50, FN: 42, Precision: 0.05660377358490566, Recall : 0.06666666666666667


DSPy optimization

In [None]:
df_grouped

Unnamed: 0,Theme_Name,Sub_Industry,Keywords
0,Aerospace & Defence,"Commercial Aircraft Manufacturing, Military Ai...","airliners, passenger jets, cargo planes, aeros..."
1,Aging Population,"Senior Healthcare Services, Assisted Living Fa...","geriatric care, home healthcare, telehealth se..."
2,Autonomous Technology & Robotics,"Autonomous Transportation, Robotics and Automa...","self-driving cars, autonomous vehicles, electr..."
3,Biotechnology,"Pharmaceutical Biotechnology, Agricultural Bio...","drug discovery, therapeutic proteins, monoclon..."
4,Blockchain and Tech,"Cryptocurrency Exchanges, Blockchain Developme...","cryptocurrency trading, exchange platform, dig..."
5,Clean Energy,"Solar Power Generation, Wind Power Generation,...","photovoltaic systems, solar panels, solar farm..."
6,Cloud Computing,"Infrastructure-as-a-Service (IaaS), Platform-a...","virtual servers, cloud storage, network infras..."
7,Cybersecurity,"Network Security, Endpoint Security, Identity ...","intrusion detection systems, firewalls, virtua..."
8,Data Center REITs & Digital Infrastructure,Data Center Real Estate Investment Trusts (REI...,"data center leasing, colocation services, data..."
9,Fintech Innovation,"Transaction Innovations, Blockchain Technology...","payment processing, digital wallets, contactle..."


In [None]:
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import LabeledFewShot


def generate_ngrams(s, n):
      s = s.lower()
      tokens = [token for token in s.split(" ") if token != ""]
      ngrams = zip(*[tokens[i:] for i in range(n)])
      return [" ".join(ngram) for ngram in ngrams]

def validate_context_and_answer(example, pred, trace=None):
    print(example.industries.split(","),pred.industries)
    lista=example.industries.split(",")
    listb=pred.industries
    lista = [item.strip() for item in lista]
    lista_bigrams = [generate_ngrams(item, 2) for item in lista]
    score=0
    for b in listb:
      bigrams_b = generate_ngrams(b, 2)
      result = any(set(a) & set(bigrams_b) for a in lista_bigrams)
      score += 5 if result else -1
    industry_EM = True if score >=0 else False
    score=0
    lista=example.keyword.split(",")
    listb=pred.keywords
    lista = [item.strip() for item in lista]
    lista_bigrams = [generate_ngrams(item, 1) for item in lista]
    score=0
    for b in listb:
      bigrams_b = generate_ngrams(b, 1)
      result = any(set(a) & set(bigrams_b) for a in lista_bigrams)
      score += 2 if result else -1
    keyword_EM = True if score >=0 else False
    return industry_EM and keyword_EM
# Set up a basic teleprompter, which will compile our RAG program.
#teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
teleprompter =LabeledFewShot(k=25)

# Compile!
compiled_rag = teleprompter.compile(ThemeToIndustry(), trainset=trainset)

In [None]:
df_grouped_copy=df_grouped[19:].copy()
theme_list=df_grouped_copy['Theme_Name'].values.tolist()
df_grouped_copy['Sub_Industry'] = df_grouped_copy['Sub_Industry'].str.split(',').apply(lambda x: [i.strip() for i in x])
industry_list_golden = df_grouped_copy['Sub_Industry'].tolist()
df_grouped_copy['Keywords'] =df_grouped_copy['Keywords'].str.split(',').apply(lambda x: [i.strip() for i in x])
Keywords_list_golden = df_grouped_copy['Keywords'].tolist()

In [None]:
def find_ind_keywords(context,my_question):
  context=context
  pred= compiled_rag(context=context,question=my_question)
  industry_list.append(pred.industries)
  keyword_list.append(pred.keywords)
  return industry_list,keyword_list

In [None]:
# Ask any question you like to this simple RAG program.
my_question = "Generate seven to fifteen sub-industries that give exposure\
                          to companies associated with the investment theme provided in context"
context='Aerospace & Defence'
pred= compiled_rag(context=context,question=my_question)
print(f"Industries: {pred.industries}")
print(f"Keywords: {pred.keywords}")

Industries: ['Commercial Aircraft Manufacturing', 'Military Aircraft and Systems', 'Space Systems and Exploration', 'Defense Equipment and Armaments', 'Aerospace Components and Systems', 'Unmanned Aerial Vehicles (UAVs)', 'Aerospace Maintenance and Services']
Keywords: ['fuselage assembly', 'wing design', 'avionics systems', 'composite materials', 'aerodynamics', 'aircraft engines', 'landing gear', 'cabin interiors', 'flight control systems', 'safety regulations', 'aircraft certification', 'fighter jets', 'surveillance drones', 'missile defense systems', 'avionics technology', 'stealth technology', 'electronic warfare', 'tactical aircraft', 'defense electronics', 'military drones', 'combat systems', 'space probes', 'lunar exploration', 'Mars missions', 'satellite technology', 'interplanetary exploration', 'space stations', 'asteroid mining', 'space tourism', 'commercial spaceflight', 'astronomical research', 'fighter jets', 'missile systems', 'tactical aircraft', 'armored vehicles', 'n

In [None]:
len(theme_list)

6

In [None]:
# Ask any question you like to this simple RAG program.
my_question = "Generate seven to fifteen sub-industries that give exposure\
                          to companies associated with the investment theme provided in context."
industry_list=[]
keyword_list=[]
for context in theme_list:
  industry_list,keyword_list=find_ind_keywords(context,my_question)

# Print the contexts and the answer.
##print(f"Question: {my_question}")
##print(f"Industries: {pred.industries}")
##print(f"Keywords: {pred.keyword}")

In [None]:
industry_list

[['Semiconductor Manufacturing',
  'Semiconductor Design',
  'Semiconductor Equipment',
  'Semiconductor Materials',
  'Memory Chips',
  'Microprocessors',
  'Analog and Digital Integrated Circuits',
  'Semiconductor Foundries',
  'Semiconductor Packaging and Testing',
  'Application-Specific Integrated Circuits (ASICs)',
  'System-on-Chip (SoC) Design',
  'Field-Programmable Gate Arrays (FPGAs)',
  'Power Management ICs',
  'Optoelectronics',
  'Sensors'],
 ['Solar Panel Manufacturing',
  'Solar Inverter Manufacturing',
  'Solar Energy Storage Solutions',
  'Solar Installation Services',
  'Solar Project Development',
  'Solar Financing and Leasing',
  'Solar Monitoring and Maintenance',
  'Solar Software and Analytics',
  'Solar Microgrid Systems',
  'Solar Thermal Technology',
  'Solar Energy Consulting Services',
  'Solar Energy Education and Training',
  'Solar Energy Research and Development',
  'Solar Energy Integration with Smart Grids',
  'Solar Energy Community Programs'],
 [

In [None]:
keyword_list

[['wafer fabrication',
  'semiconductor packaging',
  'testing equipment',
  'silicon wafers',
  'integrated circuits',
  'microprocessors',
  'memory chips',
  'semiconductor materials',
  'semiconductor equipment suppliers',
  'semiconductor industry trends',
  'integrated circuits',
  'semiconductor manufacturing',
  'chip design',
  'electronic design automation (EDA)',
  'semiconductor materials',
  'silicon wafer fabrication',
  'transistor technology',
  'semiconductor testing',
  'semiconductor packaging',
  'system-on-chip (SoC) design',
  'field-programmable gate arrays (FPGAs)',
  'lithography machines',
  'wafer inspection systems',
  'etching equipment',
  'deposition tools',
  'metrology systems',
  'semiconductor testing equipment',
  'probe stations',
  'semiconductor packaging equipment',
  'cleanroom technology',
  'semiconductor fabrication tools',
  'silicon wafers',
  'gallium arsenide substrates',
  'germanium crystals',
  'semiconductor materials',
  'epitaxial g

In [None]:
print(len(industry_list), len(industry_list_golden))

6 6


In [None]:
def create_bigrams(lst):
    bigrams = set()
    for sublist in lst:
        for string in sublist:
            words = string.split()
            words=[word.lower().strip() for word in words]
            for i in range(len(words)-1):
                bigrams.add((words[i], words[i+1]))

    return [bigram for bigram in bigrams if 'and' not in bigram and 'for' not in bigram and  'or' not in bigram and 'in' not in bigram]



def compare_bigrams(industry_list, industry_list_golden):
    bigrams_industry = create_bigrams(industry_list)
    bigrams_golden = create_bigrams(industry_list_golden)

    tp = sum(1 for bigram in bigrams_industry if bigram in bigrams_golden)
    fp = sum(1 for bigram in bigrams_industry if bigram not in bigrams_golden)
    fn = sum(1 for bigram in bigrams_golden if bigram not in bigrams_industry)

    return tp, fp, fn ,tp/(tp+fp) , tp/(tp+fn)

tp, fp, fn ,precision ,recall  = compare_bigrams(industry_list, industry_list_golden)
print(f"TP: {tp}, FP: {fp}, FN: {fn}, Precision: {precision}, Recall : {recall}")

TP: 19, FP: 114, FN: 39, Precision: 0.14285714285714285, Recall : 0.3275862068965517


In [None]:
def compare_bigrams(keywords_list, keywords_list_golden):
    bigrams_industry = create_bigrams(keywords_list)
    bigrams_golden = create_bigrams(keywords_list_golden)

    tp = sum(1 for bigram in bigrams_industry if bigram in bigrams_golden)
    fp = sum(1 for bigram in bigrams_industry if bigram not in bigrams_golden)
    fn = sum(1 for bigram in bigrams_golden if bigram not in bigrams_industry)


    return tp, fp, fn

tp, fp, fn = compare_bigrams(keyword_list, Keywords_list_golden)
print(f"TP: {tp}, FP: {fp}, FN: {fn}")

NameError: name 'keyword_list' is not defined

DSPy LLM using Turbo3.5 for retrieving the Industry and key words

In [None]:
lm2 = dspy.OpenAI(model='gpt-3.5-turbo', api_key='')
colbert_server = 'http://index.contextual.ai:8893/api/search'
rm2 = dspy.ColBERTv2(url=colbert_server)
dspy.settings.configure(lm=lm2, rm=rm2)

In [None]:
import dspy

class GenerateThemeIndustry(dspy.Signature):
    """
     Your task is to Generate  seven to fifteen sub-industries that give
     exposure to companies associated with the investment theme provided in context.
     Please use the context to  generate the  sub-industries"""
    context = dspy.InputField()
    question=dspy.InputField()
    industries = dspy.OutputField(desc="A comma separated list of seven to fifteen Sub-industries associated with context")

class GenerateThemekeywords(dspy.Signature):
    """
     Your task is to Generate five to ten  keyword associated with the industry given the investment theme in the context.
    The number of keywords must not exceed 10"""
    context = dspy.InputField()
    industry=dspy.InputField()
    keyword = dspy.OutputField(desc="five to ten semantic keywords associated with the each sub-industry")

def convert_to_list(s):
    # Check if the string contains numbered list items
    if '\n' in s and s[1] == '.':
        # Split the string into lines and remove the first three characters from each line
        return [item[3:] for item in s.split('\n') if item[3:] != '']
    else:
        # Split the string by comma
        return [item.strip() for item in s.split(',')]

class ThemeToIndustry(dspy.Module):
    def __init__(self):
        super().__init__()
        # Define the predictor to implement the GenerateThemeIndustry signature.
        self.generate_industry = dspy.ChainOfThought(GenerateThemeIndustry)
        self.generate_keywords = dspy.ChainOfThought(GenerateThemekeywords)

    def forward(self, context, question):
        # Generate sub-industries and their associated bag of key-words
        sub_industries = self.generate_industry(context=context,question=question)
        subindustries=convert_to_list(sub_industries.industries)
        keywords_list=[]
        for sub_industry in subindustries:
            new_keywords = self.generate_keywords(context=context,industry=sub_industry)
            keywordslist=convert_to_list(new_keywords.keyword)
            keywords_list.extend(keywordslist)
        return dspy.Prediction(industries=subindustries,keywords=keywords_list)


In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/2Stanford/2nlp/Hybrid_Mkt_GPT_4_Theme_Industry_Keywords.csv')
trainset = []
df_grouped = df.groupby('Theme_Name').agg({
    'Sub_Industry': ', '.join,
    'Keywords': ', '.join
}).reset_index()
for _, row in df_grouped.iterrows():
    context = f"Given the context as {row['Theme_Name']}"
    question = "Generate seven to fifteen sub-industries that give exposure to companies associated with the investment theme provided in context"

    industries = row['Sub_Industry']
    keyword = row['Keywords']
    example = dspy.Example(context=context, question=question, industries=industries, keyword=keyword).with_inputs('context','question')
    trainset.append(example)

In [None]:
my_question = "Generate seven to fifteen sub-industries that give exposure\
                          to companies associated with the investment theme provided in context"
context='Aerospace & Defence'
theme_pred = ThemeToIndustry()(context=context,question=my_question)
print(theme_pred.industries)
print(theme_pred.keywords)

['Commercial Aerospace', 'Defense Technology', 'Aerospace Components', 'Military Aircraft', 'Space Exploration', 'Defense Electronics', 'Aerospace Engineering', 'Defense Contractors', 'Satellite Communications', ' Air Traffic Control Systems', ' Defense Training and Simulation', ' Unmanned Aerial Vehicles (UAVs)', ' Missile Defense Systems', ' Aerospace Maintenance, Repair, and Overhaul (MRO)', ' Defense Cybersecurity']
['Aircraft manufacturing', 'Airline industry', 'Aerospace engineering', 'Aviation technology', 'Commercial aircraft', 'Aerospace supply chain', 'Passenger air travel', 'Aerospace innovation', 'Aircraft maintenance', ' Aerospace market analysis', 'Advanced weaponry', 'Cybersecurity', 'Surveillance technology', 'Military communication systems', 'Drone technology', 'Radar systems', 'Biometric security', 'Satellite technology', 'Electronic warfare', ' Autonomous vehicles', 'Aircraft parts', 'Aerospace fasteners', 'Avionics components', 'Engine components', 'Structural compo

In [None]:
df_grouped_copy=df_grouped.copy()
theme_list=df_grouped['Theme_Name'].values.tolist()
df_grouped['Sub_Industry'] = df_grouped['Sub_Industry'].str.split(',').apply(lambda x: [i.strip() for i in x])
industry_list_golden = df_grouped['Sub_Industry'].tolist()
df_grouped['Keywords'] = df_grouped['Keywords'].str.split(',').apply(lambda x: [i.strip() for i in x])
Keywords_list_golden = df_grouped['Keywords'].tolist()


In [None]:
Keywords_list_golden

[['airliners',
  'passenger jets',
  'cargo planes',
  'aerospace engineering',
  'aviation technology',
  'assembly lines',
  'fighter jets',
  'military drones',
  'surveillance aircraft',
  'missile systems',
  'defense electronics',
  'tactical aircraft',
  'satellites',
  'space probes',
  'launch vehicles',
  'space stations',
  'lunar exploration',
  'Mars missions',
  'armored vehicles',
  'naval ships',
  'submarines',
  'artillery systems',
  'smart munitions',
  'electronic warfare',
  'avionics',
  'propulsion systems',
  'flight control systems',
  'landing gear',
  'aerospace materials',
  'composite structures',
  'drones',
  'UAV systems',
  'autonomous flight',
  'surveillance drones',
  'combat drones',
  'drone navigation',
  'MRO (maintenance',
  'repair',
  'and overhaul)',
  'retrofitting',
  'upgrades',
  'flight training',
  'simulation systems',
  'logistic support'],
 ['geriatric care',
  'home healthcare',
  'telehealth services',
  'chronic disease managemen

In [None]:
industry_list_golden

[['Commercial Aircraft Manufacturing',
  'Military Aircraft and Systems',
  'Space Systems and Exploration',
  'Defense Equipment and Armaments',
  'Aerospace Components and Systems',
  'Unmanned Aerial Vehicles (UAVs)',
  'Aerospace Maintenance and Services'],
 ['Senior Healthcare Services',
  'Assisted Living Facilities',
  'Pharmaceutical Development for Age-Related Diseases',
  'Medical Devices for Seniors',
  'Nutritional Supplements and Health Foods',
  'Financial Services for Seniors',
  'Age-Friendly Products and Services'],
 ['Autonomous Transportation',
  'Robotics and Automation',
  '3D Printing',
  'Energy Storage',
  'Space Exploration'],
 ['Pharmaceutical Biotechnology',
  'Agricultural Biotechnology',
  'Industrial Biotechnology',
  'Environmental Biotechnology',
  'Medical Devices and Diagnostics',
  'Genomics and Proteomics',
  'Bioinformatics and Data Analysis'],
 ['Cryptocurrency Exchanges',
  'Blockchain Development Platforms',
  'Decentralized Finance (DeFi)',
  'D

In [None]:
def find_ind_keywords(context,my_question):
  print(f"Industries: {industry_list}")
  print(f"Keywords: {keyword_list}")
  context=context
  pred= ThemeToIndustry()(context=context,question=my_question)
  industry_list.append(pred.industries)
  keyword_list.append(pred.keywords)
  return industry_list,keyword_list


In [None]:
# Ask any question you like to this simple RAG program.
my_question = "Generate seven to fifteen sub-industries that give exposure\
                          to companies associated with the investment theme provided in context."
industry_list=[]
keyword_list=[]
for context in theme_list:
  industry_list,keyword_list=find_ind_keywords(context,my_question)




Industries: []
Keywords: []
Industries: [['Integrated Circuits', 'Memory Chips', 'Microprocessors', 'Analog Semiconductors', 'Semiconductor Equipment', 'Semiconductor Materials', 'LED Semiconductors', 'Power Management Semiconductors', 'RF Semiconductors', ' Image Sensors', ' Display Drivers', ' System-on-Chip (SoC)', ' Field-Programmable Gate Arrays (FPGAs)', ' Optoelectronics']]
Keywords: [['Microprocessors', 'Memory chips', 'Analog circuits', 'Digital circuits', 'System-on-chip (SoC)', 'Field-programmable gate arrays (FPGAs)', 'Application-specific integrated circuits (ASICs)', 'Semiconductor fabrication', 'Silicon wafer', ' Semiconductor packaging', 'DRAM (Dynamic Random Access Memory)', 'NAND (Negative-AND) Flash', 'Memory chip manufacturers', 'Semiconductor industry', 'Data storage solutions', 'AI (Artificial Intelligence) applications', 'IoT (Internet of Things) devices', 'Market demand for memory chips', 'Supply chain management', ' Emerging technologies in memory chips', 'CPU 

In [None]:
def create_bigrams(lst):
    bigrams = set()
    for sublist in lst:
        for string in sublist:
            words = string.split()
            words=[word.lower().strip() for word in words]
            for i in range(len(words)-1):
                bigrams.add((words[i], words[i+1]))

    return [bigram for bigram in bigrams if 'and' not in bigram and 'for' not in bigram and  'or' not in bigram and 'in' not in bigram]



def compare_bigrams(industry_list, industry_list_golden):
    bigrams_industry = create_bigrams(industry_list)
    bigrams_golden = create_bigrams(industry_list_golden)

    tp = sum(1 for bigram in bigrams_industry if bigram in bigrams_golden)
    fp = sum(1 for bigram in bigrams_industry if bigram not in bigrams_golden)
    fn = sum(1 for bigram in bigrams_golden if bigram not in bigrams_industry)

    score = 5 * tp - fp - fn

    return score, tp, fp, fn

score, tp, fp, fn = compare_bigrams(industry_list, industry_list_golden)
print(f"Score: {score}, TP: {tp}, FP: {fp}, FN: {fn}")

Score: -93, TP: 17, FP: 137, FN: 41


In [None]:
def compare_bigrams(keywords_list, keywords_list_golden):
    bigrams_industry = create_bigrams(keywords_list)
    bigrams_golden = create_bigrams(keywords_list_golden)

    tp = sum(1 for bigram in bigrams_industry if bigram in bigrams_golden)
    fp = sum(1 for bigram in bigrams_industry if bigram not in bigrams_golden)
    fn = sum(1 for bigram in bigrams_golden if bigram not in bigrams_industry)

    score = 5 * tp - fp - fn

    return score, tp, fp, fn

score, tp, fp, fn = compare_bigrams(keyword_list, Keywords_list_golden)
print(f"Score: {score}, TP: {tp}, FP: {fp}, FN: {fn}")

Score: -591, TP: 87, FP: 841, FN: 185


Hypothesis 2 : Bert vs Finbert

In [None]:
%pip install chromadb
%pip install pydantic-settings




In [None]:
from pydantic_settings import BaseSettings
import chromadb
import pandas
import numpy

In [None]:
import datasets
from datasets import load_dataset
import pandas as pd
dataset = load_dataset("eloukas/edgar-corpus", "year_2020", split="train[:200]")
columns=['cik', 'section_1' ,'section_7']
# Convert to pandas DataFrame
df = pd.DataFrame(dataset)[columns]
df.shape

(5480, 3)

In [None]:
import datasets
from datasets import load_dataset
import pandas as pd
dataset = load_dataset("eloukas/edgar-corpus", "year_2020", split="train[:100]")
columns=['cik', 'section_1' ,'section_7']
# Convert to pandas DataFrame
df = pd.DataFrame(dataset)[columns]
df.shape

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(100, 3)

In [None]:
df.columns

Index(['cik', 'section_1', 'section_7'], dtype='object')

In [None]:
documents = df['section_1'].tolist()
companies = df['cik'].tolist()


In [None]:
df_cik = pd.read_csv('/content/drive/MyDrive/2Stanford/2nlp/project/cik_companies.csv')

# Create a dictionary mapping from CIK keys to company names
cik_to_name = pd.Series(df_cik['Company Name'].values, index=df_cik['Company CIK Key']).to_dict()

# Replace the CIK keys in your list with the corresponding company names
companies = [cik_to_name.get(int(cik), int(cik)) for cik in companies]


In [None]:
companies

['COMMUNITY BANCORP /VT',
 'TEGAL CORP /DE/',
 'MAXIMUM AWARDS INC',
 'PAYCHEX INC',
 'PRISTINE SOLUTIONS INC.',
 'PACIFIC GREEN TECHNOLOGIES INC.',
 'MCGRATH RENTCORP',
 'FORTERRA, INC.',
 'HOUSTON WIRE & CABLE CO',
 'AIRBNB, INC.',
 1711786,
 'WRAPMAIL, INC.',
 1816261,
 1802665,
 'JOHN BEAN TECHNOLOGIES CORP',
 'SHAKE SHACK INC.',
 'WELLS FARGO COMMERCIAL MORTGAGE TRUST 2015-LC22',
 'FIRST-MID ILLINOIS BANCSHARES INC',
 1740332,
 1820160,
 'POWERMEDCHAIRS',
 'EMR TECHNOLOGY SOLUTIONS, INC.',
 'INVESCO PLC/LONDON/',
 'NOODLES & CO',
 'FIRST REAL ESTATE INVESTMENT TRUST OF NEW JERSEY',
 'GREAT AJAX CORP.',
 'HANOVER INSURANCE GROUP, INC.',
 'REEL ESTATE SERVICES INC.',
 'ZIMMER HOLDINGS INC',
 'FIFTH STREET FINANCE CORP',
 1753233,
 'NAVIENT STUDENT LOAN TRUST 2014-5',
 'STRATTEC SECURITY CORP',
 1822309,
 1779474,
 'SINEXUS INC',
 'ISLAND RADIO, INC.',
 'ALEXANDRIA REAL ESTATE EQUITIES INC',
 1830316,
 'EBAY INC',
 'PLASTRON ACQUISITION CORP II',
 'CENTRAL ILLINOIS PUBLIC SERVICE CO'

In [None]:
%pip install transformers
%pip install torch



In [None]:
import pandas as pd
from transformers import BertModel, BertTokenizer
import torch

In [None]:
from chromadb.config import Settings
chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/2Stanford/2nlp/project/chromadb")

In [None]:
chroma_client.delete_collection(name="sec_fillings_Bert")

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
class BERTEmbeddingFunction:
    def __init__(self):
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Load pre-trained model (weights)
        self.model = BertModel.from_pretrained('bert-base-uncased')

    def __call__(self, input):
        # Tokenize input
        encoded_input = self.tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Predict hidden states features for each layer
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # You can access the embeddings via model_output.last_hidden_state
        embeddings = model_output.last_hidden_state

        # Average the token embeddings to get a single vector
        averaged_embeddings = torch.mean(embeddings, dim=1)

        # Convert tensor to numpy array and then to list
        return averaged_embeddings.numpy().tolist()

In [None]:
def split_into_chunks(text, chunk_size=1000):
    text=str(text)
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

chunks = []
metadatas = []
for document, company in zip(documents, companies):
    document_chunks = split_into_chunks(document)
    for i, chunk in enumerate(document_chunks):
        chunks.append(chunk)
        metadatas.append({'id': f"{company}_{i}", 'company': company})

In [None]:
bert_ef = BERTEmbeddingFunction()
sec_collection = chroma_client.get_or_create_collection(name="sec_fillings_Bert",embedding_function=bert_ef)

In [None]:
%%time
sec_collection.add(
    documents=chunks,
    metadatas=metadatas,
    ids=[metadata['id'] for metadata in metadatas])

NameError: name 'sec_collection' is not defined

In [None]:
class BERTQueryFunction:
    def __init__(self):
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Load pre-trained model (weights)
        self.model = BertModel.from_pretrained('bert-base-uncased')

    def __call__(self, query):
        # Tokenize input
        encoded_query = self.tokenizer(query, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Predict hidden states features for each layer
        with torch.no_grad():
            model_output = self.model(**encoded_query)

        # You can access the embeddings via model_output.last_hidden_state
        embeddings = model_output.last_hidden_state

        # Average the token embeddings to get a single vector
        averaged_embeddings = torch.mean(embeddings, dim=1)

        # Convert tensor to numpy array and then to list
        return averaged_embeddings.numpy().tolist()

In [None]:
# Instantiate the BERTQueryFunction
bert_qf = BERTQueryFunction()

# Use the BERTQueryFunction to convert the query into a BERT embedding
query_embedding = bert_qf("What are the companies associated with Next Generation Internet ?")

# Use the chroma_client to retrieve the most similar documents from the database
results = sec_collection.query(query_embedding, n_results=10)

results

{'ids': [['1386049_38',
   'REEL ESTATE SERVICES INC._38',
   '909494_26',
   'TUCOWS INC /PA/_26',
   '55772_22',
   'KIMBALL INTERNATIONAL INC_22',
   '1604778_2',
   'ROCKY HOLDING, INC._2',
   '1575659_29',
   'POWERMEDCHAIRS_29']],
 'distances': [[39.053680419921875,
   39.053680419921875,
   39.97312927246094,
   39.97312927246094,
   40.19363021850586,
   40.19363021850586,
   41.81492614746094,
   41.81492614746094,
   42.416343688964844,
   42.416343688964844]],
 'metadatas': [[{'company': '1386049', 'id': '1386049_38'},
   {'company': 'REEL ESTATE SERVICES INC.',
    'id': 'REEL ESTATE SERVICES INC._38'},
   {'company': '909494', 'id': '909494_26'},
   {'company': 'TUCOWS INC /PA/', 'id': 'TUCOWS INC /PA/_26'},
   {'company': '55772', 'id': '55772_22'},
   {'company': 'KIMBALL INTERNATIONAL INC',
    'id': 'KIMBALL INTERNATIONAL INC_22'},
   {'company': '1604778', 'id': '1604778_2'},
   {'company': 'ROCKY HOLDING, INC.', 'id': 'ROCKY HOLDING, INC._2'},
   {'company': '1575659

Finbert

In [None]:
from transformers import AutoTokenizer, AutoModel

class FinBERTEmbeddingFunction:
    def __init__(self):
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')

        # Load pre-trained model (weights)
        self.model = AutoModel.from_pretrained('ProsusAI/finbert')

    def __call__(self, input):
        # Tokenize input
        encoded_input = self.tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Predict hidden states features for each layer
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # You can access the embeddings via model_output.last_hidden_state
        embeddings = model_output.last_hidden_state

        # Average the token embeddings to get a single vector
        averaged_embeddings = torch.mean(embeddings, dim=1)

        # Convert tensor to numpy array and then to list
        return averaged_embeddings.numpy().tolist()


class FinBERTQueryFunction:
    def __init__(self):
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')

        # Load pre-trained model (weights)
        self.model = AutoModel.from_pretrained('ProsusAI/finbert')

    def __call__(self, query):
        # Tokenize input
        encoded_query = self.tokenizer(query, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Predict hidden states features for each layer
        with torch.no_grad():
            model_output = self.model(**encoded_query)

        # You can access the embeddings via model_output.last_hidden_state
        embeddings = model_output.last_hidden_state

        # Average the token embeddings to get a single vector
        averaged_embeddings = torch.mean(embeddings, dim=1)

        # Convert tensor to numpy array and then to list
        return averaged_embeddings.numpy().tolist()


In [None]:
# Instantiate the FinBERTEmbeddingFunction
finbert_ef = FinBERTEmbeddingFunction()

In [None]:
chroma_client.delete_collection(name="sec_fillings_finbert")

In [None]:
sec_collection_finbert = chroma_client.get_or_create_collection(name="sec_fillings_finbert",embedding_function=finbert_ef)

In [None]:
%%time
sec_collection_finbert.add(
    documents=chunks,
    metadatas=metadatas,
    ids=[metadata['id'] for metadata in metadatas])


CPU times: user 44min 25s, sys: 24min 3s, total: 1h 8min 28s
Wall time: 14min 14s


In [None]:
# Instantiate the FinBERTQueryFunction
finbert_qf = FinBERTQueryFunction()

# Use the FinBERTQueryFunction to convert the query into a FinBERT embedding
query_embedding = finbert_qf("What are the companies associated with Next Generation Internet ?")

# Use the chroma_client to retrieve the most similar documents from the database
results = sec_collection_finbert.query(query_embedding, n_results=10)

results

{'ids': [['NBH INC_116',
   '1708331_27',
   "LANDS' END, INC._3",
   'VIANET TECHNOLOGY GROUP LTD_67',
   '1825437_29',
   'FERRO CORP_0',
   'PRIME RESOURCE INC_63',
   'SIERRA CONCEPTS, INC._29',
   '1708331_25',
   'ZIMMER HOLDINGS INC_49']],
 'distances': [[72.51307082630936,
   73.04689025878906,
   75.30615234375,
   75.7044448852539,
   76.47148895263672,
   77.24732971191406,
   77.5732421875,
   77.61244201660156,
   77.82809448242188,
   78.21311950683594]],
 'metadatas': [[{'company': 'NBH INC', 'id': 'NBH INC_116'},
   {'company': 1708331, 'id': '1708331_27'},
   {'company': "LANDS' END, INC.", 'id': "LANDS' END, INC._3"},
   {'company': 'VIANET TECHNOLOGY GROUP LTD',
    'id': 'VIANET TECHNOLOGY GROUP LTD_67'},
   {'company': 1825437, 'id': '1825437_29'},
   {'company': 'FERRO CORP', 'id': 'FERRO CORP_0'},
   {'company': 'PRIME RESOURCE INC', 'id': 'PRIME RESOURCE INC_63'},
   {'company': 'SIERRA CONCEPTS, INC.', 'id': 'SIERRA CONCEPTS, INC._29'},
   {'company': 1708331, 