In [4]:
# Import the necessary libraries

# Model imports
from transformers import BertTokenizer, BertModel   #  FinBERT model and tokenizer from the Hugging Face Transformers library
import torch    # PyTorch library for gradient removal and tensor operations

# Text processing imports
from langchain_text_splitters import RecursiveCharacterTextSplitter # Recursive text splitter for splitting text into sentences

# Document imports
import pymupdf  # PyMuPDF document text extraction of PDFs
from langchain.docstore.document import Document    # Document class for storing document text and metadata
import uuid # UUID for generating unique document IDs

# Database loader and vector store imports
from astrapy.client import DataAPIClient
# Environment variable loader imports
import os   # OS library for environment variable access
from dotenv import load_dotenv  # Load environment variables from a .env file

# JSON library for parsing JSON strings
import json 

load_dotenv()

# Access environment variables from the dictionary
astra_db_endpoint = os.getenv('ASTRA_DB_ENDPOINT')
astra_db_token = os.getenv('ASTRA_DB_TOKEN')
astra_api_key = os.getenv('ASTRA_API_KEY')
astra_db_id = os.getenv('ASTRA_DB_ID')
keyspace = os.getenv('ASTRA_DB_KEYSPACE')
collection_name = os.getenv('ASTRA_DB_COLLECTION_NAME')

# Print the environment variables to verify they are loaded correctly
print("Astra DB API Endpoint:", astra_db_endpoint)
print("Astra DB Application Token:", astra_db_token)
print("Astra API Key:", astra_api_key)
print("Astra DB ID:", astra_db_id)
print("Keyspace:", keyspace)
print("Collection Name:", collection_name)

Astra DB API Endpoint: https://814d766c-e613-47da-84c6-8edc49bd3afa-us-east-2.apps.astra.datastax.com
Astra DB Application Token: AstraCS:YkipjdaXgBLnnrSMnabpmTdF:357a56fd9232b6cd55771dc806c4065a0d4e5e92e4765749506728fbf8282520
Astra API Key: None
Astra DB ID: None
Keyspace: financial_data
Collection Name: reddit_earnings_call_transcripts


In [12]:
# Step 1: Connect to Astra DB and initialize the FinBERT model and tokenizer

#initialize the DataAPIClient with the application token
client = DataAPIClient(astra_db_token)

# Initialize the database using the API endpoint
db=client.get_database_by_api_endpoint(
    api_endpoint=astra_db_endpoint,
    keyspace=keyspace,
)
print(f"Connected to Astra DB: {db.list_collection_names()}")

tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
model = BertModel.from_pretrained("ProsusAI/finbert")
print(model)

pdf_path = "/Users/vivakepandey/Python Projects/financial_analyst/data/raw_data/reddit_earnings_call_transcript.pdf"
print(f"Extracting text from PDF file: {pdf_path}")

# Function to generate embeddings using FinBERT
def generate_embedding(text):
    """
    Generates an embedding for the given text using a pre-trained transformer model.
    Args:
        text (str): The input text to be embedded.
    Returns:
        numpy.ndarray: The embedding of the input text as a numpy array.
    """
    
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
              
    # Get the model's outputs (hidden states)
    with torch.no_grad():   # Disable gradient calculation for faster processing
        outputs = model(**inputs)
    
    # Access generated embedding from the [CLS] token (last hidden state of the first token of the sequence)
    embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()    
    return embedding

Connected to Astra DB: ['reddit_earnings_call_transcripts']
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [45]:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_lg")
doc = pymupdf.open(pdf_path)  # Open the PDF document and load it into a doc
text = ""
entities = []

# Extract text and ner from each page 
for i, page in enumerate(doc):
    page_text = page.get_text()
    text += page_text
    doc_nlp = nlp(page_text)
    
    # Create a DataFrame from the extracted entities
    df_entities = pd.DataFrame(entities, columns=['Label', 'Entity'])
    df_entities['Page Text'] = page_text
    print(df_entities)
    

Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty DataFrame
Columns: [Label, Entity, Page Text]
Index: []
Empty Da

In [34]:
# Step 2: Function to extract text and metadata from PDF using PyMuPDF
def extract_text_and_metadata_from_pdf(pdf_path):
    """
    Extracts text and metadata from a PDF file.
    Args:
        pdf_path (str): The file path to the PDF document.
    Returns:
        tuple: A tuple containing:
            - text (str): The extracted text from the PDF.
            - metadata (dict): The metadata of the PDF, which may include information such as:
                - title (str): The title of the document.
                - author (str): The author of the document.
                - subject (str): The subject of the document.
                - keywords (str): Keywords associated with the document.
                - creator (str): The software used to create the document.
                - producer (str): The software used to produce the document.
                - creationDate (str): The date the document was created.
                - modDate (str): The date the document was last modified.
    # doc: A list of PyMuPDF Document object representing the PDF file.
    #      It contains methods and attributes to interact with the PDF, such as:
    #      - get_text(): Extracts text from a page.
    #      - metadata: A dictionary containing the PDF's metadata.
    """
  
    doc = pymupdf.open(pdf_path)  # Open the PDF document and load it into a doc
    
    text = ""
    
    # Extract text from each page
    for page in doc:
        text += page.get_text()
    print(text)
    # # Extract metadata from the PDF
    # metadata = doc.metadata
    
    # print(f"metadata: {metadata},")
    # print(f"text: {text}")
    
    # return text, metadata   # Return the extracted text and metadata as a tuple


In [125]:
# Step 3: Initialize the RecursiveCharacterTextSplitter
def get_recursive_text_splitter(chunk_size=500):
    """
    This function creates a RecursiveCharacterTextSplitter with a specified chunk size, 
    chunk overlap, and length function. The splitter is used to divide text into smaller 
    chunks while preserving context by overlapping chunks.
    Parameters:
    chunk_size (int): The maximum size of each chunk in tokens. Default is 500.
    Returns:
    RecursiveCharacterTextSplitter: An initialized RecursiveCharacterTextSplitter object.
    """    
    # Initialize the RecursiveTextSplitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Max chunk size in tokens
        chunk_overlap=50,       # Overlap between chunks to preserve context
        length_function=len     # Length function to determine chunk size
    )
    return splitter # Return the initialized RecursiveTextSplitter object

In [126]:
# Step 4: Use RecursiveCharacterTextSplitter to chunk the text
def chunk_text(text, splitter):
    """
Chunks the given text using the specified splitter.
This function takes a text string and a splitter object, then uses the splitter's
split_text method to divide the text into smaller chunks.
Args:
    text (str): The text to be chunked.
    splitter (object): An object with a split_text method that handles the chunking.
Returns:
    list: A list of text chunks.
"""
   # Chunk the text using the splitter
    chunks = splitter.split_text(text)  
    return chunks  # Return a list of text chunks 

In [None]:
# Step 5: Create a function to prepare the documents with id, text, embeddings and, metadata
def create_astra_db_document(text, metadata):
    """
    Prepare a JSON-serializable document for insertion into an Astra database collection. 
    Args:
        text (str): The text content of the document.
        metadata (dict): The metadata associated with the document.
    Returns:
        dict: A dictionary representation of the document containing the text, metadata, and embedding.
    """
    
    # Generate the embedding for the document chunk
    embedding = generate_embedding(text)
    
    # Create the document dictionary with Astra DB required fields (Id, text, metadata, and embedding)
    doc = {
        "id": str(uuid.uuid4()),  # Generate a unique ID for the document
        "text": text,
        "metadata": metadata,
        "vector": embedding.tolist()  # Convert the embedding to a list
    }
    return doc  # Return the dictionary representation of the document

In [128]:
# Function to split the documents into batches of 20
def batch(documents, batch_size):
    for i in range(0, len(documents), batch_size):
        yield documents[i : i + batch_size]

In [None]:
# Step 6: Define the Astra DB insertion function
# Function to insert documents into Astra DB in batches
def insert_documents_into_astra_db(documents):
    """
    Inserts a list of documents (metadata, text, embedding) into an Astra DB collection in batches.
    Args:
        documents (list): A list of document dictionaries to be inserted into the Astra DB.
    """
    # Define the collection in the database
    collection = db.get_collection(collection_name)
    
    # Insert the documents in batches
    for batch_documents in batch(documents, 50):
        # Insert the batch of documents into the collection
        res = collection.insert_many(batch_documents)
        print(f"Inserted batch with response: {res}")

In [131]:
# Main Workflow

# Step 7: Process the PDF, extract metadata, and insert documents into Astra DB collection
def process_pdf_to_astra(pdf_path):
    # Extract text and metadata from the PDF
    pdf_text, pdf_metadata = extract_text_and_metadata_from_pdf(pdf_path)
    print("Extracted PDF Metadata:", pdf_metadata) # Print the extracted metadata to indicate successful extraction
    
    # Initialize RecursiveTextSplitter
    splitter = get_recursive_text_splitter(chunk_size=500)
    print("RecursiveTextSplitter Initialized") # Print a message to indicate initialization
    
    # Use the splitter to chunk the extracted text
    chunks = chunk_text(pdf_text, splitter)
    print(f"Chunked PDF into {len(chunks)} text chunks.") # Print the number of chunks
    
    # Enhance metadata with additional info (you could add more info based on use case)
    metadata = {
        "source": pdf_path,
        "title": pdf_metadata.get('title', 'Unknown Title'),
        "author": pdf_metadata.get('author', 'Unknown Author'),
        "subject": pdf_metadata.get('subject', 'Unknown Subject'),
        "created": pdf_metadata.get('created', 'Unknown Date'),
        "modified": pdf_metadata.get('modified', 'Unknown Date'),
    }
    
    # Create Document objects with text chunks and metadata
    documents = []
    for chunk in chunks:    # loop to create a Document object for each chunk
        doc = create_astra_db_document(chunk, metadata)
        documents.append(doc)  
    print(f"Generated {len(documents)} Document objects.") # Print the number of Document objects generated
          
    # Insert documents into Astra DB
    insert_documents_into_astra_db(documents)
    
    # Print or log the number of documents inserted (optional)
    print(f"Processed PDF '{pdf_path}'. Inserted {len(documents)} documents into Astra DB.")

In [132]:
# Example usage:
pdf_path = "/Users/vivakepandey/Python Projects/financial_analyst/data/raw_data/reddit_earnings_call_transcript.pdf"
process_pdf_to_astra(pdf_path)

metadata: {'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Factset Research Systems, Inc.', 'producer': 'PDFlib+PDI 9.0.6 (Perl 5.22.0/Linux-x86_64)', 'creationDate': "D:20241017120220-04'00'", 'modDate': '', 'trapped': '', 'encryption': None},
text: Reddit, Inc.(RDDT-US) Schedules Q3 2024 Earnings Release for 29-October-2024 After Market Hours ET
Thursday, October 03, 2024 09:56:10 PM (GMT)
Reddit, Inc.(RDDT-US) Schedules Q3 2024 Earnings Release for 29-October-2024 After Market Hours ET.
Reddit, Inc.
303 2nd Street, South Tower
5th floor
San Francisco, CA 94107 US
http://www.redditinc.com
Investor Relations Contact(s):
Jesse Rose
Copyright 2024 FactSet Research Systems, Inc. All Rights Reserved.
Event Type: Earnings Release
Industries: Internet
Primary Identifiers: RDDT-US
Regions: US
Related Identifiers: RDDT-US
Reddit, Inc.(RDDT-US) Schedules Q3 2024 Earnings Call for 29-October-2024 5:00 PM ET
Thursday, October 03, 2024 09:56:10 PM (GMT)


In [133]:
# import fitz  # PyMuPDF for PDF extraction
# from transformers import BertTokenizer, BertModel
# from langchain.vectorstores import AstraDB
# from langchain.docstore.document import Document
# from langchain.text_splitter import RecursiveTextSplitter
# import torch
# import uuid

# # Step 1: Load the FinBERT model
# tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert")
# model = BertModel.from_pretrained("yiyanghkust/finbert")

# # Function to generate embeddings using FinBERT
# def generate_embedding(text):
#     # Tokenize the input text
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
#     # Get the model's outputs (hidden states)
#     with torch.no_grad():
#         outputs = model(**inputs)
    
#     # Use the [CLS] token's embedding (last hidden state)
#     embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
#     return embedding

# # Step 2: Function to extract text and metadata from PDF using PyMuPDF
# def extract_text_and_metadata_from_pdf(pdf_path):
#     doc = fitz.open(pdf_path)
#     text = ""
    
#     # Extract text from each page
#     for page in doc:
#         text += page.get_text()
    
#     # Extract metadata from the PDF
#     metadata = doc.metadata
#     return text, metadata

# # Step 3: Initialize the RecursiveTextSplitter
# def get_recursive_text_splitter(chunk_size=500):
#     # Initialize the RecursiveTextSplitter
#     splitter = RecursiveTextSplitter(
#         chunk_size=chunk_size,  # Max chunk size in tokens
#         chunk_overlap=50,       # Overlap between chunks to preserve context
#         length_function=len     # Length function to determine chunk size
#     )
#     return splitter

# # Step 4: Use RecursiveTextSplitter to chunk the text
# def chunk_text(text, splitter):
#     chunks = splitter.split_text(text)
#     return chunks

# # Step 5: Create a function to prepare the documents with embeddings and metadata
# def create_document(text, metadata):
#     # Generate the embedding for the document chunk
#     embedding = generate_embedding(text)
    
#     # Create the Document object with the text, metadata, and embedding
#     doc = Document(
#         page_content=text,
#         metadata=metadata
#     )
    
#     # Add the embedding to the metadata
#     doc.metadata["$vector"] = embedding  # Astra expects this field for vectors
    
#     return doc

# # Step 6: Define the Astra DB insertion function
# def insert_documents_into_astra_db(documents):
#     # Initialize AstraDB vector store (assuming you have your Astra DB connection details)
#     astra_client = AstraDB(
#         api_key="your_api_key", 
#         database_id="your_database_id", 
#         keyspace="your_keyspace", 
#         collection_name="your_collection_name"
#     )

#     # Insert the documents into Astra DB
#     astra_client.add_documents(documents)
#     print(f"Successfully inserted {len(documents)} documents into Astra DB.")

# # Main Workflow

# # Step 7: Process the PDF, extract metadata, and insert documents into Astra DB
# def process_pdf_to_astra(pdf_path):
#     # Extract text and metadata from the PDF
#     pdf_text, pdf_metadata = extract_text_and_metadata_from_pdf(pdf_path)
    
#     # Print or log the extracted metadata (optional)
#     print("Extracted PDF Metadata:", pdf_metadata)
    
#     # Initialize RecursiveTextSplitter
#     splitter = get_recursive_text_splitter(chunk_size=500)
    
#     # Use the splitter to chunk the extracted text
#     chunks = chunk_text(pdf_text, splitter)
    
#     # Enhance metadata with additional info (you could add more info based on use case)
#     metadata = {
#         "source": pdf_path,
#         "title": pdf_metadata.get('title', 'Unknown Title'),
#         "author": pdf_metadata.get('author', 'Unknown Author'),
#         "subject": pdf_metadata.get('subject', 'Unknown Subject'),
#         "created": pdf_metadata.get('created', 'Unknown Date'),
#         "modified": pdf_metadata.get('modified', 'Unknown Date'),
#         "document_id": str(uuid.uuid4())  # Generate a unique document ID
#     }
    
#     # Create Document objects with text chunks and metadata
#     documents = []
#     for chunk in chunks:
#         doc = create_document(chunk, metadata)
#         documents.append(doc)
    
#     # Insert documents into Astra DB
#     insert_documents_into_astra_db(documents)

# # Example usage:
# process_pdf_to_astra("your_earnings_call_transcript.pdf")
