In [1]:
import base64
import logging
import os
import pytz
import pymupdf
import re
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents import IndexDocumentsBatch
from datetime import datetime
from dotenv import load_dotenv
from openai import AzureOpenAI
from typing import Generator, Optional, Sequence

In [2]:
load_dotenv()
logging.basicConfig(level=logging.ERROR, format='%(levelname)s - %(message)s')

In [76]:
def pdf_to_markdown(pdf_path, text_path):
    pdf_document = pymupdf.open(pdf_path)
    text_content = ""

    # Iterate through each page
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        text = page.get_text(sort=True)

        # Convert text to Markdown format
        text_content += text + "\n\n"

    with open(text_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(text_content)

    print(f"Text content has been saved to {text_path}")

# Example usage
pdf_path = '../Data Scraping/Bloomberg/006400/20190430_Samsung_SDI_Co_Ltd-_Earnings_Call_2019-4-30_DN000000002630199272.pdf.pdf'
text_path = '../Data Scraping/Bloomberg/006400/20190430_Samsung_SDI_Co_Ltd-_Earnings_Call_2019-4-30_DN000000002630199272.txt'
pdf_to_markdown(pdf_path, text_path)


Text content has been saved to ../Data Scraping/Bloomberg/006400/20190430_Samsung_SDI_Co_Ltd-_Earnings_Call_2019-4-30_DN000000002630199272.txt


In [3]:
file_path = '../Data Scraping/Bloomberg/006400/20190430_Samsung_SDI_Co_Ltd-_Earnings_Call_2019-4-30_DN000000002630199272.txt'
with open(file_path, 'r') as file:
    file_content = file.read()
    
print(file_content)

FINAL TRANSCRIPT                                                                                                        2019-04-30
Samsung SDI Co Ltd (006400 KS Equity)

Q1 2019 Earnings Call

Company Participants
  Kyunghoon Kim
   Michael Son , Head, Electronic Material Strategic Marketing Team
   Unidentified Speaker
   Yoontae Kim , Vice President, Business Management Office
  Young No KWON , Executive Vice President
Other Participants
  Gang Ho Park, Analyst, Daishin Securities
   Ji-San Kim , Analyst, Kiwoom Securities
  Jung Hoon Chang , Analyst, Samsung Securities
  Sung Kyu Kim , Analyst, Daiwa Securities
  Woo-Hyung Cho , Analyst, HSBC Securities
Presentation
Yoontae Kim {BIO 17587061 <GO>}
Good afternoon. This is Yoontae Kim, Vice President of the Business Management Office at
Samsung SDI. Before we begin, I would like to introduce our management team attending
today's conference call. Our CFO, Young-No Kwon; Head of the Battery Strategic Marketing
Team, Michael Son; Head of

In [11]:
def extract_speakers(text: str) -> tuple[list[str], dict[str, str]]:
    # Define section headers
    company_headers = ['Company Participants', 'Executives', 'Corporate Participants']
    other_headers = ['Conference Call Participants', 'Analysts', 'Other Participants']
    
    # Find the start of company participants section
    company_start = None
    for header in company_headers:
        match = re.search(rf'{re.escape(header)}\n', text)
        if match:
            company_start = match.end()
            break
    
    if company_start is None:
        raise ValueError("Company participants section not found")
    
    # Find the end of company participants section (next header or end of text)
    other_headers_escaped = '|'.join([re.escape(h) for h in other_headers + ['Presentation', 'Questions And Answers']])
    next_section_pattern = rf'(?={other_headers_escaped})'
    next_section_match = re.search(next_section_pattern, text[company_start:])
    if next_section_match:
        company_end = company_start + next_section_match.start()
    else:
        company_end = len(text)
    
    company_text = text[company_start:company_end].strip()

    # Find the start of other participants section
    other_start = None
    for header in other_headers:
        match = re.search(rf'{re.escape(header)}\n', text)
        if match:
            other_start = match.end()
            break
    
    if other_start is None:
        raise ValueError("Other participants section not found")
    
    # Find the end of other participants section
    next_section_after_other = re.search(r'\n\n|Presentation|MANAGEMENT DISCUSSION SECTION', text[other_start:], re.IGNORECASE)
    if next_section_after_other:
        other_end = other_start + next_section_after_other.start()
    else:
        other_end = len(text)
    
    other_text = text[other_start:other_end].strip()

    # Extract names from both sections
    company_names = [line.split(",")[0].strip() for line in company_text.split("\n")]
    other_names = [line.split(",")[0].strip() for line in other_text.split("\n")]
    
    # Combine names and create position dictionary
    names = company_names + other_names
    position_dict = {name: 'Insider' for name in company_names}
    position_dict.update({name: 'Outsider' for name in other_names})
    
    return names, position_dict

In [None]:
def split_content_by_speakers(content: str, speakers: list[str]) -> list[tuple[str, str]]:
    # Create pattern to match speaker name followed by {BIO <number> <GO>} and all text until next speaker or end
    speaker_pattern = '|'.join(re.escape(speaker) for speaker in speakers)
    pattern = rf'(?:Q - |A - )?({speaker_pattern})(?:\s*{{BIO \d+ <GO>}})?\n((?:(?!\n(?:Q - |A - )?(?:{speaker_pattern})(?:\s*{{BIO \d+ <GO>}})?).)*)'


    result = []
    for match in re.finditer(pattern, content, re.DOTALL | re.MULTILINE):
        speaker = match.group(1).strip()
        text_chunk = match.group(2).strip()
        if not text_chunk or text_chunk == '':
            continue
        if speaker != 'Operator':
            result.append((speaker, text_chunk))
    
    return result

In [6]:
def extract_date(line: str) -> Optional[str]:
    date_pattern = r'(\d{4}-\d{2}-\d{2})'
    match = re.search(date_pattern, line)
    
    if match:
        date_str = match.group(1)
        date_of_transcript = datetime.strptime(date_str, '%Y-%m-%d')

        return date_of_transcript.strftime('%B %d, %Y')
    else:
        logging.error("No date of transcript is extracted")
        return None

In [7]:
def extract_prefix(line: str) -> str:
    date_pattern = r'(\w+ \d{1,2}, \d{4}) (\d{1,2}:\d{2} [APM]{2})'
    date_match = re.search(date_pattern, line)

    if date_match:
        date_str = date_match.group(1)
        date_format = "%B %d, %Y"
        date_obj = datetime.strptime(date_str, date_format)
        year = date_obj.year
        month = date_obj.month
        date_prefix = f"{year}{month:02d}"
    else:
        logging.error("No date found")

    if "Earning" in line or "Earnings" in line:
        quarter_pattern = r'Q(\d) (\d{4})'
        quarter_match = re.search(quarter_pattern, line)

        if quarter_match:
            quarter = quarter_match.group(1)
            year = quarter_match.group(2)
            quarter_prefix = f"{year}Q{quarter}"
        else:
            logging.error("No fiscal quarter found")
            return None
        
        return f"{date_prefix} - {quarter_prefix} - "
    
    return f"{date_prefix} - "

In [8]:
def extract_ticker(file_name) -> str:
    # Define a regex pattern to match the ticker inside parentheses
    ticker_pattern = r'\((.*?)\)'
    match = re.search(ticker_pattern, file_name)
    
    if match:
        ticker = match.group(1)
        return ticker
    else:
        logging.error("No stock ticker is extracted")
        return None

In [9]:
def extract_quarter(file_name) -> str:
    pattern = r'Q\d 2\d{3}'
    match = re.search(pattern, file_name)
    if match:
        return match.group()
    else:
        logging.error("No fiscal quarter is extracted")
        return None

In [10]:
client = AzureOpenAI(
  azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"],
  api_key = os.environ["AZURE_OPENAI_API_KEY"],
  api_version = os.environ["API_VERSION"],
)

def generate_embeddings(text, model="text-embedding-3-large"): 
    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [48]:
def preprocess_transcript(file_path):
    with open(file_path, 'r') as file:
        file_content = file.read()

    file_name = os.path.basename(file_path)

    speakers, position_dict = extract_speakers(file_content)
    speakers.append('Operator')
    print(speakers)

    chunks = split_content_by_speakers(file_content, speakers)
    print(chunks)

    date_of_transcript = extract_date(file_content)

    stock_ticker = extract_ticker(file_content)

    transcript_type = 'Earnings call' if 'Earnings' in file_name else 'Others'
    fiscal_quarter = extract_quarter(file_content) if transcript_type == 'Earnings call' else None

    processed_chunks = []
    for i, (speaker, text_chunk) in enumerate(chunks):
        lines = text_chunk.split('\n')
        content = ' '.join(line.strip() for line in lines)
        position = position_dict[speaker]
        chunk_number = f"Chunk {i}"
        processed_chunks.append({
            'id': str(i+1),
            'content': content,
            'speaker': speaker,
            'position': position,
            'date_of_transcript': date_of_transcript,
            'stock_ticker': stock_ticker,
            'transcript_type': transcript_type,
            'fiscal_quarter': fiscal_quarter,
            # 'transcript_name': f"{extract_prefix(first_line)}{file_name}",
            "chunk_number": chunk_number,
        })

    return processed_chunks


# Preprocess the transcript
processed_transcript = preprocess_transcript(file_path)

for chunk in processed_transcript:
    print(chunk)

['Kyunghoon Kim', 'Michael Son', 'Unidentified Speaker', 'Yoontae Kim', 'Young No KWON', 'Gang Ho Park', 'Ji-San Kim', 'Jung Hoon Chang', 'Sung Kyu Kim', 'Woo-Hyung Cho', 'Operator']
[('Kyunghoon Kim', 'Michael Son , Head, Electronic Material Strategic Marketing Team\n   Unidentified Speaker\n   Yoontae Kim , Vice President, Business Management Office\n  Young No KWON , Executive Vice President\nOther Participants\n  Gang Ho Park, Analyst, Daishin Securities\n   Ji-San Kim , Analyst, Kiwoom Securities\n  Jung Hoon Chang , Analyst, Samsung Securities\n  Sung Kyu Kim , Analyst, Daiwa Securities\n  Woo-Hyung Cho , Analyst, HSBC Securities\nPresentation'), ('Yoontae Kim', "Good afternoon. This is Yoontae Kim, Vice President of the Business Management Office at\nSamsung SDI. Before we begin, I would like to introduce our management team attending\ntoday's conference call. Our CFO, Young-No Kwon; Head of the Battery Strategic Marketing\nTeam, Michael Son; Head of the Electronic Materials Str

In [37]:
new_processed_transcript = preprocess_transcript("..\Data Scraping\AAPL\Earnings Call Transcripts\Apple Inc. (AAPL) CEO Tim Cook on Q1 2019 Results - Earnings Call Transcript.txt")

for chunk in new_processed_transcript:
    print(chunk)

{'id': '1', 'content': 'Conference Call Participants', 'speaker': 'Nancy Paxton', 'position': 'Insider', 'date_of_transcript': 'January 30, 2019 06:00 AM +08', 'stock_ticker': ' NASDAQ: AAPL ', 'transcript_type': 'Earnings call', 'fiscal_quarter': 'Q1 2019', 'chunk_number': 'Chunk 0'}
{'id': '2', 'content': "Thank you. Good afternoon and thanks to everyone for joining us. Speaking first today is Apple CEO, Tim Cook, and he'll be followed by CFO, Luca Maestri. After that, we'll open the call to questions from analysts. Please note that some of the information you'll hear during our discussion today will consist of forward-looking statements, including without limitation those regarding revenue, gross margin, operating expenses, other income and expense, taxes, capital allocation, and future business outlook. Actual results or trends could differ materially from our forecast. For more information, please refer to the Risk Factors discussed in Apple's most recently filed periodic reports 

In [None]:
service_endpoint  = os.environ["AZURE_AI_SEARCH_SERVICE_ENDPOINT"]
key = os.environ["AZURE_AI_SEARCH_KEY"]
index_name = "seeking_alpha"

search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Create a batch to upload documents
batch = IndexDocumentsBatch()
batch.add_upload_actions(processed_transcript)

# Upload the batch to Azure Search
result = search_client.index_documents(batch)

print(result)

In [None]:
results = search_client.search(search_text="battery replacement program", include_total_count=True)


for result in results:
    print(result)
