In [6]:
import requests
import os
import json
from dotenv import load_dotenv
from datetime import datetime
import time
import re

# Run python-dotenv's loader first so the environment lookup below happens after it.
load_dotenv()

# Endpoint queried for earnings-call transcripts.
BASE_URL = "https://api.api-ninjas.com/v1/earningstranscript"

# Credential sent with every request; None when the variable is not set.
API_KEY = os.environ.get("API_NINJA_API_KEY")

# Directory that receives one JSON file per ticker/year/quarter.
OUTPUT_DIR = "transcripts"

In [7]:
# Ticker symbols to retrieve transcripts for.
ticker_list = ["MSFT"]

# Cover the current calendar year and the one before it, newest first.
current_year = datetime.now().year
years = [current_year - offset for offset in range(2)]

# All four fiscal quarters are attempted for each year.
quarters = list(range(1, 5))

In [8]:
def fetch_transcript(ticker, year, quarter, api_key, timeout=30):
    """
    Fetch the earnings transcript for a given ticker, year, and quarter.

    The function is best-effort: every failure path prints a diagnostic and
    returns None instead of raising, so a batch loop can keep going.

    Parameters:
        ticker (str): Company ticker symbol (e.g., "MSFT").
        year (int): Earnings year (e.g., 2024).
        quarter (int): Earnings quarter (1, 2, 3, or 4).
        api_key (str): API key, sent in the "X-Api-Key" request header.
        timeout (float or tuple): Seconds to wait for the server before giving
            up; passed straight to requests.get. Defaults to 30.

    Returns:
        dict or None: The decoded JSON payload if the request succeeded and the
        payload is non-empty and carries no "error" key; otherwise, None.
    """
    params = {"ticker": ticker, "year": year, "quarter": quarter}
    headers = {"X-Api-Key": api_key}

    try:
        # Without an explicit timeout requests waits indefinitely on a stalled
        # connection, which would freeze the whole notebook run.
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"HTTP error {response.status_code} for {ticker} - {year} Q{quarter}")
            return None

        data = response.json()

        # Check if the API returned an error message (e.g., premium-only error)
        if isinstance(data, dict) and "error" in data:
            print(f"API error for {ticker} - {year} Q{quarter}: {data['error']}")
            return None

        # An empty payload ([], {} or null) means there is nothing to save.
        if not data:
            print(f"No transcript available for {ticker} - {year} Q{quarter}.")
            return None

        return data
    except Exception as e:
        # Deliberately broad: network failures, timeouts and undecodable bodies
        # are all reported and turned into None so the caller never has to
        # handle an exception from this function.
        print(f"Exception occurred for {ticker} - {year} Q{quarter}: {e}")
        return None

In [9]:
def parse_transcript(transcript_text):
    """
    Parse a transcript string into a list of speaker entries.

    Assumes the transcript follows the format where each speaker's turn is marked
    by a speaker name followed immediately by a colon (":") at the beginning of a line
    or the transcript, e.g.:

        Operator: Welcome to the call.
        Brett Iversen: Thank you for joining.
        Satya Nadella: Let's start with the numbers...

    Limitations of this heuristic:
        - Any line that contains a colon starts a new turn, with everything
          before the first colon on that line taken as the speaker name.
        - Text that appears before the first speaker marker is discarded.

    Parameters:
        transcript_text (str or None): The full transcript as one large string.

    Returns:
        list of dict: A list where each element is a dictionary with two keys:
            - 'speaker': The name of the speaker (whitespace-trimmed).
            - 'text': The text spoken by that speaker (whitespace-trimmed).
        An empty list is returned when transcript_text is None or empty.
    """
    # A missing (None) or empty transcript has no turns; returning early also
    # avoids the TypeError that re.finditer raises for None.
    if not transcript_text:
        return []

    # Only match speaker markers at the start of the string or right after a
    # newline, so colons later within a line of dialogue are not markers.
    pattern = r'(?:^|\n)(?P<speaker>[^:\n]+):'
    markers = list(re.finditer(pattern, transcript_text))

    # Each turn's dialogue runs from just after its colon up to the start of
    # the next marker; the final turn runs to the end of the text.
    turn_ends = [marker.start() for marker in markers[1:]] + [len(transcript_text)]

    return [
        {
            "speaker": marker.group('speaker').strip(),
            "text": transcript_text[marker.end():turn_end].strip(),
        }
        for marker, turn_end in zip(markers, turn_ends)
    ]

In [10]:
# Download, parse and cache one JSON file per ticker/year/quarter.
# A file that already exists is treated as a cache hit and is never re-fetched.

# Ensure the output directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Brief pause after each API request.
REQUEST_DELAY_SECONDS = 0.15

for ticker in ticker_list:
    for year in years:
        for quarter in quarters:
            # Build the filename using TICKER_YEAR_QQUARTER_transcript.json format.
            file_name = f"{ticker}_{year}_Q{quarter}_transcript.json"
            file_path = os.path.join(OUTPUT_DIR, file_name)

            # Check if transcript already exists (caching).
            if os.path.exists(file_path):
                print(f"Transcript already exists for {ticker} - {year} Q{quarter}. Skipping API call.")
                continue

            print(f"Processing {ticker} - {year} Q{quarter}...")
            transcript_data = fetch_transcript(ticker, year, quarter, API_KEY)

            if isinstance(transcript_data, dict) and transcript_data:
                # Keep the original text under "raw_transcript" and replace
                # "transcript" with the parsed speaker entries. A null or
                # missing transcript is treated as empty text.
                transcript_text = transcript_data.get("transcript") or ""
                transcript_data["raw_transcript"] = transcript_text
                transcript_data["transcript"] = parse_transcript(transcript_text)

                # Write to a temporary file and rename it into place. Because
                # the cache check above is a bare existence test, a write that
                # is interrupted part-way must not leave a truncated file under
                # the final name, or it would be skipped on every later run.
                tmp_path = file_path + ".tmp"
                with open(tmp_path, "w", encoding="utf-8") as f:
                    json.dump(transcript_data, f, indent=4)
                os.replace(tmp_path, file_path)
                print(f"Transcript saved: {file_path}")
            elif transcript_data:
                # A non-empty payload that is not a dict cannot be processed
                # with .get(); report it instead of crashing the whole run.
                print(f"Unexpected response format for {ticker} - {year} Q{quarter}; skipping.")

            time.sleep(REQUEST_DELAY_SECONDS)

print("Transcript retrieval complete.")

Transcript already exists for MSFT - 2025 Q1. Skipping API call.
Transcript already exists for MSFT - 2025 Q2. Skipping API call.
Processing MSFT - 2025 Q3...
No transcript available for MSFT - 2025 Q3.
Processing MSFT - 2025 Q4...
No transcript available for MSFT - 2025 Q4.
Transcript already exists for MSFT - 2024 Q1. Skipping API call.
Transcript already exists for MSFT - 2024 Q2. Skipping API call.
Transcript already exists for MSFT - 2024 Q3. Skipping API call.
Transcript already exists for MSFT - 2024 Q4. Skipping API call.
Transcript retrieval complete.
