In [None]:
# First, you need to install the library for the Gemini API.
# Open your terminal or command prompt and run:
# pip install google-generativeai

import google.generativeai as genai
import os # Import the 'os' library to access environment variables
import time # Import the 'time' library to add delays

# --- Configuration ---
# Load the API key from an environment variable for better security.
API_KEY = os.getenv("GEMINI_API_KEY")

# An unset (None) or empty key is treated the same way: nothing below can work
# without it, so stop right away.
if not API_KEY:
    print("Error: GEMINI_API_KEY environment variable not set.")
    print("Please set the environment variable before running the script.")
    # `exit()` is a helper installed by the `site` module for interactive use and
    # reports status 0; raising SystemExit works everywhere and signals failure.
    raise SystemExit(1)

# Register the key with the client library; any failure here is fatal as well.
try:
    genai.configure(api_key=API_KEY)
except Exception as e:
    print(f"Error configuring API. Please ensure your API key is valid. Error: {e}")
    raise SystemExit(1)

# --- UPGRADED: A More Sophisticated Prompt with a Wider Variety of Examples ---
# Few-shot classification prompt. It defines five labels (RISK_TITLE,
# RISK_EXPLANATION, FINANCIAL_SUMMARY, BOILERPLATE, OTHER), gives labelled examples,
# and ends with a `{text_to_classify}` placeholder that classify_chunk() fills in
# through str.format(). Because of that, any literal brace added to this text must
# be doubled ("{{" / "}}").
prompt = """
You are an expert financial analyst. Your task is to classify a chunk of text from a company's filing into one of the following categories:
- RISK_TITLE: A short, distinct heading for a specific forward-looking risk.
- RISK_EXPLANATION: The paragraph(s) that explain the risk mentioned in the title.
- FINANCIAL_SUMMARY: A backward-looking discussion of financial performance, revenues, costs, or operations.
- BOILERPLATE: Generic, non-informative text, often legal disclaimers.
- OTHER: Any text that does not fit the above categories, such as a table of contents line or a page number.

Here are some high-quality, diverse examples:

Text: 'WE HAVE GENERATED SIGNIFICANT LOSSES IN RECENT PERIODS.'
Label: RISK_TITLE

Text: 'We incurred significant net losses during the three years prior to 2004.'
Label: RISK_EXPLANATION

Text: 'Our Acquisitions May Not Perform as Anticipated'
Label: RISK_TITLE

Text: 'We have completed many acquisitions of self-storage facilities since our initial public offering of common stock in June 1995.'
Label: RISK_EXPLANATION

Text: 'RESTRICTIVE COVENANTS IN OUR DEBT INSTRUMENTS COULD LIMIT OUR FINANCIAL AND OPERATING FLEXIBILITY AND SUBJECT US TO OTHER RISKS.'
Label: RISK_TITLE

Text: 'The agreements governing our indebtedness include certain covenants that restrict, among other things, our ability to: - incur additional debt; - pay dividends on our common stock in excess of 10 million per year;'
Label: RISK_EXPLANATION

Text: 'Results of Operations'
Label: FINANCIAL_SUMMARY

Text: 'Net income for the year ended December 31, 2004 was 62.1 million compared to 49.1 million for year ended December 31, 2003.'
Label: FINANCIAL_SUMMARY

Text: 'You should carefully consider the risks described below, together with all of the other information included in or incorporated by reference into our Form 10-K, as part of your evaluation of the Company.'
Label: BOILERPLATE

Text: 'Item 1A. Risk Factors'
Label: OTHER

---
Now, classify the following text. Respond with only ONE of the labels: RISK_TITLE, RISK_EXPLANATION, FINANCIAL_SUMMARY, BOILERPLATE, or OTHER.

Text: '{text_to_classify}'
Label: 
"""

def classify_chunk(text_chunk):
    """
    Ask Gemini for the category label of a single piece of filing text.

    The module-level ``prompt`` template is filled with ``text_chunk`` and sent
    to the model in one request.

    Args:
        text_chunk (str): The piece of text to classify.

    Returns:
        str: The model's reply with surrounding whitespace removed, or the
        sentinel "API_ERROR" when the request (or reading its text) raises.
    """
    # --- MODEL CHANGE: Using a faster model with a higher rate limit ---
    gemini = genai.GenerativeModel('gemini-2.5-flash')

    try:
        reply = gemini.generate_content(
            prompt.format(text_to_classify=text_chunk)
        )
        predicted = reply.text.strip()
    except Exception as err:
        # Best effort: report the problem and hand back a sentinel instead of raising.
        print(f"An error occurred during the API call: {err}")
        return "API_ERROR"

    return predicted

# --- Example Usage ---
# Smoke test: classify a handful of hand-picked snippets and print each verdict.
if __name__ == "__main__":

    # The trailing comment on every sample is the label we expect back.
    test_chunks = [
        "OUR INVENTORIES ARE NOT MANAGED BY PERPETUAL INVENTORY CONTROL SYSTEMS.", # Should be RISK_TITLE
        "The systems and processes we use to manage and value our inventories require significant manual intervention.", # Should be RISK_EXPLANATION
        "Gross profit of 210 million in 2004 represented a 42 million, or 25 , increase from gross profit of 168 million in 2003.", # Should be FINANCIAL_SUMMARY
        "In addition to the other information in this Form 10-K, the following factors should be considered in evaluating our company and our business.", # Should be BOILERPLATE
        "Supplier Risks", # Should be RISK_TITLE
        "Our success is dependent on the performance of our vendors and service providers.", # Should be RISK_TITLE
    ]

    print("--- Running AI Multi-Class Classification on Test Chunks ---")
    for chunk in test_chunks:
        label = classify_chunk(chunk)
        # One print call emits both report lines (blank line, text, then label).
        print(f"\nText:  '{chunk}'\nAI Label: -> {label} <-")

        # --- BEST PRACTICE: pause between requests to respect API rate limits ---
        # One second per request caps the loop at 60 requests per minute.
        time.sleep(1)



--- Running AI Multi-Class Classification on Test Chunks ---

Text:  'OUR INVENTORIES ARE NOT MANAGED BY PERPETUAL INVENTORY CONTROL SYSTEMS.'
AI Label: -> RISK_TITLE <-

Text:  'The systems and processes we use to manage and value our inventories require significant manual intervention.'
AI Label: -> RISK_EXPLANATION <-

Text:  'Gross profit of 210 million in 2004 represented a 42 million, or 25 , increase from gross profit of 168 million in 2003.'
AI Label: -> FINANCIAL_SUMMARY <-

Text:  'In addition to the other information in this Form 10-K, the following factors should be considered in evaluating our company and our business.'
AI Label: -> BOILERPLATE <-

Text:  'Supplier Risks'
AI Label: -> RISK_TITLE <-

Text:  'Our success is dependent on the performance of our vendors and service providers.'
AI Label: -> RISK_TITLE <-


In [2]:
# This script combines file loading, AI classification, and intelligent structuring.
import google.generativeai as genai
import os
import time
import glob
import pickle
import pandas as pd
import re
import json

# --- Configuration & API Setup ---
# The key comes from the environment so it never has to be written into the notebook.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    print("Error: GEMINI_API_KEY environment variable not set.")
    # `exit()` is the interactive helper from the `site` module and reports status 0;
    # raising SystemExit works everywhere and signals failure.
    raise SystemExit(1)
try:
    genai.configure(api_key=API_KEY)
except Exception as e:
    print(f"Error configuring API: {e}")
    raise SystemExit(1)

# --- UPGRADED: AI Prompt with instructions for handling lists ---
# Few-shot classification prompt. The `{text_to_classify}` placeholder near the end
# is filled in through str.format(), so any literal brace added to this text must be
# doubled ("{{" / "}}").
prompt = """
You are an expert financial analyst. Your task is to classify a chunk of text from a company's filing into one of the following categories:
- RISK_TITLE: A short, distinct heading for a specific forward-looking risk.
- RISK_EXPLANATION: The paragraph(s) that explain the risk mentioned in the title. This INCLUDES bullet points or lists that follow an introductory sentence.
- FINANCIAL_SUMMARY: A backward-looking discussion of financial performance, revenues, costs, or operations.
- BOILERPLATE: Generic, non-informative text, often legal disclaimers.
- OTHER: Any text that does not fit the above categories, such as a table of contents line or a page number.

Here are some high-quality, diverse examples:

Text: 'WE HAVE GENERATED SIGNIFICANT LOSSES IN RECENT PERIODS.'
Label: RISK_TITLE

Text: 'We incurred significant net losses during the three years prior to 2004.'
Label: RISK_EXPLANATION

Text: 'Our Acquisitions May Not Perform as Anticipated'
Label: RISK_TITLE

Text: 'We have completed many acquisitions of self-storage facilities since our initial public offering of common stock in June 1995.'
Label: RISK_EXPLANATION

Text: 'The agreements governing our indebtedness include certain covenants that restrict, among other things, our ability to:'
Label: RISK_EXPLANATION

Text: '- incur additional debt; - pay dividends on our common stock in excess of 10 million per year; - repurchase our common stock;'
Label: RISK_EXPLANATION

Text: 'Results of Operations'
Label: FINANCIAL_SUMMARY

Text: 'Net income for the year ended December 31, 2004 was 62.1 million compared to 49.1 million for year ended December 31, 2003.'
Label: FINANCIAL_SUMMARY

Text: 'You should carefully consider the risks described below, together with all of the other information included in or incorporated by reference into our Form 10-K, as part of your evaluation of the Company.'
Label: BOILERPLATE

Text: 'Item 1A. Risk Factors'
Label: OTHER
---
Now, classify the following text. Respond with only ONE of the labels.
Text: '{text_to_classify}'
Label: 
"""

def classify_chunk_with_retry(text_chunk, max_retries=3, delay=5, throttle=2):
    """Classify a text chunk with the Gemini API, retrying with exponential backoff.

    Args:
        text_chunk (str): The piece of text to classify.
        max_retries (int): Maximum number of requests to attempt.
        delay (float): Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.
        throttle (float): Seconds to pause after a successful request to help
            stay under the per-minute rate limit.

    Returns:
        str: The model's reply with surrounding whitespace removed, or the
        sentinel "API_FAILURE" when every attempt raised.
    """
    model = genai.GenerativeModel('gemini-2.5-flash')
    # The prompt does not change between attempts, so build it once.
    formatted_prompt = prompt.format(text_to_classify=text_chunk)

    for attempt in range(max_retries):
        try:
            response = model.generate_content(formatted_prompt)
            label = response.text.strip()
        except Exception as e:
            if attempt + 1 >= max_retries:
                # Nothing left to retry, so sleeping here would only waste time.
                print(f"  > API Error: {e}. Giving up after {max_retries} attempts.")
                break
            # Back off progressively: a fixed pause is often too short for a
            # per-minute quota window to reset.
            wait = delay * (2 ** attempt)
            print(f"  > API Error: {e}. Retrying in {wait}s... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(wait)
        else:
            # A small delay to help stay under the rate limit
            time.sleep(throttle)
            return label
    return "API_FAILURE"

def _split_into_chunks(raw_text):
    """Split text on blank lines into whitespace-normalized paragraphs, dropping digit-only ones."""
    paragraphs = re.split(r'\n\s*\n', raw_text.strip())
    return [' '.join(p.split()) for p in paragraphs if p.strip() and not p.strip().isdigit()]


def _group_risks(labeled_chunks):
    """Fuse each RISK_EXPLANATION chunk onto the most recent RISK_TITLE chunk."""
    risks = []
    current = None
    for item in labeled_chunks:
        label = item['label']
        text = item['text']
        if label == 'RISK_TITLE':
            # A new title closes the previous risk and opens the next one.
            current = {'title': text, 'explanation': []}
            risks.append(current)
        elif label == 'RISK_EXPLANATION':
            if current is None:
                # Explanation seen before any title: collect it under a placeholder.
                current = {'title': 'Uncategorized', 'explanation': []}
                risks.append(current)
            current['explanation'].append(text)
        # Every other label (including API_FAILURE) is left out of the output.

    # Join the list of explanation paragraphs into a single string for each risk
    for risk in risks:
        risk['explanation'] = ' '.join(risk['explanation'])
    return risks


def process_filing_to_json(filing_series, output_filename="structured_risk_factors.json"):
    """
    Classify one filing's risk-factor text with the AI and save it as structured JSON.

    Args:
        filing_series: A single filing (a pandas Series or any mapping with ``.get``).
            The ``'rf'`` entry is read as the raw text; ``'cik'`` and ``'date'``
            are copied into the output as strings.
        output_filename (str): Path of the JSON file to write.

    Returns:
        None. When ``'rf'`` is missing, empty, or not a string, a message is
        printed and nothing is written.
    """
    raw_text = filing_series.get('rf', '')
    if not raw_text or not isinstance(raw_text, str):
        print("No valid risk factor text found in this filing.")
        return

    # 1. Split the raw text into processable chunks (paragraphs)
    cleaned_chunks = _split_into_chunks(raw_text)
    total = len(cleaned_chunks)

    # 2. Get AI-powered labels for each chunk
    print(f"\nClassifying {total} text chunks with Gemini API...")
    labeled_chunks = []
    for position, chunk in enumerate(cleaned_chunks, start=1):
        label = classify_chunk_with_retry(chunk)
        print(f"  Chunk {position}/{total} classified as: {label}")
        labeled_chunks.append({'text': chunk, 'label': label})

    # Chunks that could not be classified would otherwise vanish without a trace.
    failed = sum(1 for item in labeled_chunks if item['label'] == 'API_FAILURE')
    if failed:
        print(f"\nWarning: {failed} of {total} chunks could not be classified; the output will be incomplete.")

    # 3. Structure the data by "fusing" explanations to their titles
    print("\nStructuring the labeled data...")
    structured_risks = _group_risks(labeled_chunks)

    # 4. Save the final structured data to a JSON file
    final_output = {
        'cik': str(filing_series.get('cik', 'N/A')),
        'date': str(filing_series.get('date', 'N/A')),
        'structured_risks': structured_risks
    }

    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, indent=4)

    print(f"\nSuccess! Structured risk data has been saved to '{output_filename}'")
    print("You can now open this JSON file to see the clean, machine-readable output.")

# --- Main Execution Block ---
# Demo run: load one pickle file, pick the first filing that has usable
# risk-factor text, and convert it to structured JSON.
if __name__ == "__main__":
    text_data_path = r'C:\_Files\Personal\Projects\FIAM\FIAM2025\data\text_data'

    # glob returns matches in arbitrary order; sort so that "the first pickle
    # file" is the same file on every run.
    pickle_files = sorted(glob.glob(os.path.join(text_data_path, '**', '*.pkl'), recursive=True))
    if not pickle_files:
        print("No pickle files found.")
        # `exit()` is the interactive helper from the `site` module; SystemExit
        # works everywhere and reports a failure status.
        raise SystemExit(1)

    print(f"Loading data from: {os.path.basename(pickle_files[0])}")
    # NOTE(security): unpickling can execute arbitrary code. Only load pickle
    # files that come from a trusted source.
    with open(pickle_files[0], 'rb') as f:
        filings_df = pickle.load(f)

    # Find the first valid filing to process. The test mirrors the string check
    # inside process_filing_to_json, so a row selected here is never rejected there.
    valid_filing = None
    for _, row in filings_df.iterrows():
        rf_text = row.get('rf')
        if isinstance(rf_text, str) and len(rf_text.strip()) > 100:
            valid_filing = row
            break

    if valid_filing is not None:
        process_filing_to_json(valid_filing)
    else:
        print("Could not find a valid filing with risk factors in the first pickle file.")



Loading data from: text_us_2005.pkl


  filings_df = pickle.load(f)



Classifying 57 text chunks with Gemini API...
  Chunk 1/57 classified as: OTHER
  Chunk 2/57 classified as: BOILERPLATE
  Chunk 3/57 classified as: OTHER
  Chunk 4/57 classified as: RISK_TITLE
  Chunk 5/57 classified as: RISK_EXPLANATION
  Chunk 6/57 classified as: RISK_TITLE
  Chunk 7/57 classified as: RISK_EXPLANATION
  Chunk 8/57 classified as: RISK_TITLE
  Chunk 9/57 classified as: RISK_EXPLANATION
  Chunk 10/57 classified as: RISK_TITLE
  Chunk 11/57 classified as: RISK_EXPLANATION
  > API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_v

In [None]:
# This script combines file loading, AI classification, and intelligent structuring.
import google.generativeai as genai
import os
import time
import glob
import pickle
import pandas as pd
import re
import json

# --- Configuration & API Setup ---
# The key comes from the environment so it never has to be written into the notebook.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    print("Error: GEMINI_API_KEY environment variable not set.")
    # `exit()` is the interactive helper from the `site` module and reports status 0;
    # raising SystemExit works everywhere and signals failure.
    raise SystemExit(1)
try:
    genai.configure(api_key=API_KEY)
except Exception as e:
    print(f"Error configuring API: {e}")
    raise SystemExit(1)

# --- UPGRADED: AI Prompt with instructions for handling lists ---
# Few-shot classification prompt. The `{text_to_classify}` placeholder near the end
# is filled in through str.format(), so any literal brace added to this text must be
# doubled ("{{" / "}}").
prompt = """
You are an expert financial analyst. Your task is to classify a chunk of text from a company's filing into one of the following categories:
- RISK_TITLE: A short, distinct heading for a specific forward-looking risk.
- RISK_EXPLANATION: The paragraph(s) that explain the risk mentioned in the title. This INCLUDES bullet points or lists that follow an introductory sentence.
- FINANCIAL_SUMMARY: A backward-looking discussion of financial performance, revenues, costs, or operations.
- BOILERPLATE: Generic, non-informative text, often legal disclaimers.
- OTHER: Any text that does not fit the above categories, such as a table of contents line or a page number.

Here are some high-quality, diverse examples:

Text: 'WE HAVE GENERATED SIGNIFICANT LOSSES IN RECENT PERIODS.'
Label: RISK_TITLE

Text: 'We incurred significant net losses during the three years prior to 2004.'
Label: RISK_EXPLANATION

Text: 'Our Acquisitions May Not Perform as Anticipated'
Label: RISK_TITLE

Text: 'We have completed many acquisitions of self-storage facilities since our initial public offering of common stock in June 1995.'
Label: RISK_EXPLANATION

Text: 'The agreements governing our indebtedness include certain covenants that restrict, among other things, our ability to:'
Label: RISK_EXPLANATION

Text: '- incur additional debt; - pay dividends on our common stock in excess of 10 million per year; - repurchase our common stock;'
Label: RISK_EXPLANATION

Text: 'Results of Operations'
Label: FINANCIAL_SUMMARY

Text: 'Net income for the year ended December 31, 2004 was 62.1 million compared to 49.1 million for year ended December 31, 2003.'
Label: FINANCIAL_SUMMARY

Text: 'You should carefully consider the risks described below, together with all of the other information included in or incorporated by reference into our Form 10-K, as part of your evaluation of the Company.'
Label: BOILERPLATE

Text: 'Item 1A. Risk Factors'
Label: OTHER
---
Now, classify the following text. Respond with only ONE of the labels.
Text: '{text_to_classify}'
Label: 
"""

def classify_chunk_with_retry(text_chunk, max_retries=5, delay=8, throttle=5):
    """Classify a text chunk with the Gemini API, retrying with exponential backoff.

    Args:
        text_chunk (str): The piece of text to classify.
        max_retries (int): Maximum number of requests to attempt.
        delay (float): Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.
        throttle (float): Seconds to pause after a successful request to help
            stay under the per-minute rate limit.

    Returns:
        str: The model's reply with surrounding whitespace removed, or the
        sentinel "API_FAILURE" when every attempt raised.
    """
    model = genai.GenerativeModel('gemini-2.5-flash')
    # The prompt does not change between attempts, so build it once.
    formatted_prompt = prompt.format(text_to_classify=text_chunk)

    for attempt in range(max_retries):
        try:
            response = model.generate_content(formatted_prompt)
            label = response.text.strip()
        except Exception as e:
            if attempt + 1 >= max_retries:
                # Nothing left to retry, so sleeping here would only waste time.
                print(f"  > API Error: {e}. Giving up after {max_retries} attempts.")
                break
            # Back off progressively: a fixed pause is often too short for a
            # per-minute quota window to reset.
            wait = delay * (2 ** attempt)
            print(f"  > API Error: {e}. Retrying in {wait}s... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(wait)
        else:
            # A small delay to help stay under the rate limit
            time.sleep(throttle)
            return label
    return "API_FAILURE"

def _safe_filename(name):
    """Replace characters that are not allowed in Windows/POSIX file names with '_'."""
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)


def _split_into_chunks(raw_text):
    """Split text on blank lines into whitespace-normalized paragraphs, dropping digit-only ones."""
    paragraphs = re.split(r'\n\s*\n', raw_text.strip())
    return [' '.join(p.split()) for p in paragraphs if p.strip() and not p.strip().isdigit()]


def _group_risks(labeled_chunks):
    """Fuse each RISK_EXPLANATION chunk onto the most recent RISK_TITLE chunk."""
    risks = []
    current = None
    for item in labeled_chunks:
        label = item['label']
        text = item['text']
        if label == 'RISK_TITLE':
            # A new title closes the previous risk and opens the next one.
            current = {'title': text, 'explanation': []}
            risks.append(current)
        elif label == 'RISK_EXPLANATION':
            if current is None:
                # Explanation seen before any title: collect it under a placeholder.
                current = {'title': 'Uncategorized', 'explanation': []}
                risks.append(current)
            current['explanation'].append(text)
        # Every other label is left out of the output.

    # Join the explanation paragraphs into a single string for each risk.
    for risk in risks:
        risk['explanation'] = ' '.join(risk['explanation'])
    return risks


def process_filing_to_json(filing_series, output_dir):
    """
    Classify one filing's risk-factor text with the AI and save it as structured JSON.

    The result goes to ``<output_dir>/<cik>_<date>.json``. Characters that cannot
    appear in a file name are replaced with ``_``; names that were already valid
    are unchanged.

    Args:
        filing_series: A single filing (a pandas Series or any mapping with ``.get``).
            Reads the ``'rf'`` (raw text), ``'cik'`` and ``'date'`` entries.
        output_dir (str): Existing directory that receives the JSON file.

    Returns:
        None. Nothing is written when the output file already exists, when the
        text is missing or shorter than 100 characters, or when a chunk could
        not be classified.
    """
    raw_text = filing_series.get('rf', '')
    cik = str(filing_series.get('cik', 'unknown_cik'))
    date = str(filing_series.get('date', 'unknown_date'))
    # str(date) may contain ':' or other characters that Windows rejects in a
    # file name, which would make the open() call below fail.
    output_filename = _safe_filename(f"{cik}_{date}") + ".json"
    output_path = os.path.join(output_dir, output_filename)

    # Skip if the file already exists to allow for resuming
    if os.path.exists(output_path):
        print(f"  > Skipping, output file already exists: {output_filename}")
        return

    if not raw_text or not isinstance(raw_text, str) or len(raw_text.strip()) < 100:
        print("  > Skipping, no valid risk factor text found.")
        return

    # 1. Split the raw text into processable chunks (paragraphs)
    cleaned_chunks = _split_into_chunks(raw_text)
    total = len(cleaned_chunks)

    # 2. Get AI-powered labels for each chunk
    print(f"  > Classifying {total} text chunks with Gemini API...")
    labeled_chunks = []
    for position, chunk in enumerate(cleaned_chunks, start=1):
        label = classify_chunk_with_retry(chunk)
        if label == 'API_FAILURE':
            # Saving a partial result would make the "already exists" check above
            # skip this filing forever (e.g. once the quota is exhausted, every
            # remaining filing would be stored empty). Stop here instead, so the
            # next run retries it.
            print(f"  > API failure on chunk {position}/{total}; not saving, so this filing is retried on the next run.")
            return
        labeled_chunks.append({'text': chunk, 'label': label})

    # 3. Structure the data by "fusing" explanations to their titles
    structured_risks = _group_risks(labeled_chunks)

    # 4. Save the final structured data to a JSON file
    final_output = {
        'cik': cik,
        'date': date,
        'structured_risks': structured_risks
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, indent=4)

    print(f"  > Success! Structured data saved to '{output_filename}'")

# --- Main Execution Block ---
# Batch run: convert every filing in every pickle file under `text_data_path`
# into its own JSON file inside `output_dir`.
if __name__ == "__main__":
    text_data_path = r'C:\_Files\Personal\Projects\FIAM\FIAM2025\data\text_data'
    output_dir = "structured_json_outputs"

    print(f"Output will be saved to the '{output_dir}' directory.")
    os.makedirs(output_dir, exist_ok=True)

    # glob returns matches in arbitrary order; sort so that every (resumed) run
    # walks the files in the same sequence.
    pickle_files = sorted(glob.glob(os.path.join(text_data_path, '**', '*.pkl'), recursive=True))
    if not pickle_files:
        print("No pickle files found.")
        # `exit()` is the interactive helper from the `site` module; SystemExit
        # works everywhere and reports a failure status.
        raise SystemExit(1)

    for pkl_file_path in pickle_files:
        print(f"\n--- Loading Pickle File: {os.path.basename(pkl_file_path)} ---")
        try:
            # NOTE(security): unpickling can execute arbitrary code. Only load
            # pickle files that come from a trusted source.
            with open(pkl_file_path, 'rb') as f:
                filings_df = pickle.load(f)
            # Evaluated inside the try so that an object without `.empty` is
            # reported like any other unreadable file.
            is_empty = filings_df.empty
        except Exception as e:
            print(f"An error occurred processing {os.path.basename(pkl_file_path)}: {e}")
            continue

        if is_empty:
            print("  > DataFrame is empty, skipping file.")
            continue

        # Process every row in the dataframe
        for index, row in filings_df.iterrows():
            print(f"\nProcessing filing index {index}...")
            try:
                process_filing_to_json(row, output_dir)
            except Exception as e:
                # Keep going: one bad filing must not abort the rest of the file.
                print(f"  > An error occurred processing filing index {index}: {e}")

