This file: Extract investment-related text from ccalll transcript

Input: folder 0_ccall: conference call transcripts

Output:
- 0_sentences.csv: transcript split into sentence level
- 0_investment_sentences.csv: sentences related to investment

In [13]:
import os
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
import re

Notice: If you're using colab, run the following two cells

In [7]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [9]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/acct4_ta_s1')

Split txt into sentence level

In [11]:
# Download the punkt tokenizer models if needed
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Get all txt files in the ccall folder
txt_files = [f for f in os.listdir('0_ccall') if f.endswith('.txt')]

# Initialize a list to store all sentences
all_sentences = []
sentence_id = 0

# Process each file
for file_name in txt_files:
    file_path = os.path.join('0_ccall', file_name)

    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Extract date from row 7 (index 6)
    lines = text.split('\n')
    datadate = lines[6] if len(lines) > 6 else ""
    datadate = datadate.strip()
    if datadate and " / " in datadate:
        datadate = datadate.split(" / ")[0]

    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Add each sentence to our list with its metadata
    for sentence in sentences:
        sentence_id += 1
        all_sentences.append({
            'file_name': file_name,
            'sentence_id': sentence_id,
            'datadate': datadate,
            'sentence': sentence
        })

# Create a DataFrame
df = pd.DataFrame(all_sentences)

# Drop rows where sentence_id is missing
df = df.dropna(subset=['sentence_id'])

# Save as dta file
df.to_csv('0_sentences.csv', encoding='utf-8', index=False)
print(f"Created {len(all_sentences)} sentence entries from {len(txt_files)} files")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Created 23175 sentence entries from 49 files


Extract sentences related to investment

In [14]:
# Filter the dataframe for sentences that mention investment-related terms
KEYWORDS = [
    r'\binvest(?:s|ing|ed|ment|or)?\b',
    r'\bcapex\b', r'\bcapx\b',
    r'\bcapital (?:expenditure|spending)s?\b',
    r'\bR&D\b', r'\bresearch and development\b',
    r'\bexpenditures?\b', r'\bspending\b', r'\bbudget allocation\b',
    r'\bfinancial commitment\b', r'\bdollar commitment\b',
    r'\$[0-9]+ (?:million|billion)',
    r'\bcash injection\b', r'\bfunds?\b', r'\binfrastructure\b',
    r'\bexpansion\b', r'\bbuild-?out\b', r'\bdevelopment funding\b',
    r'\bportfolio\b', r'\bstrategic investment\b', r'\blong-term investment\b',
]

PATTERN = re.compile(r'(?i)(?:' + '|'.join(KEYWORDS) + r')')
investment_df = df[df['sentence'].str.contains(PATTERN)]

# Show the result
print(f"Found {len(investment_df)} sentences related to investments")
investment_df.to_csv('0_investment_sentences.csv', encoding='utf-8', index=False)

Found 1333 sentences related to investments
