In [None]:
import pdfplumber
import re
import nltk
from nltk.tokenize import sent_tokenize
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline, AutoTokenizer

# Step 1: Extract Text from PDF

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using pdfplumber.

    The text of every page is followed by a single newline. A page for which
    ``extract_text()`` yields nothing (``None`` or an empty string) contributes
    just that newline instead of raising a ``TypeError``.

    :param pdf_path: Path to the PDF file.
    :return: Extracted text as a string.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may give None for a page with no extractable text
            # (e.g. a scanned image); fall back to "" so concatenation is safe.
            page_texts.append((page.extract_text() or "") + "\n")
    # Join once at the end rather than growing a string inside the loop.
    return "".join(page_texts)

# Define the file path of the Disney 10-K report
# NOTE(review): this is an absolute path on one user's machine; the notebook
# will not run elsewhere unless the PDF sits at exactly this location.
pdf_path = "/Users/pengxue/Documents/GitHub/MGTF423_group2/2024-Annual-Report.pdf"
# Raw text of the whole report; later cells clean and tokenize this string.
disney_text = extract_text_from_pdf(pdf_path)

# Preview the first 1000 characters to verify text extraction
print(disney_text[:1000])



12/2023
JM
FISCAL YEAR 2024 ANNUAL FINANCIAL REPORT

UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended September 28, 2024
or
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from __________ to __________.
Commission File Number 001-38842
Delaware 83-0940635
State or Other Jurisdiction of I.R.S. Employer Identification
Incorporation or Organization
500 South Buena Vista Street
Burbank, California 91521
Address of Principal Executive Offices and Zip Code
(818) 560-1000
Registrant’s Telephone Number, Including Area Code
Securities registered pursuant to Section 12(b) of the Act:
Title of each class Trading Symbol(s) Name of each exchange on which registered
Common Stock, $0.01 par value DIS New York Stock Exchange
Securities Registered Pursuant to Section 12(g) of the Act: None.


In [None]:

# Step 2: Text Preprocessing

def clean_text(text):
    """
    Cleans the extracted text by removing special characters and collapsing whitespace.

    Every character other than ASCII letters, digits and ``. , $ % -`` is
    replaced by a space. Runs of whitespace (newlines included) are then
    collapsed to a single space. Whitespace is normalised last so that the
    gaps left behind by removed characters do not survive as multiple spaces.

    :param text: Raw text extracted from PDF.
    :return: Cleaned text.
    """
    # Keep only letters, numbers, and relevant punctuation; newlines and other
    # whitespace are outside the allowed set, so they become spaces here too.
    text = re.sub(r'[^A-Za-z0-9.,$%-]', ' ', text)
    # Collapse whitespace after filtering, not before.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Apply text cleaning
cleaned_text = clean_text(disney_text)

# Split text into sentences using NLTK
# NOTE(review): sent_tokenize relies on NLTK tokenizer data being installed;
# no nltk.download(...) call is visible in this notebook — confirm the
# resource is available in a fresh environment.
sentences = sent_tokenize(cleaned_text)

# Preview the first few sentences
print(sentences[:5])


['12 2023 JM FISCAL YEAR 2024 ANNUAL FINANCIAL REPORT UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-K   ANNUAL REPORT PURSUANT TO SECTION 13 OR 15 d  OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended September 28, 2024 or   TRANSITION REPORT PURSUANT TO SECTION 13 OR 15 d  OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from            to           .', 'Commission File Number 001-38842 Delaware 83-0940635 State or Other Jurisdiction of I.R.S.', 'Employer Identification Incorporation or Organization 500 South Buena Vista Street Burbank, California 91521 Address of Principal Executive Offices and Zip Code  818  560-1000 Registrant s Telephone Number, Including Area Code Securities registered pursuant to Section 12 b  of the Act  Title of each class Trading Symbol s  Name of each exchange on which registered Common Stock, $0.01 par value DIS New York Stock Exchange Securities Registered Pursuant to Section 12 g  of the Act 

In [None]:


# Step 3: Keyword Frequency Analysis

# Lowercase phrases that signal equity financing and debt financing
equity_keywords = [
    "equity financing",
    "common stock",
    "preferred stock",
    "share repurchase",
    "stock dilution",
]
debt_keywords = [
    "long-term debt",
    "short-term debt",
    "bond issuance",
    "leverage ratio",
    "loan facility",
]

def count_keywords(sentences, keywords):
    """
    Counts, for each keyword, the number of sentences that contain it.

    Matching is a case-insensitive substring test on both the sentence and the
    keyword. A sentence adds at most 1 to a keyword's count, however many
    times the keyword appears in it. Keywords that never match are absent
    from the result.

    :param sentences: List of tokenized sentences.
    :param keywords: List of keywords to search for.
    :return: Counter mapping each matched keyword (as given) to its sentence count.
    """
    # Lowercase each keyword once so that mixed-case keywords can still match
    # the lowercased sentence; the original spelling stays as the Counter key.
    lowered_keywords = [(keyword, keyword.lower()) for keyword in keywords]
    keyword_count = Counter()
    for sent in sentences:
        # Lowercase the sentence once per sentence, not once per keyword.
        lowered_sent = sent.lower()
        for keyword, needle in lowered_keywords:
            if needle in lowered_sent:
                keyword_count[keyword] += 1
    return keyword_count

# Compute keyword frequency for equity and debt-related terms
# (each result is a Counter keyed by keyword; keywords with no matching
# sentence do not appear in it)
equity_counts = count_keywords(sentences, equity_keywords)
debt_counts = count_keywords(sentences, debt_keywords)

# Display keyword frequency results
print("Equity Financing Keywords:", equity_counts)
print("Debt Financing Keywords:", debt_counts)



Equity Financing Keywords: Counter({'common stock': 18, 'share repurchase': 8, 'preferred stock': 4})
Debt Financing Keywords: Counter({'leverage ratio': 3, 'long-term debt': 2, 'short-term debt': 2})


In [None]:

# Step 4: TF-IDF Analysis

# Fixed vocabulary: only these terms become TF-IDF features
equity_related_vocab = [
    "equity",
    "equity investment",
    "preferred stock",
]
debt_related_vocab = [
    "debt",
    "long term debt",
    "short term debt",
]

# Fit on the sentence list, so each sentence is scored as its own document
vectorizer = TfidfVectorizer(
    vocabulary=equity_related_vocab + debt_related_vocab,
    ngram_range=(1, 3),
    analyzer="word",
    lowercase=True,
)
tfidf_matrix = vectorizer.fit_transform(sentences)

# Feature names, in the same order as the matrix columns
feature_names = vectorizer.get_feature_names_out()

def get_tfidf_scores(tfidf_matrix, feature_names):
    """
    Sums each feature's TF-IDF column to get one aggregate score per term.

    :param tfidf_matrix: The TF-IDF transformed matrix.
    :param feature_names: The vocabulary terms, in column order.
    :return: Dictionary mapping each term to the sum of its column.
    """
    totals = {}
    for column, term in enumerate(feature_names):
        # Column `column` holds the scores of `term` across all rows.
        totals[term] = tfidf_matrix[:, column].sum()
    return totals

# Aggregate TF-IDF score per vocabulary term
tfidf_scores = get_tfidf_scores(tfidf_matrix, feature_names)

# Split the aggregate scores into the equity and debt term groups
equity_scores = {
    term: score
    for term, score in tfidf_scores.items()
    if term in equity_related_vocab
}
debt_scores = {
    term: score
    for term, score in tfidf_scores.items()
    if term in debt_related_vocab
}

# Show both groups
print("Equity Investment TF-IDF Scores:", equity_scores)
print("Debt Financing TF-IDF Scores:", debt_scores)



Equity Investment TF-IDF Scores: {'equity': 80.55860934749428, 'equity investment': 6.122431609197084, 'preferred stock': 0.9522264599862467}
Debt Financing TF-IDF Scores: {'debt': 31.2520914144321, 'long term debt': 1.6626255486673593, 'short term debt': 1.6626255486673596}


In [None]:

# Step 5: Sentiment Analysis

# Load a pre-trained sentiment analysis model
# NOTE(review): judging by its name this checkpoint is fine-tuned on SST-2,
# a general-purpose sentiment dataset — confirm it suits 10-K language.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_model = pipeline("sentiment-analysis", model=model_name)
# Tokenizer for the same checkpoint; chunk_text_by_tokens reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Extract sentences related to equity and debt financing
# (case-insensitive substring match, so any word containing "equity" or
# "debt" qualifies the sentence)
finance_related_sentences = [sent for sent in sentences if "equity" in sent.lower() or "debt" in sent.lower()]

def chunk_text_by_tokens(text, max_tokens=512):
    """
    Cuts text down to at most max_tokens tokens and returns it as a string.

    The text is encoded by the module-level ``tokenizer`` with truncation
    enabled, and the token ids that remain are decoded back to text with
    special tokens dropped.

    :param text: Input text.
    :param max_tokens: Maximum token length.
    :return: Truncated text.
    """
    encoding = tokenizer.encode_plus(
        text,
        truncation=True,
        max_length=max_tokens,
        return_tensors="pt",
    )
    # Only one text was encoded; index 0 selects its sequence of token ids.
    first_sequence_ids = encoding["input_ids"][0]
    return tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

# Truncate every finance-related sentence to the model's token limit
chunked_sentences = [
    chunk_text_by_tokens(sentence, max_tokens=512)
    for sentence in finance_related_sentences
]

# Classify all sentences, eight per batch
sentiment_results = sentiment_model(chunked_sentences, batch_size=8)

# Tally the labels; a True comparison counts as 1 when summed
positive_sentiments = sum(result["label"] == "POSITIVE" for result in sentiment_results)
negative_sentiments = sum(result["label"] == "NEGATIVE" for result in sentiment_results)

# Report the overall split
print(f"Positive Sentiment: {positive_sentiments}")
print(f"Negative Sentiment: {negative_sentiments}")


# Step 5.2: Compare Sentiments for Equity and Debt

# Equity side: select, truncate, classify.
# A sentence mentioning both terms is included on both sides.
equity_sentences = [
    sentence for sentence in finance_related_sentences if "equity" in sentence.lower()
]
# Debt side: select.
debt_sentences = [
    sentence for sentence in finance_related_sentences if "debt" in sentence.lower()
]

# Truncate each group to the model's token limit
chunked_equity_sentences = [
    chunk_text_by_tokens(sentence, max_tokens=512) for sentence in equity_sentences
]
chunked_debt_sentences = [
    chunk_text_by_tokens(sentence, max_tokens=512) for sentence in debt_sentences
]

# Run the classifier on each group separately, eight sentences per batch
equity_sentiment_results = sentiment_model(chunked_equity_sentences, batch_size=8)
debt_sentiment_results = sentiment_model(chunked_debt_sentences, batch_size=8)

# Tally labels per group; a True comparison counts as 1 when summed
equity_positive = sum(result["label"] == "POSITIVE" for result in equity_sentiment_results)
equity_negative = sum(result["label"] == "NEGATIVE" for result in equity_sentiment_results)

debt_positive = sum(result["label"] == "POSITIVE" for result in debt_sentiment_results)
debt_negative = sum(result["label"] == "NEGATIVE" for result in debt_sentiment_results)

# Report the side-by-side comparison
print(f"Equity Sentiment: Positive: {equity_positive}, Negative: {equity_negative}")
print(f"Debt Sentiment: Positive: {debt_positive}, Negative: {debt_negative}")

Device set to use mps:0


Positive Sentiment: 27
Negative Sentiment: 91
Equity Sentiment: Positive: 23, Negative: 62
Debt Sentiment: Positive: 6, Negative: 32
