In [None]:
from langdetect import detect, DetectorFactory
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import re
from io import BytesIO
from PyPDF2 import PdfReader
import fitz
from sklearn.metrics import classification_report, f1_score, cohen_kappa_score
DetectorFactory.seed = 0

# Extract Text From PDF

In [None]:
def open_clean_pdf(path, strict=False):
    data = open(path, 'rb').read()
    start = data.find(b'%PDF-')
    end   = data.rfind(b'%%EOF')
    if start < 0 or end < 0:
        raise ValueError("Not a valid PDF (no %PDF- or %%EOF)")
    trimmed = data[start:end + len(b'%%EOF')]
    return PdfReader(BytesIO(trimmed), strict=strict)

def extract_text(pdf_path, start_page=1, end_page=None):
    """
    Crops the top area of each page and extracts text from the cropped region.
    
    Parameters:
      pdf_path (str): Path to the PDF file.
      crop_top (int): Amount to crop from the top (in points).
      start_page (int): Start extraction from this page (1-indexed).
      end_page (int or None): End extraction at this page (1-indexed). If None, process until the end.
      
    Returns:
      str: Concatenated text extracted from the cropped pages.
    """
    reader = open_clean_pdf(pdf_path)
    lines = []

    if end_page is None:
        end_page = len(reader.pages)
    
    for i in range(start_page - 1, end_page):
        page = reader.pages[i]
        text = page.extract_text()
        if text:
            for line in text.splitlines():
                lines.append(line)
    
    return lines


def split_into_sentences(text):
    """
    Splits text into sentences. The regex is adjusted to avoid splitting
    on periods used within numbers (such as percentages or decimals).
    
    Parameters:
      text (str): The text to split.
      
    Returns:
      list: A list of individual sentences.
    """
    normalized_text = " ".join(text)
    sentence_pattern = re.compile(r'(?<=[.!?])(?!\s*\d)')
    sentences = sentence_pattern.split(normalized_text)
    sentences = [s.strip() for s in sentences if s.strip()]

    english_sentences = []
    for sentence in sentences:
        try:
            if detect(sentence) == "en":
                english_sentences.append(sentence)
        except Exception:
            continue

    return "\n".join(english_sentences)


In [None]:
page_df = pd.read_csv('Page.csv')
page_df['filename'] = page_df['ticker'] + '_' + page_df['year'].astype(str)
page_df = page_df[page_df['start'].notna()]

In [None]:
for filename in page_df['filename']:
    start_page = int(page_df[page_df['filename'] == filename]['start'].values[0])
    end_page = int(page_df[page_df['filename'] == filename]['end'].values[0])
    try:
        text = extract_text(f"AnnualReport/{filename}.pdf", start_page=start_page, end_page=end_page)
    except:
        print(f"AnnualReport/{filename}.pdf")
        continue
    sentences = split_into_sentences(text)
    with open(f"Text/{filename}.txt", "w") as text_file:
        text_file.write(sentences)

In [None]:
lines = []
for filename in page_df['filename']:
    try:
        line_count = 0
        with open(f"Text/{filename}.txt", "r") as file:
            for line in file:
                line_count += 1
        lines.append(line_count)
    except:
        continue

# Create PseudoLabel from L&M Dictionary

In [None]:
lm_dict = pd.read_csv('LM_Dict_2024.csv')
lm_dict = lm_dict[(lm_dict['Positive'] > 0) | (lm_dict['Negative'] > 0) | (lm_dict['Uncertainty'] > 0) | (lm_dict['Litigious'] > 0) | (lm_dict['Strong_Modal'] > 0) | (lm_dict['Weak_Modal'] > 0) | (lm_dict['Constraining'] > 0)]
lm_dict = lm_dict[['Word', 'Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']]
lm_dict = lm_dict.melt(id_vars=['Word'], var_name='Label', value_name='Value')
lm_dict = lm_dict[lm_dict['Value'] > 0]
lm_dict = lm_dict[['Word', 'Label']]
lm_dict = lm_dict.drop_duplicates()
lm_dict = lm_dict.reset_index(drop=True)
lm_dict.to_csv('LM_Sentiment_2024.csv', index=False)

In [9]:
lm_dict = pd.read_csv('LM_Sentiment_2024.csv')

In [19]:
from tqdm import tqdm

# Get unique labels from lm_dict
unique_labels = lm_dict['Label'].unique().tolist()

# Create an empty dataframe with columns: Sentence, Ticker, Year, plus all unique labels
all_columns = ['Sentence', 'Ticker', 'Year'] + unique_labels
text_df = pd.DataFrame(columns=all_columns)

# Iterate over the Text files with a progress bar
for filename in tqdm(page_df['filename'], desc="Processing files"):
    try:
        with open(f"Text/{filename}.txt", "r") as file:
            for line in file:
                sentence = line.strip()
                if sentence:
                    # Initialize the row with default values (0 for each label)
                    row_data = {
                        "Sentence": sentence,
                        "Ticker": filename.split('_')[0],
                        "Year": filename.split('_')[1]
                    }
                    for label in unique_labels:
                        row_data[label] = 0

                    # For each label, if a word is found, mark the label and then continue to the next label.
                    for label in unique_labels:
                        words = lm_dict[lm_dict['Label'] == label]['Word']
                        for word in words:
                            if word in sentence.upper():
                                row_data[label] = 1
                                break  # Found a word for this label, continue to next label

                    # Use pd.concat to add the row to the DataFrame
                    new_row = pd.DataFrame([row_data])
                    text_df = pd.concat([text_df, new_row], ignore_index=True)
    except Exception as e:
        print(f"Error processing file Text/{filename}.txt: {e}")
        continue

Processing files: 100%|██████████| 145/145 [17:05<00:00,  7.08s/it]


In [23]:
text_df.to_csv('Text_Database.csv', index=False, escapechar='\\')

# Evaluate Classification

In [None]:
text_df = pd.read_csv('Text_Database.csv')

In [None]:
mc_qwen = pd.read_csv('./Result/icl_multiclass_fin-r1.csv')
mc_qwen['Strong_Modal_Pred'] = mc_qwen['Strong Modal_Pred']
mc_qwen['Weak_Modal_Pred'] = mc_qwen['Weak Modal_Pred']
mc_qwen = mc_qwen.drop(columns=['Unnamed: 0', 'index', 'Strong Modal_Pred', 'Weak Modal_Pred'])
mc_qwen['Strong_Modal_Pred'] = mc_qwen['Strong_Modal_Pred'].fillna(0).astype(int)
mc_qwen['Weak_Modal_Pred'] = mc_qwen['Weak_Modal_Pred'].fillna(0).astype(int)
mc_qwen.to_csv('./Result/icl_multiclass_fin-r1.csv', index=False)

In [None]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/icl_multiclass_fin-r1.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 1)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.28457339189741837
              precision    recall  f1-score   support

           0       0.70      0.57      0.63     38908
           1       0.59      0.72      0.65     33389

    accuracy                           0.64     72297
   macro avg       0.65      0.64      0.64     72297
weighted avg       0.65      0.64      0.64     72297

Negative
0.08650148995328555
              precision    recall  f1-score   support

           0       0.40      0.91      0.55     26522
           1       0.80      0.20      0.32     45775

    accuracy                           0.46     72297
   macro avg       0.60      0.56      0.43     72297
weighted avg       0.65      0.46      0.40     72297

Uncertainty
0.05479987376313433
              precision    recall  f1-score   support

           0       0.89      0.32      0.47     60231
           1       0.19      0.80      0.31     12066

    accuracy                           0.40     72297
   macro avg       0.54      0.56     

In [None]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/icl_multiclass_fin-r1.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 0)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.29563906648301763
              precision    recall  f1-score   support

           0       0.68      0.65      0.67     38908
           1       0.61      0.65      0.63     33389

    accuracy                           0.65     72297
   macro avg       0.65      0.65      0.65     72297
weighted avg       0.65      0.65      0.65     72297

Negative
0.0589086576029324
              precision    recall  f1-score   support

           0       0.39      0.98      0.55     26522
           1       0.91      0.09      0.17     45775

    accuracy                           0.42     72297
   macro avg       0.65      0.54      0.36     72297
weighted avg       0.72      0.42      0.31     72297

Uncertainty
0.14977244078001772
              precision    recall  f1-score   support

           0       0.88      0.71      0.79     60231
           1       0.26      0.49      0.34     12066

    accuracy                           0.68     72297
   macro avg       0.57      0.60      

In [None]:
mc_qwen = pd.read_csv('./Result/zs_multiclass_qwen.csv')
mc_qwen['Strong_Modal_Pred'] = mc_qwen['Strong Modal_Pred']
mc_qwen['Weak_Modal_Pred'] = mc_qwen['Weak Modal_Pred']
mc_qwen = mc_qwen.drop(columns=['Unnamed: 0', 'index', 'Strong Modal_Pred', 'Weak Modal_Pred'])
mc_qwen['Strong_Modal_Pred'] = mc_qwen['Strong_Modal_Pred'].fillna(0).astype(int)
mc_qwen['Weak_Modal_Pred'] = mc_qwen['Weak_Modal_Pred'].fillna(0).astype(int)
mc_qwen.to_csv('./Result/zs_multiclass_qwen.csv', index=False)

In [None]:
from sklearn.metrics import f1_score

mc_fin = pd.read_csv('./Result/zs_multiclass_fin-r1.csv')
mc_fin['Strong_Modal_Pred'] = mc_fin['Strong Modal_Pred']
mc_fin['Weak_Modal_Pred'] = mc_fin['Weak Modal_Pred']
mc_fin = mc_fin.drop(columns=['Unnamed: 0', 'index', 'Strong Modal_Pred', 'Weak Modal_Pred'])
mc_fin['Strong_Modal_Pred'] = mc_fin['Strong_Modal_Pred'].fillna(0).astype(int)
mc_fin['Weak_Modal_Pred'] = mc_fin['Weak_Modal_Pred'].fillna(0).astype(int)
mc_fin.to_csv('./Result/zs_multiclass_fin-r1.csv', index=False)

In [None]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/zs_multiclass_fin-r1.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 1)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.25444953566274786
              precision    recall  f1-score   support

           0       0.75      0.44      0.55     38908
           1       0.56      0.83      0.67     33389

    accuracy                           0.62     72297
   macro avg       0.65      0.63      0.61     72297
weighted avg       0.66      0.62      0.60     72297

Negative
0.16819958160837312
              precision    recall  f1-score   support

           0       0.44      0.80      0.56     26522
           1       0.77      0.40      0.53     45775

    accuracy                           0.55     72297
   macro avg       0.60      0.60      0.55     72297
weighted avg       0.65      0.55      0.54     72297

Uncertainty
0.02077795415171757
              precision    recall  f1-score   support

           0       0.88      0.16      0.28     60231
           1       0.18      0.89      0.29     12066

    accuracy                           0.29     72297
   macro avg       0.53      0.53     

In [None]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/zs_multiclass_fin-r1.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 0)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.2932349725419099
              precision    recall  f1-score   support

           0       0.70      0.60      0.64     38908
           1       0.60      0.70      0.65     33389

    accuracy                           0.64     72297
   macro avg       0.65      0.65      0.64     72297
weighted avg       0.65      0.64      0.64     72297

Negative
0.12199040765257918
              precision    recall  f1-score   support

           0       0.41      0.93      0.57     26522
           1       0.85      0.23      0.36     45775

    accuracy                           0.48     72297
   macro avg       0.63      0.58      0.46     72297
weighted avg       0.69      0.48      0.43     72297

Uncertainty
0.15596942304241612
              precision    recall  f1-score   support

           0       0.87      0.79      0.82     60231
           1       0.27      0.40      0.32     12066

    accuracy                           0.72     72297
   macro avg       0.57      0.59      

In [None]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_qwen = pd.read_csv('./Result/zs_multiclass_qwen.csv')
for label in labels:
    mc_qwen[label + '_Pred'] = mc_qwen[label + '_Pred'].replace(2, 1)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_qwen[label], mc_qwen[label + '_Pred']))
    print(classification_report(mc_qwen[label], mc_qwen[label + '_Pred'], zero_division=0))

Positive
0.25484814738791384
              precision    recall  f1-score   support

           0       0.73      0.46      0.57     38908
           1       0.56      0.80      0.66     33389

    accuracy                           0.62     72297
   macro avg       0.65      0.63      0.61     72297
weighted avg       0.65      0.62      0.61     72297

Negative
0.09284094909645924
              precision    recall  f1-score   support

           0       0.40      0.91      0.56     26522
           1       0.80      0.21      0.33     45775

    accuracy                           0.47     72297
   macro avg       0.60      0.56      0.44     72297
weighted avg       0.65      0.47      0.41     72297

Uncertainty
0.0784803049461178
              precision    recall  f1-score   support

           0       0.90      0.39      0.54     60231
           1       0.20      0.78      0.32     12066

    accuracy                           0.45     72297
   macro avg       0.55      0.58      

In [None]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_qwen = pd.read_csv('./Result/zs_multiclass_qwen.csv')
for label in labels:
    mc_qwen[label + '_Pred'] = mc_qwen[label + '_Pred'].replace(2, 0)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_qwen[label], mc_qwen[label + '_Pred']))
    print(classification_report(mc_qwen[label], mc_qwen[label + '_Pred'], zero_division=0))

Positive
0.30086851038582274
              precision    recall  f1-score   support

           0       0.69      0.64      0.66     38908
           1       0.61      0.66      0.64     33389

    accuracy                           0.65     72297
   macro avg       0.65      0.65      0.65     72297
weighted avg       0.65      0.65      0.65     72297

Negative
0.047509694634182975
              precision    recall  f1-score   support

           0       0.38      0.99      0.55     26522
           1       0.91      0.08      0.14     45775

    accuracy                           0.41     72297
   macro avg       0.64      0.53      0.35     72297
weighted avg       0.71      0.41      0.29     72297

Uncertainty
0.13146633719569734
              precision    recall  f1-score   support

           0       0.87      0.70      0.78     60231
           1       0.24      0.48      0.32     12066

    accuracy                           0.67     72297
   macro avg       0.56      0.59    

In [None]:
ml_qwen = pd.read_csv('./Result/zs_multilabel_qwen.csv')
ml_qwen['Strong_Modal_Pred'] = ml_qwen['Strong Modal_Pred']
ml_qwen['Weak_Modal_Pred'] = ml_qwen['Weak Modal_Pred']
ml_qwen = ml_qwen.drop(columns=['Unnamed: 0', 'index', 'Strong Modal_Pred', 'Weak Modal_Pred'])
ml_qwen['Strong_Modal_Pred'] = ml_qwen['Strong_Modal_Pred'].fillna(0).astype(int)
ml_qwen['Weak_Modal_Pred'] = ml_qwen['Weak_Modal_Pred'].fillna(0).astype(int)
ml_qwen.to_csv('./Result/zs_multilabel_qwen.csv', index=False)

In [None]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
ml_qwen = pd.read_csv('./Result/zs_multilabel_qwen.csv')

for label in labels:
    print(label)
    print(cohen_kappa_score(ml_qwen[label], ml_qwen[label + '_Pred']))
    print(classification_report(ml_qwen[label], ml_qwen[label + '_Pred'], zero_division=0))

Positive
0.3600412035564151
              precision    recall  f1-score   support

           0       0.67      0.84      0.74     38908
           1       0.73      0.52      0.60     33389

    accuracy                           0.69     72297
   macro avg       0.70      0.68      0.67     72297
weighted avg       0.70      0.69      0.68     72297

Negative
0.10200101335204914
              precision    recall  f1-score   support

           0       0.40      0.95      0.57     26522
           1       0.87      0.18      0.30     45775

    accuracy                           0.46     72297
   macro avg       0.63      0.57      0.43     72297
weighted avg       0.70      0.46      0.39     72297

Uncertainty
0.1664291993585736
              precision    recall  f1-score   support

           0       0.86      0.83      0.85     60231
           1       0.29      0.35      0.32     12066

    accuracy                           0.75     72297
   macro avg       0.58      0.59      0

In [None]:
ml_fin = pd.read_csv('./Result/icl_multilabel_fin-r1.csv')
ml_fin['Strong_Modal_Pred'] = ml_fin['Strong Modal_Pred']
ml_fin['Weak_Modal_Pred'] = ml_fin['Weak Modal_Pred']
ml_fin = ml_fin.drop(columns=['Unnamed: 0', 'index', 'Strong Modal_Pred', 'Weak Modal_Pred'])
ml_fin['Strong_Modal_Pred'] = ml_fin['Strong_Modal_Pred'].fillna(0).astype(int)
ml_fin['Weak_Modal_Pred'] = ml_fin['Weak_Modal_Pred'].fillna(0).astype(int)
ml_fin.to_csv('./Result/icl_multilabel_fin-r1.csv', index=False)

In [None]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
ml_fin = pd.read_csv('./Result/icl_multilabel_fin-r1.csv')

for label in labels:
    print(label)
    print(cohen_kappa_score(ml_fin[label], ml_fin[label + '_Pred']))
    print(classification_report(ml_fin[label], ml_fin[label + '_Pred'], zero_division=0))

Positive
0.3429179132951464
              precision    recall  f1-score   support

           0       0.72      0.64      0.68     38908
           1       0.63      0.70      0.66     33389

    accuracy                           0.67     72297
   macro avg       0.67      0.67      0.67     72297
weighted avg       0.68      0.67      0.67     72297

Negative
0.09205831609887727
              precision    recall  f1-score   support

           0       0.40      0.97      0.56     26522
           1       0.90      0.15      0.25     45775

    accuracy                           0.45     72297
   macro avg       0.65      0.56      0.41     72297
weighted avg       0.72      0.45      0.37     72297

Uncertainty
0.20178214690498497
              precision    recall  f1-score   support

           0       0.87      0.84      0.86     60231
           1       0.32      0.37      0.34     12066

    accuracy                           0.77     72297
   macro avg       0.60      0.61      