In [1]:
from langdetect import detect, DetectorFactory
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import re
from io import BytesIO
from PyPDF2 import PdfReader
import fitz
from sklearn.metrics import classification_report, f1_score, cohen_kappa_score
DetectorFactory.seed = 0

# Extract Text From PDF

In [None]:
def open_clean_pdf(path, strict=False):
    """
    Opens a PDF after discarding any bytes outside its %PDF- ... %%EOF envelope.

    The file is read fully into memory, cut from the first ``%PDF-`` marker to
    the end of the last ``%%EOF`` marker, and parsed from that in-memory copy.

    Parameters:
      path (str): Path to the PDF file.
      strict (bool): Forwarded unchanged to PdfReader.

    Returns:
      PdfReader: Reader over the trimmed bytes.

    Raises:
      ValueError: If either marker is missing, or the last %%EOF occurs
        before the first %PDF- (nothing usable in between).
    """
    # The context manager closes the handle even when the read fails.
    with open(path, 'rb') as pdf_file:
        data = pdf_file.read()

    start = data.find(b'%PDF-')
    end = data.rfind(b'%%EOF')
    if start < 0 or end < 0:
        raise ValueError("Not a valid PDF (no %PDF- or %%EOF)")
    if end < start:
        # Slicing would yield an empty buffer; fail with a clear message instead.
        raise ValueError("Not a valid PDF (%%EOF appears before %PDF-)")

    trimmed = data[start:end + len(b'%%EOF')]
    return PdfReader(BytesIO(trimmed), strict=strict)

def extract_text(pdf_path, start_page=1, end_page=None):
    """
    Extracts the text lines of a page range from a PDF.

    Parameters:
      pdf_path (str): Path to the PDF file.
      start_page (int): Start extraction from this page (1-indexed).
      end_page (int or None): End extraction at this page (1-indexed, inclusive). If None, process until the end.

    Returns:
      list: Text lines of the selected pages, in page order. Pages that yield
        no text contribute nothing.

    Raises:
      ValueError: If start_page is smaller than 1.
      IndexError: If end_page is larger than the number of pages.
    """
    # Validate before touching the file: a zero or negative start would
    # otherwise wrap around and silently read pages counted from the end.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    reader = open_clean_pdf(pdf_path)
    page_count = len(reader.pages)

    if end_page is None:
        end_page = page_count
    elif end_page > page_count:
        # Fail up front rather than after part of the range was extracted.
        raise IndexError(
            f"end_page {end_page} exceeds the {page_count} pages of {pdf_path}"
        )

    lines = []
    for i in range(start_page - 1, end_page):
        text = reader.pages[i].extract_text()
        if text:
            lines.extend(text.splitlines())

    return lines


def split_into_sentences(text, language="en", detector=None):
    """
    Splits text into sentences and keeps those detected as `language`.

    The split happens after '.', '!' or '?', except where the punctuation is
    followed (optionally after whitespace) by a digit, so decimals such as
    "3.5" stay in one piece.

    Parameters:
      text (list of str or str): Lines of text (e.g. the output of
        extract_text) or a single string; lines are joined with spaces.
      language (str): Language code a sentence must be detected as to be kept.
      detector (callable or None): Maps a sentence to a language code.
        Defaults to langdetect's `detect`.

    Returns:
      str: The kept sentences, one per line ("" when none are kept).
    """
    if detector is None:
        detector = detect

    # A plain string is broken into lines first; joining it directly would
    # put a space between every character.
    lines = text.splitlines() if isinstance(text, str) else text
    normalized_text = " ".join(lines)

    sentence_pattern = re.compile(r'(?<=[.!?])(?!\s*\d)')
    sentences = [
        piece.strip()
        for piece in sentence_pattern.split(normalized_text)
        if piece.strip()
    ]

    kept_sentences = []
    for sentence in sentences:
        try:
            if detector(sentence) == language:
                kept_sentences.append(sentence)
        except Exception:
            # Best effort: a fragment the detector cannot classify is skipped.
            continue

    return "\n".join(kept_sentences)


In [4]:
# One row per report: derive the "<ticker>_<year>" file stem and keep only
# the rows that have a start page.
page_df = (
    pd.read_csv('Page.csv')
    .assign(filename=lambda df: df['ticker'] + '_' + df['year'].astype(str))
    .loc[lambda df: df['start'].notna()]
)

In [None]:
# Extract the sentences of each report's page range into Text/<filename>.txt.
# Each row supplies its own start/end, so the frame is not re-filtered per file.
for filename, start, end in zip(page_df['filename'], page_df['start'], page_df['end']):
    start_page = int(start)
    end_page = int(end)
    pdf_path = f"AnnualReport/{filename}.pdf"
    try:
        text = extract_text(pdf_path, start_page=start_page, end_page=end_page)
    except Exception as exc:
        # Best-effort batch: report which file failed and why, then move on.
        # (Unlike a bare `except`, this lets KeyboardInterrupt stop the loop.)
        print(f"{pdf_path}: {exc}")
        continue
    sentences = split_into_sentences(text)
    with open(f"Text/{filename}.txt", "w") as text_file:
        text_file.write(sentences)

In [None]:
# Count the lines of every extracted text file.
lines = []
for filename in page_df['filename']:
    text_path = f"Text/{filename}.txt"
    try:
        with open(text_path, "r") as file:
            line_count = sum(1 for _ in file)
    except (OSError, UnicodeError) as exc:
        # A skipped file leaves no entry in `lines`, so say which one was skipped.
        print(f"Skipping {text_path}: {exc}")
        continue
    lines.append(line_count)

# Create Pseudo-Labels from the Loughran-McDonald (L&M) Dictionary

In [None]:
# Reshape the dictionary into a long (Word, Label) table: one row for every
# category column in which a word carries a value greater than zero.
sentiment_columns = [
    'Positive', 'Negative', 'Uncertainty', 'Litigious',
    'Strong_Modal', 'Weak_Modal', 'Constraining',
]
lm_dict = (
    pd.read_csv('LM_Dict_2024.csv')
    # keep words flagged in at least one category, and only the needed columns
    .loc[lambda df: df[sentiment_columns].gt(0).any(axis=1), ['Word'] + sentiment_columns]
    .melt(id_vars=['Word'], var_name='Label', value_name='Value')
    .loc[lambda df: df['Value'] > 0, ['Word', 'Label']]
    .drop_duplicates()
    .reset_index(drop=True)
)
lm_dict.to_csv('LM_Sentiment_2024.csv', index=False)

In [2]:
# Load the (Word, Label) sentiment table used for pseudo-labelling.
# NOTE(review): this reads from the parent directory, whereas the cell above
# writes 'LM_Sentiment_2024.csv' into the working directory — confirm which
# location is intended so a fresh Restart & Run All picks up the new file.
lm_dict = pd.read_csv('../LM_Sentiment_2024.csv')

In [5]:
from tqdm import tqdm

# Get unique labels from lm_dict
unique_labels = lm_dict['Label'].unique().tolist()

# Column layout: Sentence, Ticker, Year, plus one 0/1 indicator per label
all_columns = ['Sentence', 'Ticker', 'Year'] + unique_labels

# Build each label's word set once instead of re-filtering lm_dict for every
# sentence. Missing words (NaN) are dropped because they cannot be matched.
label_words = {
    label: set(
        lm_dict.loc[lm_dict['Label'] == label, 'Word'].dropna().astype(str).str.upper()
    )
    for label in unique_labels
}

# Match whole words rather than substrings: with `word in sentence`, a
# dictionary word such as "MAY" would also fire on "MAYOR".
# NOTE(review): assumes dictionary entries are single alphabetic tokens — confirm.
token_pattern = re.compile(r"[A-Z]+")

# Collect plain dicts and build the frame once at the end; concatenating a
# one-row frame per sentence is quadratic in the number of sentences.
records = []
for filename in tqdm(page_df['filename'], desc="Processing files"):
    try:
        # The year is the part after the last underscore, so tickers that
        # themselves contain '_' are kept intact.
        ticker, year = filename.rsplit('_', 1)
        with open(f"Text/{filename}.txt", "r") as file:
            for line in file:
                sentence = line.strip()
                if not sentence:
                    continue
                tokens = set(token_pattern.findall(sentence.upper()))
                row_data = {"Sentence": sentence, "Ticker": ticker, "Year": year}
                for label in unique_labels:
                    # 1 if the sentence shares at least one word with the label's list
                    row_data[label] = int(not tokens.isdisjoint(label_words[label]))
                records.append(row_data)
    except Exception as e:
        print(f"Error processing file Text/{filename}.txt: {e}")
        continue

text_df = pd.DataFrame(records, columns=all_columns)

Processing files: 100%|██████████| 20/20 [00:17<00:00,  1.12it/s]


In [6]:
# Persist the pseudo-labelled sentences without the index column; a backslash
# is configured as the CSV escape character.
text_df.to_csv('Text_Database.csv', index=False, escapechar='\\')

# Evaluate Classification

In [7]:
# Reload the pseudo-labelled sentences from disk.
# NOTE(review): the file is written with escapechar='\\' but read here without
# one — confirm the round trip is lossless for sentences containing backslashes.
text_df = pd.read_csv('Text_Database.csv')

## Zero-Shot (ZS)

In [12]:
# Score ./Result/zs_multiclass_fin-r1.csv: each `<label>_Pred` column is
# compared with its `<label>` column, after recoding prediction value 2 as 1.
labels = [
    'Positive', 'Negative', 'Uncertainty', 'Litigious',
    'Strong_Modal', 'Weak_Modal', 'Constraining',
]
mc_fin = pd.read_csv('./Result/zs_multiclass_fin-r1.csv')

# Recode all prediction columns in one pass.
pred_columns = [name + '_Pred' for name in labels]
mc_fin[pred_columns] = mc_fin[pred_columns].replace(2, 1)

# Per label: Cohen's kappa, then the classification report.
for label in labels:
    y_true = mc_fin[label]
    y_pred = mc_fin[label + '_Pred']
    print(label)
    print(cohen_kappa_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))

Positive
0.13296595693825708
              precision    recall  f1-score   support

           0       0.76      0.52      0.62      1675
           1       0.38      0.64      0.47       759

    accuracy                           0.56      2434
   macro avg       0.57      0.58      0.55      2434
weighted avg       0.64      0.56      0.57      2434

Negative
0.1714059477665747
              precision    recall  f1-score   support

           0       0.38      0.64      0.47       703
           1       0.79      0.57      0.66      1731

    accuracy                           0.59      2434
   macro avg       0.59      0.60      0.57      2434
weighted avg       0.67      0.59      0.61      2434

Uncertainty
0.04570851724299341
              precision    recall  f1-score   support

           0       0.91      0.22      0.35      2016
           1       0.19      0.90      0.32       418

    accuracy                           0.33      2434
   macro avg       0.55      0.56      

In [13]:
# Score ./Result/zs_multiclass_fin-r1.csv: each `<label>_Pred` column is
# compared with its `<label>` column, after recoding prediction value 2 as 0.
labels = [
    'Positive', 'Negative', 'Uncertainty', 'Litigious',
    'Strong_Modal', 'Weak_Modal', 'Constraining',
]
mc_fin = pd.read_csv('./Result/zs_multiclass_fin-r1.csv')

# Recode all prediction columns in one pass.
pred_columns = [name + '_Pred' for name in labels]
mc_fin[pred_columns] = mc_fin[pred_columns].replace(2, 0)

# Per label: Cohen's kappa, then the classification report.
for label in labels:
    y_true = mc_fin[label]
    y_pred = mc_fin[label + '_Pred']
    print(label)
    print(cohen_kappa_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))

Positive
0.15344452652872043
              precision    recall  f1-score   support

           0       0.74      0.67      0.71      1675
           1       0.40      0.49      0.44       759

    accuracy                           0.62      2434
   macro avg       0.57      0.58      0.57      2434
weighted avg       0.64      0.62      0.62      2434

Negative
0.14338066068456257
              precision    recall  f1-score   support

           0       0.35      0.86      0.50       703
           1       0.86      0.35      0.49      1731

    accuracy                           0.49      2434
   macro avg       0.60      0.60      0.49      2434
weighted avg       0.71      0.49      0.49      2434

Uncertainty
0.23202987395995156
              precision    recall  f1-score   support

           0       0.89      0.76      0.82      2016
           1       0.32      0.54      0.40       418

    accuracy                           0.72      2434
   macro avg       0.60      0.65     

In [14]:
# Score ./Result/zs_multiclass_qwen.csv: each `<label>_Pred` column is
# compared with its `<label>` column, after recoding prediction value 2 as 1.
labels = [
    'Positive', 'Negative', 'Uncertainty', 'Litigious',
    'Strong_Modal', 'Weak_Modal', 'Constraining',
]
mc_qwen = pd.read_csv('./Result/zs_multiclass_qwen.csv')

# Recode all prediction columns in one pass.
pred_columns = [name + '_Pred' for name in labels]
mc_qwen[pred_columns] = mc_qwen[pred_columns].replace(2, 1)

# Per label: Cohen's kappa, then the classification report.
for label in labels:
    y_true = mc_qwen[label]
    y_pred = mc_qwen[label + '_Pred']
    print(label)
    print(cohen_kappa_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))

Positive
0.1356186381157447
              precision    recall  f1-score   support

           0       0.76      0.52      0.62      1675
           1       0.38      0.64      0.48       759

    accuracy                           0.56      2434
   macro avg       0.57      0.58      0.55      2434
weighted avg       0.64      0.56      0.57      2434

Negative
0.10687810055122793
              precision    recall  f1-score   support

           0       0.33      0.83      0.48       703
           1       0.83      0.32      0.46      1731

    accuracy                           0.47      2434
   macro avg       0.58      0.58      0.47      2434
weighted avg       0.68      0.47      0.47      2434

Uncertainty
0.05069608223382294
              precision    recall  f1-score   support

           0       0.92      0.22      0.35      2016
           1       0.19      0.91      0.32       418

    accuracy                           0.33      2434
   macro avg       0.56      0.56      

In [15]:
# Score ./Result/zs_multiclass_qwen.csv: each `<label>_Pred` column is
# compared with its `<label>` column, after recoding prediction value 2 as 0.
labels = [
    'Positive', 'Negative', 'Uncertainty', 'Litigious',
    'Strong_Modal', 'Weak_Modal', 'Constraining',
]
mc_qwen = pd.read_csv('./Result/zs_multiclass_qwen.csv')

# Recode all prediction columns in one pass.
pred_columns = [name + '_Pred' for name in labels]
mc_qwen[pred_columns] = mc_qwen[pred_columns].replace(2, 0)

# Per label: Cohen's kappa, then the classification report.
for label in labels:
    y_true = mc_qwen[label]
    y_pred = mc_qwen[label + '_Pred']
    print(label)
    print(cohen_kappa_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))

Positive
0.17904467616187558
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      1675
           1       0.43      0.44      0.44       759

    accuracy                           0.65      2434
   macro avg       0.59      0.59      0.59      2434
weighted avg       0.65      0.65      0.65      2434

Negative
0.05803209807238463
              precision    recall  f1-score   support

           0       0.31      0.98      0.47       703
           1       0.94      0.11      0.20      1731

    accuracy                           0.36      2434
   macro avg       0.62      0.55      0.34      2434
weighted avg       0.76      0.36      0.28      2434

Uncertainty
0.14380118572095035
              precision    recall  f1-score   support

           0       0.90      0.55      0.68      2016
           1       0.24      0.70      0.36       418

    accuracy                           0.58      2434
   macro avg       0.57      0.63     

In [16]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
ml_qwen = pd.read_csv('./Result/zs_multilabel_qwen.csv')

for label in labels:
    print(label)
    print(cohen_kappa_score(ml_qwen[label], ml_qwen[label + '_Pred']))
    print(classification_report(ml_qwen[label], ml_qwen[label + '_Pred'], zero_division=0))

Positive
0.3259275499579708
              precision    recall  f1-score   support

           0       0.76      0.91      0.83      1675
           1       0.65      0.38      0.48       759

    accuracy                           0.74      2434
   macro avg       0.71      0.64      0.66      2434
weighted avg       0.73      0.74      0.72      2434

Negative
0.14358813683422156
              precision    recall  f1-score   support

           0       0.35      0.90      0.50       703
           1       0.88      0.31      0.46      1731

    accuracy                           0.48      2434
   macro avg       0.61      0.61      0.48      2434
weighted avg       0.73      0.48      0.47      2434

Uncertainty
0.3133262933399592
              precision    recall  f1-score   support

           0       0.92      0.74      0.82      2016
           1       0.36      0.69      0.47       418

    accuracy                           0.73      2434
   macro avg       0.64      0.72      0

In [17]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
ml_fin = pd.read_csv('./Result/zs_multilabel_fin-r1.csv')

for label in labels:
    print(label)
    print(cohen_kappa_score(ml_fin[label], ml_fin[label + '_Pred']))
    print(classification_report(ml_fin[label], ml_fin[label + '_Pred'], zero_division=0))

Positive
0.2838471974611583
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1675
           1       0.51      0.50      0.50       759

    accuracy                           0.69      2434
   macro avg       0.64      0.64      0.64      2434
weighted avg       0.69      0.69      0.69      2434

Negative
0.17037175997945098
              precision    recall  f1-score   support

           0       0.36      0.87      0.51       703
           1       0.88      0.37      0.52      1731

    accuracy                           0.51      2434
   macro avg       0.62      0.62      0.51      2434
weighted avg       0.73      0.51      0.52      2434

Uncertainty
0.29797406313377295
              precision    recall  f1-score   support

           0       0.93      0.69      0.79      2016
           1       0.34      0.76      0.47       418

    accuracy                           0.70      2434
   macro avg       0.63      0.72      

## In-Context Learning (ICL)

In [18]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/icl_multiclass_fin-r1.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 1)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.15854537608677532
              precision    recall  f1-score   support

           0       0.76      0.60      0.67      1675
           1       0.40      0.57      0.47       759

    accuracy                           0.59      2434
   macro avg       0.58      0.59      0.57      2434
weighted avg       0.65      0.59      0.61      2434

Negative
0.08953016180156592
              precision    recall  f1-score   support

           0       0.33      0.81      0.46       703
           1       0.81      0.32      0.46      1731

    accuracy                           0.46      2434
   macro avg       0.57      0.56      0.46      2434
weighted avg       0.67      0.46      0.46      2434

Uncertainty
0.07539556176272122
              precision    recall  f1-score   support

           0       0.92      0.29      0.44      2016
           1       0.21      0.89      0.33       418

    accuracy                           0.39      2434
   macro avg       0.57      0.59     

In [19]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/icl_multiclass_fin-r1.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 0)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.1807965713864066
              precision    recall  f1-score   support

           0       0.75      0.70      0.72      1675
           1       0.42      0.49      0.46       759

    accuracy                           0.63      2434
   macro avg       0.59      0.59      0.59      2434
weighted avg       0.65      0.63      0.64      2434

Negative
0.09822624789094514
              precision    recall  f1-score   support

           0       0.33      0.97      0.49       703
           1       0.94      0.18      0.31      1731

    accuracy                           0.41      2434
   macro avg       0.63      0.58      0.40      2434
weighted avg       0.77      0.41      0.36      2434

Uncertainty
0.24397592950448566
              precision    recall  f1-score   support

           0       0.92      0.67      0.77      2016
           1       0.30      0.70      0.43       418

    accuracy                           0.67      2434
   macro avg       0.61      0.69      

In [20]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/icl_multiclass_qwen.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 1)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.23210326855995478
              precision    recall  f1-score   support

           0       0.76      0.75      0.76      1675
           1       0.47      0.49      0.48       759

    accuracy                           0.67      2434
   macro avg       0.61      0.62      0.62      2434
weighted avg       0.67      0.67      0.67      2434

Negative
0.08118842237275337
              precision    recall  f1-score   support

           0       0.32      0.93      0.48       703
           1       0.88      0.19      0.32      1731

    accuracy                           0.41      2434
   macro avg       0.60      0.56      0.40      2434
weighted avg       0.72      0.41      0.36      2434

Uncertainty
0.051830497530523156
              precision    recall  f1-score   support

           0       0.91      0.25      0.39      2016
           1       0.19      0.88      0.32       418

    accuracy                           0.35      2434
   macro avg       0.55      0.56    

In [21]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/icl_multiclass_qwen.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 0)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.2194936121723302
              precision    recall  f1-score   support

           0       0.75      0.82      0.78      1675
           1       0.49      0.38      0.43       759

    accuracy                           0.69      2434
   macro avg       0.62      0.60      0.61      2434
weighted avg       0.67      0.69      0.67      2434

Negative
0.05642103403071852
              precision    recall  f1-score   support

           0       0.31      0.99      0.47       703
           1       0.97      0.10      0.18      1731

    accuracy                           0.36      2434
   macro avg       0.64      0.55      0.33      2434
weighted avg       0.78      0.36      0.27      2434

Uncertainty
0.15648825567963032
              precision    recall  f1-score   support

           0       0.91      0.55      0.68      2016
           1       0.25      0.73      0.37       418

    accuracy                           0.58      2434
   macro avg       0.58      0.64      

In [22]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/icl_multilabel_fin-r1.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 0)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.24382128876023323
              precision    recall  f1-score   support

           0       0.77      0.74      0.75      1675
           1       0.47      0.51      0.49       759

    accuracy                           0.67      2434
   macro avg       0.62      0.62      0.62      2434
weighted avg       0.68      0.67      0.67      2434

Negative
0.15175714228794468
              precision    recall  f1-score   support

           0       0.35      0.94      0.51       703
           1       0.93      0.29      0.44      1731

    accuracy                           0.48      2434
   macro avg       0.64      0.61      0.47      2434
weighted avg       0.76      0.48      0.46      2434

Uncertainty
0.42299117725980573
              precision    recall  f1-score   support

           0       0.92      0.84      0.88      2016
           1       0.46      0.66      0.54       418

    accuracy                           0.81      2434
   macro avg       0.69      0.75     

In [23]:
labels = ['Positive', 'Negative', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
mc_fin = pd.read_csv('./Result/icl_multilabel_qwen.csv')
for label in labels:
    mc_fin[label + '_Pred'] = mc_fin[label + '_Pred'].replace(2, 0)

for label in labels:
    print(label)
    print(cohen_kappa_score(mc_fin[label], mc_fin[label + '_Pred']))
    print(classification_report(mc_fin[label], mc_fin[label + '_Pred'], zero_division=0))

Positive
0.28311539501236627
              precision    recall  f1-score   support

           0       0.77      0.80      0.79      1675
           1       0.52      0.47      0.50       759

    accuracy                           0.70      2434
   macro avg       0.65      0.64      0.64      2434
weighted avg       0.69      0.70      0.70      2434

Negative
0.11512220467401779
              precision    recall  f1-score   support

           0       0.33      0.96      0.49       703
           1       0.93      0.22      0.36      1731

    accuracy                           0.43      2434
   macro avg       0.63      0.59      0.43      2434
weighted avg       0.76      0.43      0.40      2434

Uncertainty
0.45538945815087595
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      2016
           1       0.51      0.62      0.56       418

    accuracy                           0.83      2434
   macro avg       0.71      0.75     