In [2]:
import os
import re

import pandas as pd

In [3]:
# Project root: the parent of the current working directory (one node above).
directory = os.path.dirname(os.getcwd())
print("Parent Directory:", directory)
# Directory holding the original loan contracts. Built with os.path.join, like
# the other paths derived from it, instead of hard-coding '/' separators.
directory_loan = os.path.join(directory, 'Data', 'LoansFull')
print("Loan Directory:", directory_loan)

Parent Directory: /Users/zrsong/Dropbox (MIT)/Research Projects/Direct Lending
Loan Directory: /Users/zrsong/Dropbox (MIT)/Research Projects/Direct Lending/Data/LoansFull


# Extraction of Information Covenants

In [4]:
def search_after_phrase(content, phrase, offset, statement_keyword, fiscal_month_keyword, fiscal_year_keyword, projected_keyword, borrower_keyword):
    """Collect sentences following ``phrase`` that describe monthly statements or annual projections.

    Runs of whitespace in ``content`` are collapsed to single spaces first.
    For every occurrence of ``phrase`` (overlapping occurrences included), the
    ``offset`` characters right after it are split into sentences at spaces
    that follow ``.`` or ``;``. Keyword matching is plain, case-sensitive
    substring containment, so lower-case ``content`` beforehand when the
    keywords are lower case.

    Args:
        content: Text to search.
        phrase: Anchor phrase; only text after an occurrence is inspected.
        offset: Number of characters to inspect after each occurrence.
        statement_keyword: Keywords naming a financial statement.
        fiscal_month_keyword: Keywords indicating a monthly frequency.
        fiscal_year_keyword: Keywords indicating an annual frequency.
        projected_keyword: Keywords indicating projections or budgets.
        borrower_keyword: Keywords naming a party; required for both kinds of match.

    Returns:
        A pair ``(monthly_fs_sentences, projected_fs_sentences)`` of lists.
        A sentence is "monthly" when it contains at least one borrower, one
        fiscal-month and one statement keyword, and "projected" when it
        contains at least one borrower, one projected and one fiscal-year
        keyword. Windows of nearby occurrences can overlap, so the same
        sentence may be returned more than once.
    """
    content = re.sub(r'\s+', ' ', content).strip()

    monthly_fs_sentences = []
    projected_fs_sentences = []

    # Jump from match to match with str.find instead of testing startswith()
    # at every character index; advancing by one keeps overlapping matches.
    # The upper bound only matters for an empty phrase, which str.find also
    # reports at len(content); a non-empty phrase can never start there.
    position = content.find(phrase)
    while 0 <= position < len(content):
        window_start = position + len(phrase)
        snippet = content[window_start:window_start + offset]

        for sentence in re.split(r'(?<=[.;]) +', snippet):
            # Both categories require a borrower keyword, so test it once.
            if not any(keyword in sentence for keyword in borrower_keyword):
                continue

            if (any(keyword in sentence for keyword in fiscal_month_keyword)
                    and any(keyword in sentence for keyword in statement_keyword)):
                monthly_fs_sentences.append(sentence)

            if (any(keyword in sentence for keyword in projected_keyword)
                    and any(keyword in sentence for keyword in fiscal_year_keyword)):
                projected_fs_sentences.append(sentence)

        position = content.find(phrase, position + 1)

    return monthly_fs_sentences, projected_fs_sentences

import re

def search_after_phrase_meeting(content, phrase, offset, meeting_keyword):
    """Collect sentences following ``phrase`` that mention a lender meeting or call.

    Runs of whitespace in ``content`` are collapsed to single spaces first.
    For every occurrence of ``phrase`` (overlapping occurrences included), the
    ``offset`` characters right after it are split into sentences at spaces
    that follow ``.`` or ``;``. Keyword matching is plain, case-sensitive
    substring containment.

    Args:
        content: Text to search.
        phrase: Anchor phrase; only text after an occurrence is inspected.
        offset: Number of characters to inspect after each occurrence.
        meeting_keyword: Keywords; a sentence is kept when it contains any of them.

    Returns:
        List of matching sentences in order of discovery. Windows of nearby
        occurrences can overlap, so the same sentence may appear more than once.
    """
    # Normalize whitespace in the content
    content = re.sub(r'\s+', ' ', content).strip()

    lender_meeting_sentences = []

    # Jump from match to match with str.find instead of testing startswith()
    # at every character index; advancing by one keeps overlapping matches.
    # The upper bound only matters for an empty phrase, which str.find also
    # reports at len(content); a non-empty phrase can never start there.
    position = content.find(phrase)
    while 0 <= position < len(content):
        # The inspected window starts right after the phrase.
        window_start = position + len(phrase)
        snippet = content[window_start:window_start + offset]

        # Split the window into sentences and keep those with a meeting keyword.
        for sentence in re.split(r'(?<=[.;]) +', snippet):
            if any(keyword in sentence for keyword in meeting_keyword):
                lender_meeting_sentences.append(sentence)

        position = content.find(phrase, position + 1)

    return lender_meeting_sentences

# Keyword lists used by the covenant searches. Every entry is lower case and is
# matched as a plain substring of a sentence.

# Names of financial statements / reports.
statement_keyword = [
    "balance sheet",
    "income statement",
    "cash flow",
    "certificate",
    "statement of operations",
    "financial statement",
    "financial report",
]

# Monthly reporting frequency.
fiscal_month_keyword = [
    "fiscal month",
    "monthly",
    "calendar month",
]

# Annual reporting frequency.
fiscal_year_keyword = [
    "fiscal year",
    "annually",
    "annual",
    "yearly",
    "calendar year",
]

# Forward-looking documents (projections, budgets, plans).
projected_keyword = [
    "forecast",
    "operating plan",
    "business plan",
    "projection",
    "budget",
    "projected",
    "budgeted",
]

# Parties to the agreement.
borrower_keyword = [
    "borrower",
    "borrowing",
    "agent",
    "lender",
    "parent",
    "holdings",
    "guarantor",
    "collateral property",
]

# Lender meetings and calls.
meeting_keyword = [
    "lender meeting",
    "lender call",
    "conference call",
    "conference meeting",
]

In [5]:
# Load the combined loan contracts; the loop below reads the 'accession' and
# 'text' columns.
contracts_name = 'combined_loancontracts.csv'
contracts_path = os.path.join(directory_loan, contracts_name)
df = pd.read_csv(contracts_path)

# Output layout: one row per matched sentence, with a 0/1 flag and the sentence
# itself for the category that matched (the other categories stay 0 / '').
result_columns = [
    'accession',
    'monthly_fs', 'monthly_context',
    'projected_fs', 'projected_context',
    'lender_meeting', 'lender_meeting_context',
]

# Initialize results list
results = []

# zip over the two needed columns instead of iterrows(): no per-row Series.
for accession, text in zip(df['accession'], df['text']):
    # A missing text cell is read as NaN (a float) and has no .lower();
    # there is nothing to search in such a row.
    if not isinstance(text, str):
        continue
    content = text.lower()  # the keyword lists are lower case

    # Financial-statement covenants: inspect the 10,000 characters after each
    # occurrence of 'covenants'.
    monthly_fs_sentences, projected_fs_sentences = search_after_phrase(
        content, 'covenants', 10000, statement_keyword, fiscal_month_keyword, fiscal_year_keyword, projected_keyword, borrower_keyword
    )

    # Lender meetings: inspect a wider window of 50,000 characters.
    lender_meeting_sentences = search_after_phrase_meeting(
        content, 'covenants', 50000, meeting_keyword
    )

    matches_by_category = (
        ('monthly_fs', 'monthly_context', monthly_fs_sentences),
        ('projected_fs', 'projected_context', projected_fs_sentences),
        ('lender_meeting', 'lender_meeting_context', lender_meeting_sentences),
    )
    for flag_column, context_column, sentences in matches_by_category:
        for sentence in sentences:
            record = {
                'accession': accession,
                'monthly_fs': 0,
                'monthly_context': '',
                'projected_fs': 0,
                'projected_context': '',
                'lender_meeting': 0,
                'lender_meeting_context': ''
            }
            record[flag_column] = 1
            record[context_column] = sentence
            results.append(record)

# Convert results list to DataFrame and drop duplicates. The search windows of
# nearby 'covenants' occurrences overlap, so the same sentence is often found
# several times for one contract. Passing `columns` keeps the header even when
# nothing matched.
df_results = pd.DataFrame(results, columns=result_columns).drop_duplicates()
# Save the results to a CSV file
results_path = os.path.join(directory_loan, 'combined_loancontracts_info_cov.csv')
df_results.to_csv(results_path, index=False)

# Identifying ABL and Second-Lien Loans

In [4]:
# Load the contract texts and the filings that belong to the final regression sample.
df = pd.read_csv('../Data/LoansFull/combined_loancontracts_mm.csv')
# Named `sample_filings` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the notebook.
sample_filings = pd.read_csv('../Data/Intermediate/final_regression_sample_filings.csv')

# Keep only contracts whose (accession, type_filing, type_attachment) key is in
# the sample. Only the key columns of the sample take part in the merge, so a
# column present in both files (e.g. 'text') cannot be renamed to text_x/text_y
# and break the column selection below.
filing_keys = ['accession', 'type_filing', 'type_attachment']
df = df.merge(sample_filings[filing_keys], on=filing_keys, how='inner')
# keep only the key columns and the contract text
df = df[filing_keys + ['text']]

In [5]:
# Lower-case the whole text column.
df = df.assign(text=df['text'].str.lower())

In [8]:
def search_abl(text, window=500):
    """Return True if the opening of ``text`` mentions an asset-based loan.

    Only the first ``window`` characters are inspected. That slice is
    lower-cased, runs of whitespace are collapsed to one space, and it is then
    searched for the standalone word "abl" or for "asset-based" / "asset based".

    Args:
        text: Contract text. A non-string value (e.g. NaN from a missing cell)
            yields False.
        window: Number of leading characters to inspect (default 500).

    Returns:
        bool: True when one of the phrases is found, otherwise False.
    """
    if not isinstance(text, str):
        return False

    # Collapse whitespace so a line break or tab after "ABL", or between
    # "asset" and "based", does not hide the phrase.
    content = re.sub(r'\s+', ' ', text[:window].lower())

    # \b keeps "abl" a whole word; unlike the literal "abl " it also matches
    # before punctuation and at the end of the window.
    return re.search(r'\babl\b|asset[- ]based', content) is not None

def search_secondlien(text, window=500):
    """Return True if the opening of ``text`` mentions a second-lien loan.

    Only the first ``window`` characters are inspected. That slice is
    lower-cased, runs of whitespace are collapsed to one space, and it is then
    searched for "second lien" or "second-lien".

    Args:
        text: Contract text. A non-string value (e.g. NaN from a missing cell)
            yields False.
        window: Number of leading characters to inspect (default 500).

    Returns:
        bool: True when one of the phrases is found, otherwise False.
    """
    if not isinstance(text, str):
        return False

    # Collapse whitespace so "second" and "lien" split across a line break or
    # separated by several spaces still match.
    content = re.sub(r'\s+', ' ', text[:window].lower())

    return any(phrase in content for phrase in ("second lien", "second-lien"))

# Add 0/1 indicator columns from the two phrase searches over each contract's text.
df = df.assign(
    asset_based=df['text'].apply(search_abl).astype(int),
    second_lien=df['text'].apply(search_secondlien).astype(int),
)

# Write everything except the text column to the intermediate folder.
df.drop(columns=['text']).to_csv('../Data/Intermediate/abl_secondlien.csv', index=False)
