In [None]:
## SET UP AND DATA LOAD

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Read the finalized dataset; header names are stripped of surrounding
# whitespace as part of the load.
df = pd.read_csv('df_finalized.csv').rename(columns=str.strip)

# Parse the event-date column into pandas datetimes.
df = df.assign(mostImportantDateUTC=pd.to_datetime(df['mostImportantDateUTC']))

# GVKEY -> ticker crosswalk (tab-separated text file).
gvkey_ticker = pd.read_csv('merged_wrds_gvkey_V2.txt', sep='\t')

In [None]:
## CALCULATE FINANCIAL METRIC CHANGES BETWEEN QUARTERS

# Order rows by firm and then by date so that, within each gvkey, consecutive
# rows are consecutive observations.
df_change = df.sort_values(by=['gvkey', 'mostImportantDateUTC'])
metrics = ['eps', 'pe_ratio', 'ev_ebitda', 'revenue']

for metric in metrics:
    # Group the *sorted* frame. Grouping the unsorted `df` would compute each
    # change against whatever row happened to precede it in file order, and
    # index alignment would then silently attach those values to df_change.
    # Note: a previous value of 0 produces +/-inf here, not NaN.
    df_change[f'{metric}_change'] = df_change.groupby('gvkey')[metric].pct_change()

print(df_change.head())

In [None]:
## FIXED EFFECTS TEST


# PanelOLS comes from the third-party `linearmodels` package; the notebook used
# the name without importing it anywhere.
from linearmodels.panel import PanelOLS

# Columns needed for regression
fundamentals = ['eps_change', 'pe_ratio_change', 'ev_ebitda_change', 'revenue_change']
sentiment_vars = ['total_sentiment_score', 'average_sentiment_score', 'sentiment_ratio', 'volatility']

# Only drop rows with missing values in the variables listed here.
all_vars = sentiment_vars + fundamentals + ['Price_Pct_Change']

# pct_change() yields +/-inf when the previous value is 0; dropna() does not
# remove those, so convert them to NaN first.
df_clean = df_change.copy()
df_clean[all_vars] = df_clean[all_vars].replace([np.inf, -np.inf], np.nan)
df_clean = df_clean.dropna(subset=all_vars)

# PanelOLS expects an (entity, time) MultiIndex.
if 'gvkey' in df_clean.columns and 'mostImportantDateUTC' in df_clean.columns:
    df_clean = df_clean.set_index(['gvkey', 'mostImportantDateUTC'])


# One two-way fixed-effects regression per fundamental, each regressed on the
# full set of sentiment variables.
for dep_var in fundamentals:
    print(f"=== Fixed Effects Panel Regression for {dep_var} ===")

    # Build formula (regressors are the sentiment variables defined above)
    formula = f"{dep_var} ~ {' + '.join(sentiment_vars)} + EntityEffects + TimeEffects"

    # Fit on the cleaned, indexed panel with standard errors clustered by entity
    model = PanelOLS.from_formula(formula, data=df_clean)
    results = model.fit(cov_type='clustered', cluster_entity=True)

    # Print summary
    print(results.summary)
    print("\n" + "="*100 + "\n")

In [None]:
## CORRELATIONS BETWEEN FINANCIALS AND SENTIMENT

sentiment_cols = ['total_sentiment_score', 'average_sentiment_score', 'sentiment_ratio', 'volatility']
financial_changes = ['Price_Pct_Change', 'eps_change', 'pe_ratio_change', 'ev_ebitda_change', 'revenue_change' ]

# The *_change columns are created on df_change (quarter-over-quarter cell),
# not on df, so df_change is the frame to analyse here.
# +/-inf values (previous value of 0 in pct_change) are treated as missing so
# they do not turn correlations into NaN.
df_clean = df_change.copy()
df_clean[financial_changes] = df_clean[financial_changes].replace([np.inf, -np.inf], np.nan)
df_clean = df_clean.dropna(subset=financial_changes)

# Correlation heatmap across all sentiment and financial-change columns.
plt.figure(figsize=(10,8))
sns.heatmap(df_clean[sentiment_cols + financial_changes].corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation between Sentiment and Financial Metrics")
plt.show()

# One scatter + fitted line per (sentiment, financial change) pair, drawn from
# the same cleaned rows the heatmap uses.
for col in sentiment_cols:
    for target in financial_changes:
        sns.lmplot(x=col, y=target, data=df_clean, height=5)

In [None]:
## LOGISTIC REGRESSION

import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- 1. Define outcome variables ---
# Binary outcome name -> the change column it is derived from.
outcome_sources = {
    'EPS_up': 'eps_change',
    'Revenue_up': 'revenue_change',
    'PE_up': 'pe_ratio_change',
    'EV_EBITDA_up': 'ev_ebitda_change',
    'Price_up': 'Price_Pct_Change',
}

# Work on a copy of df_change: that is the frame on which the *_change columns
# are computed, and copying keeps df / df_change unmodified by this cell.
df_logit = df_change.copy()
for outcome, source_col in outcome_sources.items():
    change = df_logit[source_col]
    # `NaN > 0` evaluates to False, so a plain comparison would label rows with
    # a missing change as "not up" (0). Mask them back to NaN so the dropna()
    # in the fitting loop actually excludes them.
    df_logit[outcome] = (change > 0).astype(float).where(change.notna())

# --- 2. Define features (avoid perfect collinearity) ---
features = ['total_sentiment_score', 'average_sentiment_score', 'volatility']
# 'sentiment_ratio' is left out on purpose in case it is derived from the others.

# --- 3. Prepare results storage ---
results = {}

# --- 4. Loop through each outcome ---
for outcome in outcome_sources:
    # Keep only rows where both the features and this outcome are observed
    df_clean = df_logit.dropna(subset=features + [outcome])

    # Design matrix with an intercept column; outcome back to 0/1 integers
    X = sm.add_constant(df_clean[features])
    y = df_clean[outcome].astype(int)

    # Fit logistic regression (disp=0 silences the optimizer output)
    logit_model = sm.Logit(y, X).fit(disp=0)

    # In-sample predicted probabilities and 0/1 labels at a 0.5 cut-off
    y_pred_prob = logit_model.predict(X)
    y_pred = (y_pred_prob >= 0.5).astype(int)

    # In-sample metrics
    cm = confusion_matrix(y, y_pred)
    roc_auc = roc_auc_score(y, y_pred_prob)

    # Coefficients and odds ratios (exp of the log-odds coefficients)
    coeffs = logit_model.params
    odds_ratios = np.exp(coeffs)

    # Store everything the plotting / reporting cells read, including the
    # metrics computed above so they are not thrown away.
    results[outcome] = {
        'features': X.columns.tolist(),
        'coeffs': coeffs.values,
        'odds_ratios': odds_ratios.values,
        'y_true': y.values,
        'y_pred': y_pred,
        'probs': y_pred_prob,
        'confusion_matrix': cm,
        'roc_auc': roc_auc
    }

# Create a folder to save plots
plot_dir = "logistic_plots"
os.makedirs(plot_dir, exist_ok=True)

def visualize_logistic_results(model_name, data):
    """
    Save four diagnostic PNGs for one fitted logistic model into `plot_dir`.

    model_name: label used in every plot title and as the file-name prefix
    data: dict with keys 'features', 'coeffs', 'odds_ratios', 'y_true',
          'y_pred' and 'probs'

    Files written: <model_name>_coefficients.png, _odds_ratios.png,
    _confusion_matrix.png and _roc_curve.png. Every figure is closed after
    saving, so nothing is displayed inline.
    """
    # Pull every field first so a missing key fails before any file is written.
    labels = data['features']
    coef_values = data['coeffs']
    or_values = data['odds_ratios']
    actual = data['y_true']
    predicted = data['y_pred']
    scores = data['probs']

    def _save_current(filename):
        # Tighten the layout, write the active figure into plot_dir, release it.
        plt.tight_layout()
        plt.savefig(os.path.join(plot_dir, filename))
        plt.close()

    # --- Bar charts: raw coefficients, then odds ratios ---
    bar_charts = (
        (coef_values, 'coolwarm', f"{model_name} - Feature Coefficients",
         "Coefficient Value", f"{model_name}_coefficients.png"),
        (or_values, 'viridis', f"{model_name} - Feature Odds Ratios",
         "Odds Ratio", f"{model_name}_odds_ratios.png"),
    )
    for heights, colors, heading, y_label, filename in bar_charts:
        plt.figure(figsize=(7,4))
        sns.barplot(x=labels, y=heights, palette=colors)
        plt.title(heading)
        plt.ylabel(y_label)
        plt.xticks(rotation=30)
        _save_current(filename)

    # --- Confusion matrix heatmap (recomputed from the stored labels) ---
    matrix = confusion_matrix(actual, predicted)
    plt.figure(figsize=(5,4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"{model_name} - Confusion Matrix")
    _save_current(f"{model_name}_confusion_matrix.png")

    # --- ROC curve with the AUC in the legend and a chance diagonal ---
    false_pos, true_pos, _ = roc_curve(actual, scores)
    area = roc_auc_score(actual, scores)
    plt.figure(figsize=(5,5))
    plt.plot(false_pos, true_pos, label=f'ROC curve (AUC = {area:.3f})', color='darkorange', lw=2)
    plt.plot([0,1], [0,1], 'k--', lw=1)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"{model_name} - ROC Curve")
    plt.legend(loc="lower right")
    _save_current(f"{model_name}_roc_curve.png")

# --- Write the diagnostic plots for every fitted outcome model ---
for model_name in results:
    data = results[model_name]
    visualize_logistic_results(model_name, data)

print(f"All plots saved to folder: {plot_dir}")

In [None]:
## LOGISTIC REGRESSION - TEXT SUMMARY OF EACH MODEL

from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd

# confusion_matrix is needed below when a results entry does not carry one.
from sklearn.metrics import confusion_matrix

# Print a text report for every fitted logistic model stored in `results`.
for model_name, data in results.items():
    print(f"\n=== {model_name} ===")

    # Coefficient table; the odds-ratio column is added only when available.
    table = {
        'Feature': data['features'],
        'Coefficient': data['coeffs']
    }
    if 'odds_ratios' in data:
        table['Odds_Ratio'] = data['odds_ratios']
    coeffs_df = pd.DataFrame(table)

    print("\nFeature Coefficients and Odds Ratios:")
    print(coeffs_df.to_string(index=False))

    y_true = data['y_true']
    y_pred = data['y_pred']

    # Confusion matrix: use the stored one when present, otherwise derive it
    # from the stored labels. The original only printed it when the key
    # existed, and the fitting cell never stored that key.
    if 'confusion_matrix' in data:
        cm = data['confusion_matrix']
    else:
        cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

    # Classification metrics
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))

    # ROC AUC (needs predicted probabilities)
    if 'probs' in data:
        roc_auc = roc_auc_score(y_true, data['probs'])
        print(f"ROC AUC: {roc_auc:.4f}")

    print("-" * 50)


In [None]:
## EVENT STUDY - PULLING ALL STOCK DATA FOR EACH TICKER 2017-2022
import yfinance as yf
import pandas as pd
import statsmodels.api as sm
import os

# --- Transcript-level data, with the event date parsed to datetime ---
df_transcripts = pd.read_csv('df_finalized.csv')
df_transcripts['mostImportantDateUTC'] = pd.to_datetime(df_transcripts['mostImportantDateUTC'])

# --- Attach a ticker to each transcript through the GVKEY crosswalk ---
gvkey_ticker = pd.read_csv('merged_wrds_gvkey_V2.txt', sep='\t')
df_transcripts = (
    df_transcripts
    .merge(gvkey_ticker[['gvkey', 'tic']], on='gvkey', how='left')
    .rename(columns={'tic': 'ticker'})
)
# astype(str) also stringifies missing tickers (they end up as "NAN").
df_transcripts['ticker'] = df_transcripts['ticker'].astype(str).str.upper().str.strip()

import os
import pandas as pd
import yfinance as yf
from concurrent.futures import ThreadPoolExecutor, as_completed

# Directory to save stock data
os.makedirs("stock_data", exist_ok=True)

def get_stock_data(ticker, start="2017-01-01", end="2022-12-31", folder="stock_data"):
    """
    Download price history for one ticker and cache it as ``<folder>/<ticker>.csv``.

    Note: a later cell in this notebook defines another function with the same
    name (a CSV loader); running that cell replaces this downloader.

    ticker: symbol passed to yfinance and used as the CSV file name
    start, end: date range forwarded to ``yf.download``
    folder: directory the CSV is written to (created if missing)

    Returns a status string, never raises:
      "<ticker>: already exists"  - the CSV is already on disk, nothing downloaded
      "<ticker>: no data"         - yfinance returned an empty frame
      "<ticker>: downloaded"      - the CSV was written
      "<ticker>: failed (<err>)"  - any exception during download or write
    """
    filename = f"{folder}/{ticker}.csv"

    # Skip if already downloaded
    if os.path.exists(filename):
        return f"{ticker}: already exists"

    try:
        stock = yf.download(ticker, start=start, end=end, progress=False)
        if stock.empty:
            return f"{ticker}: no data"

        # Make sure a custom folder exists before writing into it.
        os.makedirs(folder, exist_ok=True)
        stock.to_csv(filename)
        return f"{ticker}: downloaded"
    except Exception as e:
        # Deliberate best-effort: one bad ticker must not abort a bulk run;
        # the error text is reported through the status string instead.
        return f"{ticker}: failed ({e})"


def download_all_tickers(ticker_list, start="2017-01-01", end="2022-12-31", max_workers=10):
    """
    Run get_stock_data for every ticker on a thread pool.

    ticker_list: iterable of ticker symbols
    start, end: date range handed to get_stock_data
    max_workers: size of the thread pool

    Returns the list of values returned by get_stock_data, in the order the
    downloads finish (not the order of ticker_list).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(get_stock_data, symbol, start, end) for symbol in ticker_list]
        # as_completed yields each future as soon as it is done.
        statuses = [job.result() for job in as_completed(pending)]

    return statuses


# Tickers to download: the distinct tickers attached to the transcripts above.
# `tickers` was never defined in the notebook, so build it here. Blank values
# and the "NAN" placeholder (what astype(str).str.upper() makes of a missing
# ticker) are skipped.
tickers = sorted(
    symbol
    for symbol in df_transcripts['ticker'].dropna().astype(str).str.upper().str.strip().unique()
    if symbol and symbol != "NAN"
)

# NOTE(review): the event-study cell reads "stock_data/GSPC.csv" as its market
# benchmark. Nothing in this notebook writes that file - confirm it is
# provided separately.

results = download_all_tickers(
    tickers,
    start="2017-01-01",
    end="2022-12-31",
    max_workers=8
)
# Save a log of results (one status string per ticker)
pd.DataFrame(results, columns=["status"]).to_csv("download_log.csv", index=False)

print("Done. Check download_log.csv for details.")

In [None]:
## EVENT STUDY - MATCH EVENT WINDOW STOCK PRICES TO TRANSCRIPTS, COMPUTE RETURNS, CALCULATE CAR

import os
import pandas as pd
import yfinance as yf
import statsmodels.api as sm

# The price CSVs are expected under stock_data/; create the folder if absent.
os.makedirs("stock_data", exist_ok=True)

# --- Transcripts: one row per event, date column converted to datetime ---
df_transcripts = pd.read_csv('df_finalized.csv')
event_dates = pd.to_datetime(df_transcripts['mostImportantDateUTC'])
df_transcripts['mostImportantDateUTC'] = event_dates

# --- GVKEY -> ticker crosswalk, left-joined so every transcript row is kept ---
gvkey_ticker = pd.read_csv('merged_wrds_gvkey_V2.txt', sep='\t')
ticker_lookup = gvkey_ticker[['gvkey', 'tic']]
df_transcripts = df_transcripts.merge(ticker_lookup, on='gvkey', how='left')
df_transcripts = df_transcripts.rename(columns={'tic': 'ticker'})

# Upper-case, trimmed string tickers (missing values become the string "NAN").
df_transcripts['ticker'] = df_transcripts['ticker'].astype(str).str.upper().str.strip()

def _to_naive_timestamp(value):
    """Return `value` as a pandas Timestamp with any timezone info dropped."""
    stamp = pd.Timestamp(value)
    if stamp.tzinfo is not None:
        # Keep the wall-clock time, discard the zone, so the bound can be
        # compared against a timezone-naive date index.
        stamp = stamp.tz_localize(None)
    return stamp


def get_stock_data(ticker, start, end, folder="stock_data"):
    """
    Load ``<folder>/<ticker>.csv`` and return the rows dated within [start, end].

    ticker: file stem of the CSV to read
    start, end: inclusive bounds; anything pd.Timestamp accepts. Timezone-aware
                bounds are made naive before comparing against the index.
    folder: directory holding the CSV files

    Returns a DataFrame indexed by date (ascending) with a numeric 'Close'
    column. Rows whose first column is not a parseable date, or whose 'Close'
    is not numeric, are dropped. Returns an empty DataFrame (after printing a
    message) when the file is missing, empty, or has no 'Close' column.
    """
    filename = f"{folder}/{ticker}.csv"

    try:
        stock = pd.read_csv(filename, index_col=0)
    except FileNotFoundError:
        print(f"File not found: {filename}")
        return pd.DataFrame()
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. an interrupted download) has nothing to parse.
        print(f"Empty file: {filename}")
        return pd.DataFrame()

    # Ensure datetime index; unparseable entries become NaT and are dropped.
    stock.index = pd.to_datetime(stock.index, errors='coerce')
    if getattr(stock.index, 'tz', None) is not None:
        stock.index = stock.index.tz_localize(None)
    # .copy() so the column assignment below works on an independent frame
    # rather than on a filtered view (avoids SettingWithCopyWarning).
    stock = stock[stock.index.notna()].copy()

    # Ensure numeric 'Close'
    if 'Close' not in stock.columns:
        print(f"No 'Close' column in {ticker}")
        return pd.DataFrame()
    stock['Close'] = pd.to_numeric(stock['Close'], errors='coerce')
    # Chronological order matters to callers that compute pct_change().
    stock = stock.dropna(subset=['Close']).sort_index()

    # Filter by requested date range (inclusive on both ends)
    lower = _to_naive_timestamp(start)
    upper = _to_naive_timestamp(end)
    stock = stock[(stock.index >= lower) & (stock.index <= upper)]

    return stock

def event_study_single(ticker, event_date, benchmark_file="stock_data/GSPC.csv",
                       estimation_window=60, event_window=3):
    """
    Compute the cumulative abnormal return (CAR) for a single event.

    Daily abnormal return is ``Return - beta * MarketReturn`` (no intercept),
    with beta = cov(stock, market) / var(market) estimated on the last
    `estimation_window` return rows dated before the event. CAR is the sum of
    abnormal returns over the rows dated within +/- `event_window` calendar
    days of the event date.

    ticker: stock ticker string (passed to get_stock_data)
    event_date: event timestamp; time-of-day and timezone are discarded so the
                windows are defined on whole calendar dates
    benchmark_file: CSV file of benchmark (market) prices with a 'Close' column
    estimation_window: maximum number of pre-event return rows used for beta
    event_window: calendar days on each side of the event included in the CAR

    Returns the CAR, or None (after printing the reason) when the date is
    missing or the data needed for the computation is unavailable.
    """
    if pd.isna(event_date):
        print(f"Missing event date for {ticker}")
        return None

    # Work with a naive midnight timestamp. With a time-of-day attached, the
    # event day itself would satisfy `index < event_date` (and so enter the
    # beta estimation) and the +/- window would be shifted by that time.
    event_date = pd.Timestamp(event_date)
    if event_date.tzinfo is not None:
        event_date = event_date.tz_localize(None)
    event_date = event_date.normalize()

    # Date range to load: estimation period plus event window, with a buffer.
    # NOTE(review): this range is in calendar days while tail(estimation_window)
    # below counts rows, so fewer than `estimation_window` rows are normally
    # available. Also, the estimation rows include the pre-event half of the
    # event window. Confirm both are intended.
    start = event_date - pd.Timedelta(days=estimation_window + 10)
    end = event_date + pd.Timedelta(days=event_window + 10)

    # Load stock data
    stock = get_stock_data(ticker, start, end)
    if stock.empty:
        print(f"No stock data for {ticker} around {event_date.date()}")
        return None

    # Load benchmark data
    try:
        market = pd.read_csv(benchmark_file, index_col=0)
    except FileNotFoundError:
        print(f"Benchmark file not found: {benchmark_file}")
        return None

    market.index = pd.to_datetime(market.index, errors='coerce')
    # .copy() so the 'Close' assignment below does not write into a view.
    market = market[market.index.notna()].copy()

    if 'Close' not in market.columns:
        print(f"No 'Close' column in benchmark {benchmark_file}")
        return None
    market['Close'] = pd.to_numeric(market['Close'], errors='coerce')
    market = market.dropna(subset=['Close'])

    # Restrict the benchmark to the same date range as the stock
    market = market[(market.index >= start) & (market.index <= end)]
    if market.empty:
        print(f"No benchmark data for {ticker} around {event_date.date()}")
        return None

    # Simple daily returns, aligned on dates present in both series
    stock_returns = stock['Close'].pct_change().rename('Return')
    market_returns = market['Close'].pct_change().rename('MarketReturn')
    returns = pd.concat([stock_returns, market_returns], axis=1).dropna()
    if returns.empty:
        print(f"No overlapping return data for {ticker} and benchmark around {event_date.date()}")
        return None

    # Estimate beta using estimation window
    est_df = returns[returns.index < event_date].tail(estimation_window)
    if len(est_df) < 2:
        print(f"Not enough data to estimate beta for {ticker}")
        return None

    # Take covariance and variance from the same matrix so both use the same
    # degrees of freedom. (np.cov defaults to ddof=1 but np.var to ddof=0;
    # mixing them inflates beta by n / (n - 1).)
    cov_matrix = np.cov(est_df['Return'], est_df['MarketReturn'])
    market_variance = cov_matrix[1, 1]
    if not np.isfinite(market_variance) or market_variance == 0:
        print(f"Benchmark returns have no usable variance; cannot estimate beta for {ticker}")
        return None
    beta = cov_matrix[0, 1] / market_variance

    # Compute abnormal returns for event window
    window = pd.Timedelta(days=event_window)
    event_df = returns[(returns.index >= event_date - window) &
                       (returns.index <= event_date + window)]
    if event_df.empty:
        print(f"No data in event window for {ticker}")
        return None

    # Computed as a standalone Series instead of assigning a new column on the
    # filtered slice (which triggers SettingWithCopyWarning).
    abnormal_returns = event_df['Return'] - beta * event_df['MarketReturn']

    # Cumulative abnormal return
    CAR = abnormal_returns.sum()

    return CAR
# Run the event study for every transcript row and collect one record per
# event (CAR is None when the study could not be computed).
results = []
for ticker, event_date in zip(df_transcripts['ticker'], df_transcripts['mostImportantDateUTC']):
    CAR = event_study_single(ticker, event_date, event_window=3)
    record = {"ticker": ticker, "event_date": event_date, "CAR": CAR}
    results.append(record)

In [None]:
## EVENT STUDY - CLEAN DATA
# Build the CAR table from the records collected by the event-study loop in
# the previous cell (`results` is a list of {"ticker", "event_date", "CAR"}
# dicts). Without this step `df_car` does not exist on a fresh kernel.
df_car = pd.DataFrame(results, columns=["ticker", "event_date", "CAR"])
# Events whose CAR could not be computed were recorded as None; make the
# column numeric so they become NaN and are removed below.
df_car["CAR"] = pd.to_numeric(df_car["CAR"], errors="coerce")

# Remove duplicate rows (keep the first occurrence)
df_car_clean = df_car.drop_duplicates()

# Drop rows where CAR is NaN
df_car_clean = df_car_clean.dropna(subset=['CAR'])

# Also drop rows where ticker or event_date is missing
df_car_clean = df_car_clean.dropna(subset=['ticker', 'event_date'])

# Reset index for a clean DataFrame
df_car_clean = df_car_clean.reset_index(drop=True)

# Save cleaned results (re-read by the correlation cell)
df_car_clean.to_csv("df_car_clean.csv", index=False)

print(df_car_clean.head(10))
print(df_car_clean.shape)

In [None]:
## EVENT STUDY - CORRELATION WITH SENTIMENT DATA

# === 1. Load Data ===
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Input files (adjust the paths if they live elsewhere).
df_car = pd.read_csv("df_car_clean.csv")     # cleaned event-study output: ticker, event_date, CAR
df_sent = pd.read_csv("df_finalized.csv")    # transcript-level data; has gvkey, mostImportantDateUTC and the sentiment scores used below

# Parse both date columns, then give the sentiment date the same name as the
# CAR date so the two frames can be joined on it.
df_car['event_date'] = pd.to_datetime(df_car['event_date'])
df_sent['mostImportantDateUTC'] = pd.to_datetime(df_sent['mostImportantDateUTC'])
df_sent = df_sent.rename(columns={"mostImportantDateUTC": "event_date"})

# Attach tickers to the sentiment rows through the GVKEY crosswalk.
gvkey_ticker = pd.read_csv('merged_wrds_gvkey_V2.txt', sep='\t')
df_sent = (
    df_sent
    .merge(gvkey_ticker[['gvkey', 'tic']], on='gvkey', how='left')
    .rename(columns={'tic': 'ticker'})
)
df_sent['ticker'] = df_sent['ticker'].astype(str).str.upper().str.strip()

# === 2. Merge on ticker & event_date ===
# Inner join: only events present in both tables survive.
df_merged = df_car.merge(
    df_sent[['ticker', 'event_date', 'total_sentiment_score', 'average_sentiment_score']],
    on=['ticker', 'event_date'],
    how='inner'
)
print(f"Merged dataset shape: {df_merged.shape}")
print(df_merged.head())

# === 3. Clean Data ===
# The date is not needed past the join.
df_merged = df_merged.drop(columns=['event_date'])

# === 4. Correlation ===
# Pairwise correlations, then an OLS of CAR on both sentiment scores.
print(df_merged[['CAR', 'total_sentiment_score', 'average_sentiment_score']].corr())
model = smf.ols("CAR ~ total_sentiment_score + average_sentiment_score", data=df_merged).fit()
print(model.summary())

# Scatterplot: both sentiment measures against CAR on shared axes.
for score_col, series_label in (
    ('total_sentiment_score', "Total Sentiment"),
    ('average_sentiment_score', "Average Sentiment"),
):
    plt.scatter(df_merged[score_col], df_merged['CAR'], alpha=0.5, label=series_label)
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Sentiment Score")
plt.ylabel("CAR")
plt.legend()
plt.show()


In [None]:
## EVENT STUDY - LOGISTIC REGRESSION OF CAR SIGN ON SENTIMENT

import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

# Binary target: 1 when the event's CAR is strictly positive, otherwise 0.
df_merged["CAR_binary"] = df_merged["CAR"].gt(0).astype(int)

# Quick check (not the last expression of the cell, so nothing is displayed)
df_merged[["CAR", "CAR_binary"]].head()


# Predictors (plus an intercept column) and the binary target
features = ["total_sentiment_score", "average_sentiment_score"]
X = sm.add_constant(df_merged[features])
y = df_merged["CAR_binary"]

# Logistic regression of the CAR sign on both sentiment scores
logit_both = sm.Logit(y, X).fit()
print(logit_both.summary())

def evaluate_logit(model, df, features):
    """
    Print in-sample evaluation metrics for a fitted logit model.

    model: fitted model exposing .predict()
    df: DataFrame holding the `features` columns and a 'CAR_binary' column
    features: list of predictor column names (an intercept column is added)

    Prints the AUC, the confusion matrix and the classification report;
    predictions are labelled 1 when the probability is strictly above 0.5.
    Returns nothing.
    """
    design = sm.add_constant(df[features])
    actual = df["CAR_binary"]

    # Predicted probabilities and the 0/1 labels derived from them
    probabilities = model.predict(design)
    predicted = (probabilities > 0.5).astype(int)

    # Metrics are computed before anything is printed
    auc = roc_auc_score(actual, probabilities)
    cm = confusion_matrix(actual, predicted)

    for line in (
        "\n--- Logistic Regression Evaluation ---",
        f"Features: {features}",
        f"AUC: {auc:.3f}",
        "Confusion Matrix:",
        cm,
        "Classification Report:",
    ):
        print(line)
    print(classification_report(actual, predicted))
evaluate_logit(logit_both, df_merged, features)


In [None]:
## EVENT STUDY FIGURES

import matplotlib.pyplot as plt
import seaborn as sns

# Side-by-side scatterplots of CAR against each sentiment score, each with a
# fitted regression line.
plt.figure(figsize=(14, 6))

scatter_panels = [
    ("total_sentiment_score", "CAR vs Total Sentiment Score"),
    ("average_sentiment_score", "CAR vs Average Sentiment Score"),
]
for position, (score_col, panel_title) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 2, position)
    sns.regplot(x=score_col, y="CAR", data=df_merged,
                scatter_kws={'alpha':0.2}, line_kws={'color':'red'})
    plt.title(panel_title)

plt.tight_layout()
plt.savefig("scatterplots_sentiment_vs_CAR.png", dpi=300)  # written before show()
plt.show()


# Boxplot: CAR split by whether the average sentiment score is strictly
# positive (1) or not (0).
df_merged["sentiment_sign"] = df_merged["average_sentiment_score"].gt(0).astype(int)

plt.figure(figsize=(6,6))
sns.boxplot(x="sentiment_sign", y="CAR", data=df_merged)
plt.xticks([0,1], ["Negative Sentiment", "Positive Sentiment"])
plt.title("CAR Distribution by Sentiment Sign")
plt.savefig("boxplot_CAR_by_sentiment.png", dpi=300)
plt.show()


# Histogram of CAR with a kernel density overlay
plt.figure(figsize=(10,6))
sns.histplot(df_merged["CAR"], bins=50, kde=True)
plt.title("Distribution of CAR")
plt.xlabel("CAR")
plt.savefig("histogram_CAR.png", dpi=300)
plt.show()
