In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    Prints the path before reading and the resulting ``(rows, columns)``
    shape afterwards.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        pandas.DataFrame: The parsed file contents.
    """
    print(f"Loading data from {file_path}")
    dataset = pd.read_csv(file_path)
    loaded_shape = dataset.shape
    print(f"Dataset loaded with shape: {loaded_shape}")
    return dataset

In [4]:

# Relative path of the input CSV; resolved against the notebook's working directory.
input_file = "gold-dataset-sinha-khandait.csv"
# Read the CSV into the notebook-level `df`; load_data also prints the path and shape.
df = load_data(input_file)

Loading data from gold-dataset-sinha-khandait.csv
Dataset loaded with shape: (10570, 10)


In [9]:
# Inspect the distinct raw values of the "Dates" column (still unparsed strings at this point).
df["Dates"].unique()

array(['28-01-2016', '13-09-2017', '26-07-2016', ..., '05-11-2009',
       '11-06-2002', '01-10-2007'], dtype=object)

In [None]:
# Load the finbert-regressor model and tokenizer
def load_model():
    """Load the ``LHF/finbert-regressor`` checkpoint and its tokenizer.

    Both objects are obtained through ``from_pretrained``; a progress
    message is printed first.

    Returns:
        tuple: ``(model, tokenizer)`` in that order.
    """
    print("Loading FinBERT model and tokenizer...")
    checkpoint = "LHF/finbert-regressor"
    finbert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    finbert_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return finbert_model, finbert_tokenizer

Use the date as the index; every single date must have a value  \
Apply both exponential decay and linear decay  \
The data should be a DataFrame  \
Average the values if there is more than one in a day  

In [18]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import numpy as np
from datetime import datetime
import re

# Load the dataset
def load_data(file_path):
    """Load a CSV file into a DataFrame and report its shape.

    NOTE(review): this re-defines ``load_data`` from an earlier cell with the
    same body; the later definition is the one in effect once this cell runs.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The loaded data.
    """
    print(f"Loading data from {file_path}")
    frame = pd.read_csv(file_path)
    frame_shape = frame.shape
    print(f"Dataset loaded with shape: {frame_shape}")
    return frame

# Load the FinBERT model and tokenizer
def load_model():
    """Return the FinBERT regressor model together with its tokenizer.

    NOTE(review): this re-defines ``load_model`` from an earlier cell with the
    same body; the later definition is the one in effect once this cell runs.

    Returns:
        tuple: ``(model, tokenizer)`` loaded via ``from_pretrained`` from the
        ``LHF/finbert-regressor`` checkpoint.
    """
    print("Loading FinBERT model and tokenizer...")
    model_id = "LHF/finbert-regressor"
    tok = AutoTokenizer.from_pretrained(model_id)
    regressor = AutoModelForSequenceClassification.from_pretrained(model_id)
    return regressor, tok

# Function to apply manual corrections for known problematic dates
def apply_manual_date_corrections(idx, date_str, url):
    """Look up a hand-curated replacement date for a known-bad row.

    Args:
        idx: Row index label used as the lookup key.
        date_str: Raw date value of the row; only echoed in the log message.
        url: Unused here; kept so the signature matches the other date helpers.

    Returns:
        pandas.Timestamp | None: The corrected date when ``idx`` is one of the
        known rows (a message is printed in that case), otherwise ``None``.
    """
    # Row index -> corrected date; each of these rows carries a raw value of the form '0200-03-DD'.
    known_fixes = {
        3259: pd.Timestamp('2004-03-19'),
        3674: pd.Timestamp('2001-03-14'),
        9253: pd.Timestamp('2009-03-10'),
        9750: pd.Timestamp('2004-03-11'),
    }

    corrected = known_fixes.get(idx)
    if corrected is None:
        return None

    print(f"Applied manual correction for index {idx}: {date_str} -> {corrected}")
    return corrected

# Function to extract date from URL
def extract_date_from_url(url):
    """Try to recover a calendar date embedded in a URL.

    Patterns are tried in order: year-first (``YYYY-MM-DD`` / ``YYYY/MM/DD``)
    anchored at the end of the string, year-first anywhere, then day-first
    (``DD-MM-YYYY`` / ``DD/MM/YYYY``) at the end and anywhere. The first
    pattern whose match forms a valid date wins.

    Args:
        url: URL string, or a missing value (``None`` / ``NaN``).

    Returns:
        pandas.Timestamp | None: The parsed date, or ``None`` when ``url`` is
        missing or no pattern yields a valid date.
    """
    if pd.isna(url):
        return None

    date_patterns = [
        r'(\d{4})[/-](\d{1,2})[/-](\d{1,2})$',  # YYYY-MM-DD or YYYY/MM/DD at the end
        r'(\d{4})[/-](\d{1,2})[/-](\d{1,2})',    # YYYY-MM-DD or YYYY/MM/DD anywhere
        r'(\d{1,2})[/-](\d{1,2})[/-](\d{4})$',   # DD-MM-YYYY or DD/MM/YYYY at the end
        r'(\d{1,2})[/-](\d{1,2})[/-](\d{4})'     # DD-MM-YYYY or DD/MM/YYYY anywhere
    ]

    for pattern in date_patterns:
        match = re.search(pattern, url)
        if match is None:
            continue

        groups = match.groups()
        # A four-digit first group means the year-first layout matched.
        if len(groups[0]) == 4:
            year, month, day = groups
        else:
            day, month, year = groups

        try:
            date_str = f"{int(year):04d}-{int(month):02d}-{int(day):02d}"
            return pd.to_datetime(date_str)
        except (ValueError, OverflowError):
            # Impossible or out-of-range date (e.g. month 13): try the next
            # pattern. Only parsing errors are swallowed, so KeyboardInterrupt
            # and other unrelated failures still propagate.
            continue

    return None

# Helper function to parse dates with different formats
def parse_dates(row_idx, date_str, url=None):
    """Parse one raw date value, using the row's URL as a fallback source.

    Resolution order:
      1. a manual correction registered for ``row_idx``;
      2. for values whose four-digit year starts with ``0`` (e.g.
         ``'0200-03-19'``), a date extracted from ``url``;
      3. the explicit formats ``%d-%m-%Y``, ``%Y-%m-%d``, ``%m/%d/%Y``,
         ``%d/%m/%Y``, ``%m-%d-%Y`` (first one that parses wins);
      4. pandas' free-form parser;
      5. a date extracted from ``url`` if the free-form parser raised.

    Args:
        row_idx: Index label of the row, used for the manual-correction lookup.
        date_str: Raw date value; may be missing.
        url: Optional URL of the row.

    Returns:
        pandas.Timestamp: The parsed date, or ``pd.NaT`` if nothing worked.
    """
    manual_correction = apply_manual_date_corrections(row_idx, date_str, url)
    if manual_correction is not None:
        return manual_correction

    if pd.isna(date_str):
        return pd.NaT

    date_str = str(date_str).strip()

    # Only a year-first value with a leading-zero year (e.g. '0200-03-19') is
    # treated as corrupt. Checking just for a leading '0' would also match
    # every valid date with a zero-padded day such as '05-11-2009' and let a
    # URL date silently override it.
    has_corrupt_year = re.match(r'0\d{3}[-/]', date_str) is not None
    if has_corrupt_year and url is not None:
        url_date = extract_date_from_url(url)
        if url_date is not None:
            return url_date

    for fmt in ("%d-%m-%Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%m-%d-%Y"):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            # Also covers out-of-bounds timestamps, which subclass ValueError.
            continue

    try:
        return pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures trigger the URL fallback; anything else
        # (including KeyboardInterrupt) propagates.
        if url is not None:
            url_date = extract_date_from_url(url)
            if url_date is not None:
                return url_date

    return pd.NaT

# Function to calculate sentiment scores in batches
def calculate_sentiment_scores(texts, model, tokenizer, batch_size=16, device=None):
    """Run ``model`` over ``texts`` in batches and collect the logits as scores.

    The model is moved to ``device`` and switched to eval mode. Missing texts
    (per ``pd.isna``) are replaced with an empty string; every other entry is
    converted with ``str``. Each batch is tokenized with padding and
    truncation to 512 tokens, and the model's logits, with the last dimension
    squeezed, are appended to the result.

    Args:
        texts: Sliceable sequence of input texts.
        model: Model whose output exposes ``.logits``.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        batch_size: Number of texts per forward pass.
        device: Target device; when ``None``, CUDA is used if available,
            otherwise CPU.

    Returns:
        list: One entry per input text, in input order.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    model.eval()

    scores = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc="Processing batches"):
        chunk = texts[start:start + batch_size]
        cleaned = ["" if pd.isna(raw) else str(raw) for raw in chunk]

        encoded = tokenizer(
            cleaned,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(**on_device).logits
            scores.extend(logits.squeeze(-1).cpu().numpy())

    return scores

def process_data_by_date(df):
    """Parse dates, score every headline, and build daily and per-row results.

    Steps:
      1. Parse ``Dates`` (with ``URL`` as fallback) and drop unparseable rows.
      2. Score ``News`` with the FinBERT regressor and label each score.
      3. Average scores per day and reindex to a gap-free daily range, with
         days that have no news set to a score of 0.
      4. Compare the predicted labels with ``Price Sentiment`` and print
         accuracy and confusion matrices, before and after forcing rows whose
         ground truth is neutral to a neutral prediction.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame with at least the columns ``Dates``, ``URL``, ``News``
            and ``Price Sentiment``.

    Returns:
        tuple: ``(result_df, df, invalid_dates_df)`` where ``result_df`` is the
        gap-free daily frame (``Dates``, ``Sentiment_Score``,
        ``Sentiment_Label``), ``df`` is the processed per-row frame, and
        ``invalid_dates_df`` lists the rows dropped for unparseable dates
        (``None`` when there were none).
    """
    # Work on a private copy so the caller's frame keeps its raw 'Dates'
    # strings and does not gain helper columns.
    df = df.copy()
    print("Converting dates to datetime format...")

    df['Parsed_Date'] = df.apply(lambda row: parse_dates(row.name, row['Dates'], row['URL']), axis=1)

    invalid_dates_mask = df['Parsed_Date'].isna()
    invalid_dates_count = invalid_dates_mask.sum()
    invalid_dates_df = None

    if invalid_dates_count > 0:
        print(f"\nWARNING: Found {invalid_dates_count} rows with invalid dates:")
        invalid_dates_df = df[invalid_dates_mask][['Dates', 'URL']].reset_index()
        for _, row in invalid_dates_df.iterrows():
            print(f"  Index {row['index']}, Date value: '{row['Dates']}', URL: '{row['URL']}'")

        df = df[~invalid_dates_mask].copy()

    df['Dates'] = df['Parsed_Date']
    df = df.drop(columns='Parsed_Date')

    model, tokenizer = load_model()
    news_headlines = df["News"].tolist()
    sentiment_scores = calculate_sentiment_scores(news_headlines, model, tokenizer)
    df["Sentiment_Score"] = sentiment_scores

    df["Sentiment_Label"] = df["Sentiment_Score"].apply(get_sentiment_label)

    # Several headlines on the same day are averaged into one daily score.
    daily_sentiment = df.groupby('Dates')['Sentiment_Score'].mean().reset_index()

    min_date = daily_sentiment['Dates'].min()
    max_date = daily_sentiment['Dates'].max()
    all_dates = pd.date_range(start=min_date, end=max_date, freq='D')

    # Left-join onto the full calendar so every day has a row; days without
    # news get a score of 0.
    complete_date_df = pd.DataFrame({'Dates': all_dates})
    result_df = pd.merge(complete_date_df, daily_sentiment, on='Dates', how='left')
    result_df['Sentiment_Score'] = result_df['Sentiment_Score'].fillna(0)
    result_df["Sentiment_Label"] = result_df["Sentiment_Score"].apply(get_sentiment_label)

    # Standardize Price Sentiment labels. Lower-case first so that variants
    # such as 'None' are also mapped to 'neutral'.
    df["Price Sentiment"] = df["Price Sentiment"].str.lower().replace({'none': 'neutral'})
    df["Prediction"] = df["Sentiment_Label"].str.lower()

    # Save original predictions before modification
    df["Original_Prediction"] = df["Prediction"]
    df["Original_Score"] = df["Sentiment_Score"]

    print("\nComparing Original Model Predictions with Ground Truth:")
    comparison_counts = pd.crosstab(df["Price Sentiment"], df["Prediction"],
                                   rownames=['Actual'], colnames=['Predicted'])
    print(comparison_counts)

    # Calculate original accuracy
    original_accuracy = (df["Price Sentiment"] == df["Prediction"]).mean()
    print(f"\nOriginal Model Accuracy: {original_accuracy:.4f}")

    # Create original confusion matrix (row-normalized)
    print("\nOriginal Confusion Matrix:")
    conf_matrix = pd.crosstab(df["Price Sentiment"], df["Prediction"],
                             rownames=['Actual'], colnames=['Predicted'],
                             normalize='index')
    print(conf_matrix)

    # Set sentiment score to zero and prediction to neutral for rows whose
    # ground truth is neutral.
    # NOTE(review): this overwrites predictions using the ground-truth label,
    # so the "updated" accuracy printed below is not a measure of model
    # quality. The daily frame above was built before this step and is
    # unaffected by it.
    neutral_mask = df["Price Sentiment"].isin(['none', 'neutral'])
    df.loc[neutral_mask, "Sentiment_Score"] = 0
    df.loc[neutral_mask, "Prediction"] = "neutral"

    # Count how many predictions were changed
    changed_count = (df["Original_Prediction"] != df["Prediction"]).sum()
    print(f"\nUpdated {changed_count} predictions to match ground truth neutral labels")

    # Calculate updated accuracy
    updated_accuracy = (df["Price Sentiment"] == df["Prediction"]).mean()
    print(f"\nUpdated Model Accuracy: {updated_accuracy:.4f}")
    print(f"Accuracy Improvement: {updated_accuracy - original_accuracy:.4f}")

    # Create updated confusion matrix
    print("\nUpdated Confusion Matrix:")
    updated_conf_matrix = pd.crosstab(df["Price Sentiment"], df["Prediction"],
                                     rownames=['Actual'], colnames=['Predicted'],
                                     normalize='index')
    print(updated_conf_matrix)

    # Show differences between actual and predicted values after update
    print("\nDifferences Between Actual and Predicted Values After Update:")
    diff_df = df[df["Price Sentiment"] != df["Prediction"]]
    diff_counts = pd.crosstab(diff_df["Price Sentiment"], diff_df["Prediction"],
                             rownames=['Actual'], colnames=['Predicted'])
    print(diff_counts)

    # Calculate accuracy for each class. Missing labels are skipped: NaN has
    # no ``.title()`` and never equals itself, so it cannot form a class.
    print("\nAccuracy by Class (After Update):")
    for sentiment in df["Price Sentiment"].dropna().unique():
        class_df = df[df["Price Sentiment"] == sentiment]
        class_accuracy = (class_df["Price Sentiment"] == class_df["Prediction"]).mean()
        print(f"  {sentiment.title()}: {class_accuracy:.4f}")

    return result_df, df, invalid_dates_df

def get_sentiment_label(score, threshold=0.05):
    """Map a numeric sentiment score to a three-way label.

    Args:
        score: Numeric sentiment score.
        threshold: Half-width of the neutral band around zero. Scores strictly
            above ``threshold`` are positive and scores strictly below
            ``-threshold`` are negative. Defaults to 0.05.

    Returns:
        str: ``"Positive"``, ``"Negative"`` or ``"Neutral"``. Scores exactly on
        a band edge, and NaN (for which both comparisons are false), are
        ``"Neutral"``.
    """
    if score > threshold:
        return "Positive"
    if score < -threshold:
        return "Negative"
    return "Neutral"

def main():
    """Run the full pipeline: load, validate, score, and write the CSV outputs.

    Reads ``gold-dataset-sinha-khandait.csv`` and stops with an error message
    if a required column is missing. Otherwise it writes the daily series to
    ``gold-daily-sentiment.csv``, the per-headline results to
    ``gold-dataset-with-sentiment.csv`` and, when any rows had unparseable
    dates, those rows to ``invalid-dates.csv``.
    """
    input_file = "gold-dataset-sinha-khandait.csv"
    output_file = "gold-dataset-with-sentiment.csv"
    daily_output_file = "gold-daily-sentiment.csv"
    error_output_file = "invalid-dates.csv"

    df = load_data(input_file)

    # Abort on the first required column that is absent.
    required_columns = ["Dates", "URL", "News", "Price Sentiment"]
    col = next((name for name in required_columns if name not in df.columns), None)
    if col is not None:
        print(f"Error: Required column '{col}' not found in the dataset")
        return

    daily_df, processed_df, invalid_dates_df = process_data_by_date(df)

    has_invalid_rows = invalid_dates_df is not None
    if has_invalid_rows:
        print(f"Saving {len(invalid_dates_df)} invalid date entries to {error_output_file}")
        invalid_dates_df.to_csv(error_output_file, index=False)

    print(f"Saving daily sentiment results to {daily_output_file}")
    daily_df.to_csv(daily_output_file, index=False)

    # Summaries of the daily series.
    label_counts = daily_df["Sentiment_Label"].value_counts()
    print("\nDaily Sentiment Distribution:")
    print(label_counts)
    score_summary = daily_df["Sentiment_Score"].describe()
    print("\nDaily Sentiment Score Statistics:")
    print(score_summary)

    print(f"Saving individual results to {output_file}")
    processed_df.to_csv(output_file, index=False)

    print("\nProcess completed successfully!")

# Run the pipeline when this cell/module is executed as the main program.
if __name__ == "__main__":
    main()


Loading data from gold-dataset-sinha-khandait.csv
Dataset loaded with shape: (10570, 10)
Converting dates to datetime format...
Applied manual correction for index 3259: 0200-03-19 -> 2004-03-19 00:00:00
Applied manual correction for index 3674: 0200-03-14 -> 2001-03-14 00:00:00
Applied manual correction for index 9253: 0200-03-10 -> 2009-03-10 00:00:00
Applied manual correction for index 9750: 0200-03-11 -> 2004-03-11 00:00:00
Loading FinBERT model and tokenizer...


Processing batches: 100%|██████████| 661/661 [04:16<00:00,  2.58it/s]



Comparing Original Model Predictions with Ground Truth:
Predicted  negative  neutral  positive
Actual                                
negative       2618      831       365
neutral         444      807      1093
positive        366     1019      3027

Original Model Accuracy: 0.6104

Original Confusion Matrix:
Predicted  negative   neutral  positive
Actual                                 
negative   0.686418  0.217881  0.095700
neutral    0.189420  0.344283  0.466297
positive   0.082956  0.230961  0.686083

Updated 1537 predictions to match ground truth neutral labels

Updated Model Accuracy: 0.7558
Accuracy Improvement: 0.1454

Updated Confusion Matrix:
Predicted  negative   neutral  positive
Actual                                 
negative   0.686418  0.217881  0.095700
neutral    0.000000  1.000000  0.000000
positive   0.082956  0.230961  0.686083

Differences Between Actual and Predicted Values After Update:
Predicted  negative  neutral  positive
Actual                            

### Exponential Weighting of Sentiment Scores

In [1]:
import pandas as pd
import numpy as np

# Load the daily sentiment series
file_path = "gold-daily-sentiment.csv"
output_file_path = "gold-daily-sentiment-with-weighting.csv"
df = pd.read_csv(file_path)

# Convert 'Dates' column to datetime.
# DataFrame.to_csv writes datetimes as ISO 'YYYY-MM-DD', so a file saved by the
# scoring step needs that format; a file re-saved as 'DD/MM/YYYY' (the format
# this cell originally assumed) is still accepted as a fallback.
try:
    df['Dates'] = pd.to_datetime(df['Dates'], format='%Y-%m-%d')
except ValueError:
    df['Dates'] = pd.to_datetime(df['Dates'], format='%d/%m/%Y')

# Sort the dataframe by date (index labels are kept as they are)
df = df.sort_values('Dates')

# Duration for the effect of the sentiment score (30 days)
duration = 30


def exponential_decay_scores(dates, scores, duration):
    """Spread each score forward in time with an exponentially decaying weight.

    A score observed on day ``d`` contributes ``score * exp(-k / duration)`` to
    every row dated ``d + k`` for ``0 <= k <= duration`` (whole days); the
    contributions of all scores are summed per row.

    Args:
        dates: Datetime Series, one entry per row.
        scores: Score per row, in the same order as ``dates``.
        duration: Length of the effect window in days; also the decay scale.

    Returns:
        numpy.ndarray: Accumulated weighted score for each row, in row order.
    """
    score_values = np.asarray(scores, dtype=float)
    window = pd.Timedelta(days=duration)
    accumulated = np.zeros(len(dates))

    for start, score in zip(dates, score_values):
        if score == 0:
            # A zero score adds nothing to any row; skip the mask computation.
            continue
        in_window = (dates >= start) & (dates <= start + window)
        elapsed_days = (dates[in_window] - start).dt.days.to_numpy()
        accumulated[in_window.to_numpy()] += score * np.exp(-elapsed_days / duration)

    return accumulated


# Create the exponentially weighted sentiment scores
weighted_scores = exponential_decay_scores(df['Dates'], df['Sentiment_Score'], duration)

# Add the weighted scores to the dataframe
df['Exponential_Weighted_Score'] = weighted_scores

# Display the updated dataframe with the new column
print(df[['Dates', 'Sentiment_Score', 'Exponential_Weighted_Score']].head(10))

# df.to_csv(output_file_path, index=False)


       Dates  Sentiment_Score  Exponential_Weighted_Score
0 2000-02-15         0.599372                    0.599372
1 2000-02-16         0.000000                    0.579722
2 2000-02-17        -0.449040                    0.111677
3 2000-02-18         0.000000                    0.108016
4 2000-02-19         0.000000                    0.104475
5 2000-02-20         0.000000                    0.101050
6 2000-02-21         0.000000                    0.097737
7 2000-02-22         0.000000                    0.094533
8 2000-02-23         0.000000                    0.091433
9 2000-02-24         0.000000                    0.088436


In [2]:
df.describe()

Unnamed: 0,Dates,Sentiment_Score,Exponential_Weighted_Score
count,6927,6927.0,6927.0
mean,2009-08-09 00:00:00,0.015507,0.303216
min,2000-02-15 00:00:00,-0.997891,-2.59953
25%,2004-11-11 12:00:00,0.0,-0.195825
50%,2009-08-09 00:00:00,0.0,0.249545
75%,2014-05-06 12:00:00,0.045997,0.83191
max,2019-02-01 00:00:00,1.0,2.687934
std,,0.18239,0.766284
