In [8]:
#>=50% of annotators agreed on the financial sentiment 
from pathlib import Path
import pandas as pd
import re

file_path = r'C:/Users/Tammy/Documents/GitHub/multimodal_stockprice_prediction/data/clean/stock_text_data/Apple_Inc_text_data.csv'
# Load the CSV file with the specified encoding and column names
df = pd.read_csv(file_path,encoding='utf-8')

# Cleaning function
def clean_text(text):
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and special characters (except spaces)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text
# Check the first few rows of the data
df.head()
#reformat date
# format datetime again
df['pub_date'] = pd.to_datetime(df['pub_date']) 
df['pub_date'] = df['pub_date'].dt.date
#cleaning
# df['headline'] = df['headline'].apply(clean_text) # change this line for abstract
df['abstract'] = df['abstract'].apply(clean_text)
# Select only the 'pub_date' and 'headline' columns
df_new = df[['pub_date', 'abstract']] #change this line for abstract
  
# Check the new DataFrame
df_new.head()

Unnamed: 0,pub_date,abstract
0,2015-04-07,want to work at amazon apple or mckinsey some ...
1,2015-04-14,get recommendations from new york times report...
2,2015-04-13,the business unit will partner with companies ...
3,2015-04-22,with superstars first in line apple appears to...
4,2015-04-01,in an industry that avoids controversy the hea...


In [9]:
from pydantic import BaseModel

class SentimentTrendPrediction(BaseModel):
    SentimentScore: float
    TrendStrength: float    # Predicted price as a string (could be float)

In [10]:
## Setting up Gemini
#set up gemini to process textual templates
from google import genai
from dotenv import load_dotenv
from pydantic import BaseModel
import os
load_dotenv()
# Load environment variables from the .env file
API_KEY = os.getenv("API_KEY")
client = genai.Client(api_key=API_KEY)#insert api key
## Function to call Gemini API
from google.genai import types
import json

# Function to get sentiment based on model output
def gemini_predict_sentiment_trend(prompt, return_json=False):
    try:
        # Generate content from the model
        response = client.models.generate_content(
            model='gemini-2.0-flash-lite', 
            contents=prompt,
            config=types.GenerateContentConfig(
            system_instruction= f"""
        
            You are a financial analyst specializing in quantitative market sentiment and trend analysis. Analyze each abstract and provide TWO precise metrics:

            1. SENTIMENT SCORE (-1.0 to 1.0):
            • -1.0: Catastrophic news (bankruptcy, massive fraud, severe regulatory action)
            • -0.8: Very negative news (significant earnings miss, major layoffs, lawsuits)
            • -0.5: Moderately negative news (missed targets, restructuring, competitive challenges)
            • -0.2: Slightly negative news (minor setbacks, cautious outlook)
            • 0.0: Neutral or balanced news (mixed results, status quo maintained)
            • 0.2: Slightly positive news (minor wins, modest growth)
            • 0.5: Moderately positive news (good earnings, new partnerships, expanding markets)
            • 0.8: Very positive news (exceeding expectations, major contracts, innovations)
            • 1.0: Transformational news (breakthrough products, industry-changing acquisitions)

            2. TREND STRENGTH (-1.0 to 1.0):
            • Consider: Is this likely to create a MARKET MOVEMENT (price change)?
            • -1.0: Strong, sustained downward pressure (systemic issues, prolonged negative impact)
            • -0.6: Significant downward momentum (clear negative catalysts)
            • -0.3: Slight downward pressure (minor concerns that may affect sentiment)
            • 0.0: Neutral impact (balanced factors, unlikely to drive directional movement)
            • 0.3: Slight upward potential (positive but limited catalyst)
            • 0.6: Significant upward momentum (clear positive catalysts)
            • 1.0: Strong, sustained upward potential (transformative positive developments)

            CRITICAL INSTRUCTIONS:
            - Differentiate between SENTIMENT (how positive/negative the news is) and TREND (likely market movement)
            - Consider sector-specific implications - news impacts different industries differently
            - Evaluate the scale of impact relative to company/market size
            - Distinguish between short-term reactions vs. fundamental changes
            - For ambiguous headlines, default to more moderate scores (-0.5 to +0.5 range)
            -  Return ONLY the two values in JSON format, e.g., {{"SentimentScore": 0.5, "TrendStrength": -0.4}}
            

"""
,
            max_output_tokens=75,  # Label only
            temperature=0.5,      # More flexibility
            top_k=5,              # Limit to top 5 choices
            top_p=0.7,            # Consider tokens covering 70% probability mass
            response_mime_type='application/json',
            response_schema=SentimentTrendPrediction
          # No stop sequence to avoid premature stops
            )
        )
        
        # Check if the response is valid
        response_text = response.text.strip()
        if not response_text:
            print("Error: No response text found.")
            return 'Error' if not return_json else {}

        # Try parsing the response as JSON
        try:
            response_json = json.loads(response_text)
            if return_json:
                return response_json

            return response_json

        except json.JSONDecodeError:
            print(f"Error parsing JSON. Response: {response_text}")
            return 'Error' if not return_json else {}

    except Exception as e:
        print(f"API Error: {e}")
        return 'Error' if not return_json else {}

## For Abstract

In [11]:
## Experiment with few-shot prompting
def analyze_sentiment_trend(text): #alter prompt for abstract
    prompt = f"""
    Analyze this financial abstract and provide two metrics: 
    1. Sentiment Score: from -1.0 (extremely negative) to 1.0 (extremely positive)
    2. Trend Strength: from -1.0 (strong downward trend) to 1.0 (strong upward trend)

    Return results in JSON format like:
    {{ "SentimentScore": X.X, "TrendStrength": Y.Y }}

    Here are some examples with SentimentScore and TrendStrength:

    Example 1: 
    Abstract: "Company Close to Finalizing Its 40 billion dollar funding." 
    {{ "SentimentScore": 0.9, "TrendStrength": 0.8 }}

    Example 2: 
    Abstract: "Regulatory authorities block 10% of funds for key agency in US-China Tech Race."
    {{ "SentimentScore": -0.8, "TrendStrength": -0.6 }}

    Example 3: 
    Abstract: "Why Company B could be a key to a Company C's Deal."
    {{ "SentimentScore": 0.2, "TrendStrength": 0.1 }}

    Example 4: 
    Abstract: "Artificial intelligence boom might help mitigate some tariff pain."
    {{ "SentimentScore": 0.3, "TrendStrength": 0.2 }}

    Example 5: 
    Abstract: "Major banks face regulatory hurdles, impacting earnings outlook."
    {{ "SentimentScore": -0.6, "TrendStrength": -0.5 }}

    Example 6: 
    Abstract: "Company's $32 billion deal may signal a turning point for slow IPO, M&A markets."
    {{ "SentimentScore": 0.8, "TrendStrength": 0.7 }}

    Example 7: 
    Abstract: "Company X enters a strategic partnership with Company Y to expand its operations in Asia."
    Sentiment Score: 0.7
    {{ "SentimentScore": 0.7, "TrendStrength": 0.6 }}

    Now, analyze this abstract:

    Abstract: "{text}"
    Response (in JSON format):
   {{ "SentimentScore": , "TrendStrength": }}"""
    
    result = gemini_predict_sentiment_trend(prompt)
    return result

### Function for news impact duration

In [12]:
class DurationPrediction(BaseModel):
    PotentialImpactDays: int

In [13]:
# Function for duration prediction
def gemini_predict_duration(prompt, return_json=False):
    try:
        # Generate content from the model
        response = client.models.generate_content(
            model='gemini-2.0-flash-lite', 
            contents=prompt,
            config=types.GenerateContentConfig(
                system_instruction="""
                You are a financial market expert specializing in assessing the impact duration of financial news. Your task is to:

                ESTIMATE POTENTIAL IMPACT DAYS: Predict how many days (1-30) this new abstract's impact might last in the market.
                   Consider: 1-3 days for short-lived news, 4-7 days for moderately impactful news, 8-14 days for significant developments,
                   15-30 days for major structural changes or significant corporate events.

                Consider the abstract's significance, the entities involved, the type of event, and historical precedents for similar news.
                Return ONLY the value in JSON format: {"PotentialImpactDays": Z}
                """,
                max_output_tokens=50,
                temperature=0.5,
                top_k=5,
                top_p=0.7,
                response_mime_type='application/json',
                response_schema=DurationPrediction
            )
        )
        
        # Check if the response is valid
        response_text = response.text.strip()
        if not response_text:
            print("Error: No response text found.")
            return 'Error' if not return_json else {}

        # Try parsing the response as JSON
        try:
            response_json = json.loads(response_text)
            if return_json:
                return response_json

            return response_json

        except json.JSONDecodeError:
            print(f"Error parsing JSON. Response: {response_text}")
            return 'Error' if not return_json else {}

    except Exception as e:
        print(f"API Error: {e}")
        return 'Error' if not return_json else {}

In [14]:
def analyze_duration(text):
    prompt = f"""
    Analyze this financial abstract and estimate how many days (1-30) this news might impact the market.
    Consider: 
    - 1-3 days for short-lived news
    - 4-7 days for moderately impactful news 
    - 8-14 days for significant developments
    - 15-30 days for major structural changes or significant corporate events

    Return result in JSON format like:
    {{ "PotentialImpactDays": Z }}

    Examples:

    Example 1: 
    Abstract: "Company close to finalizing its 40 billion dollar funding." 
    {{ "PotentialImpactDays": 14 }}

    Example 2: 
    Abstract: "Regulatory authorities block 10% of funds for key agency in US-China Tech Race."
    {{ "PotentialImpactDays": 7 }}

    Example 3: 
    Abstract: "Why Company B could be a key to a Company C's Deal."
    {{ "PotentialImpactDays": 3 }}

    Example 4: 
    Abstract: "Artificial intelligence boom might help mitigate some tariff pain."
    {{ "PotentialImpactDays": 5 }}

    Example 5: 
    Abstract: "Major banks face regulatory hurdles, impacting earnings outlook."
    {{ "PotentialImpactDays": 10 }}

    Example 6: 
    Abstract: "Company's $32 billion deal may signal a turning point for slow IPO, M&A markets."
    {{ "PotentialImpactDays": 21 }}

    Now analyze this abstract:
    Abstract: "{text}"
    """
    
    result = gemini_predict_duration(prompt)
    return result

In [15]:
from tqdm import tqdm

def process_abstracts_with_separate_models(df, sentiment_column):
    sentiment_list = []
    sentiment_score_list = []
    trend_strength_list = []
    impact_days_list = []
    
    # Process with first model (sentiment and trend)
    for idx, abstract in tqdm(enumerate(df[sentiment_column]), total=len(df), desc="Processing Sentiment & Trend", unit="abstract"):
        # Get sentiment and trend predictions
        result = analyze_sentiment_trend(abstract)
        
        if isinstance(result, dict) and 'SentimentScore' in result:
            sentiment_score = result['SentimentScore']
            trend_strength = result['TrendStrength']
            
            # Store values
            sentiment_score_list.append(sentiment_score)
            trend_strength_list.append(trend_strength)
            
            # Determine categorical sentiment
            if sentiment_score == 1.0:
                sentiment = 'Extremely Positive'
            elif sentiment_score >= 0.8:
                sentiment = 'Very Positive'
            elif sentiment_score >= 0.5:
                sentiment = 'Moderately Positive'
            elif sentiment_score >= 0.2:
                sentiment = 'Slightly Positive'
            elif sentiment_score == 0.0:
                sentiment = 'Neutral'
            elif sentiment_score >= -0.2:
                sentiment = 'Slightly Negative'
            elif sentiment_score >= -0.5:
                sentiment = 'Moderately Negative'
            elif sentiment_score >= -0.8:
                sentiment = 'Very Negative'
            else:
                sentiment = 'Extremely Negative'
            
            sentiment_list.append(sentiment)
        else:
            # Handle error case
            sentiment_list.append('Error')
            sentiment_score_list.append(None)
            trend_strength_list.append(None)
    
    # Add sentiment and trend to DataFrame
    df['gemini_sentiment'] = sentiment_list
    df['sentiment_score'] = sentiment_score_list
    df['trend_strength'] = trend_strength_list
    
    # Process with second model (duration)
    for idx, abstract in tqdm(enumerate(df[sentiment_column]), total=len(df), desc="Processing Duration", unit="abstract"):
        # Get duration prediction
        result = analyze_duration(abstract)
        
        if isinstance(result, dict) and 'PotentialImpactDays' in result:
            impact_days = result['PotentialImpactDays']
            impact_days_list.append(impact_days)
        else:
            # Handle error case
            impact_days_list.append(None)
    
    # Add duration to DataFrame
    df['impact_days'] = impact_days_list
    
    return df

# Apply the separate models
df_analyzed = process_abstracts_with_separate_models(df_new, 'abstract')

# Display results
print(df_analyzed[['pub_date', 'abstract', 'gemini_sentiment', 'sentiment_score', 'trend_strength', 'impact_days']])

# Save to CSV
df_analyzed.to_csv('gemini_abstract_features_predictions.csv')

Processing Sentiment & Trend:   0%|          | 0/2143 [00:00<?, ?abstract/s]

Processing Sentiment & Trend: 100%|██████████| 2143/2143 [32:52<00:00,  1.09abstract/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gemini_sentiment'] = sentiment_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_score'] = sentiment_score_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trend_strength'] = trend_str

        pub_date                                           abstract  \
0     2015-04-07  want to work at amazon apple or mckinsey some ...   
1     2015-04-14  get recommendations from new york times report...   
2     2015-04-13  the business unit will partner with companies ...   
3     2015-04-22  with superstars first in line apple appears to...   
4     2015-04-01  in an industry that avoids controversy the hea...   
...          ...                                                ...   
2138  2024-01-19  the vision pro augmented reality device goes o...   
2139  2024-01-25  an eu law taking effect in march forced apple ...   
2140  2024-01-31  parents movie buffs and office workers might a...   
2141  2024-01-05  justice department officials are in the late s...   
2142  2024-01-18  a wearable tech brand has followed crocs lead ...   

         gemini_sentiment  sentiment_score  trend_strength  impact_days  
0       Slightly Positive              0.2             0.1            3  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['impact_days'] = impact_days_list
