In [30]:
#>=50% of annotators agreed on the financial sentiment 
from pathlib import Path
import pandas as pd
import re

file_path = r'C:/Users/Tammy/Documents/GitHub/multimodal_stockprice_prediction/data/clean/stock_text_data/Apple_Inc_text_data.csv'
# Load the CSV file with the specified encoding and column names
df = pd.read_csv(file_path,encoding='utf-8')

# Cleaning function
def clean_text(text):
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and special characters (except spaces)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text
# Check the first few rows of the data
df.head()
#reformat date
# format datetime again
df['pub_date'] = pd.to_datetime(df['pub_date']) 
df['pub_date'] = df['pub_date'].dt.date
#cleaning
# df['headline'] = df['headline'].apply(clean_text) # change this line for abstract
df['headline'] = df['headline'].apply(clean_text)
# Select only the 'pub_date' and 'headline' columns
df_new = df[['pub_date', 'headline']] #change this line for abstract


## Polarity

In [21]:
from pydantic import BaseModel

class SentimentTrendPrediction(BaseModel):
    SentimentScore: float   # Predicted price as a string (could be float)

In [22]:
## Setting up Gemini
#set up gemini to process textual templates
from google import genai
from dotenv import load_dotenv
from pydantic import BaseModel
import os
load_dotenv()
# Load environment variables from the .env file
API_KEY = os.getenv("API_KEY")
client = genai.Client(api_key=API_KEY)#insert api key
## Function to call Gemini API
from google.genai import types
import json

# Function to get sentiment based on model output
def gemini_predict_sentiment_trend(prompt, return_json=False):
    try:
        # Generate content from the model
        response = client.models.generate_content(
            model='gemini-2.0-flash-lite', 
            contents=prompt,
            config=types.GenerateContentConfig(
            system_instruction= f"""
            You are a financial analyst specializing in both quantitative market sentiment and trend analysis. Your task is to analyze each headline and provide:

            **Sentiment Score (-1.0 to 1.0)**:
            - -1.0: Extremely negative news (bankruptcy, massive fraud, severe regulatory action)
            - 0.0: Neutral or balanced news (mixed results, status quo maintained)
            - 1.0: Extremely positive news (breakthrough products, industry-changing acquisitions)
            - Consider both **direct effects on Apple’s stock price** and **general market or sector trends**.

            CRITICAL INSTRUCTIONS:
            - Evaluate Apple's stock movement within the broader tech industry context and the overall market sentiment.
            - Consider broader market trends, including economic events, industry-wide shifts, or global developments that may indirectly impact Apple or the tech sector. 
            - If the headline does not pertain to Apple or tech but has potential ripple effects, assign a neutral sentiment (0.0) or assess if it has any relevant impact on market sentiment as a whole.
            - For ambiguous headlines, default to low scores (-0.1 to +0.1 range).
            - Return ONLY one value in JSON format like:
            {{ "SentimentScore": 0.7 }}
            """
,
            max_output_tokens=75,  # Label only
            temperature=0.5,      # More flexibility
            top_k=5,              # Limit to top 5 choices
            top_p=0.7,            # Consider tokens covering 70% probability mass
            response_mime_type='application/json',
            response_schema=SentimentTrendPrediction
          # No stop sequence to avoid premature stops
            )
        )
        
        # Check if the response is valid
        response_text = response.text.strip()
        if not response_text:
            print("Error: No response text found.")
            return 'Error' if not return_json else {}

        # Try parsing the response as JSON
        try:
            response_json = json.loads(response_text)
            if return_json:
                return response_json

            return response_json

        except json.JSONDecodeError:
            print(f"Error parsing JSON. Response: {response_text}")
            return 'Error' if not return_json else {}

    except Exception as e:
        print(f"API Error: {e}")
        return 'Error' if not return_json else {}

In [None]:
## Experiment with few-shot prompting
def analyze_sentiment_trend(text): #alter prompt for abstract
    prompt = f"""
    Analyze this financial headline and provide: 
    Sentiment Score: from -1.0 (extremely negative) to 1.0 (extremely positive)

    Return results in JSON format like:
    {{ "SentimentScore": X.X }}

    Here are some examples:

    Example 1: 
    Headline: "Company Close to Finalizing Its 40 billion dollar funding." 
    {{ "SentimentScore": 0.9}}

    Example 2: 
    Headline: "Regulatory authorities block 10% of funds for key agency in US-China Tech Race."
    {{ "SentimentScore": -0.8 }}

    Example 3: 
    Headline: "Why Company B could be a key to a Company C's Deal."
    {{ "SentimentScore": 0.2 }}

    Example 4: 
    Headline: "Artificial intelligence boom might help mitigate some tariff pain."
    {{ "SentimentScore": 0.3}}

    Example 5: 
    Headline: "Major banks face regulatory hurdles, impacting earnings outlook."
    {{ "SentimentScore": -0.6 }}

    Example 6: 
    Headline: "Company's $32 billion deal may signal a turning point for slow IPO, M&A markets."
    {{ "SentimentScore": 0.8 }}

    Example 7: 
    Headline: "Company X enters a strategic partnership with Company Y to expand its operations in Asia."
    Sentiment Score: 0.7
    {{ "SentimentScore": 0.7 }}

    Now, analyze this headline:

    Headline: "{text}"
    Response (in JSON format):
   {{ "SentimentScore": }}"""
    
    result = gemini_predict_sentiment_trend(prompt)
    return result

## Impact Duration

In [24]:
class DurationPrediction(BaseModel):
    PotentialImpactDays: int

In [25]:
# Function for duration prediction
def gemini_predict_duration(prompt, return_json=False):
    try:
        # Generate content from the model
        response = client.models.generate_content(
            model='gemini-2.0-flash-lite', 
            contents=prompt,
            config=types.GenerateContentConfig(
                system_instruction="""
                You are a financial market expert specializing in estimating the impact duration of financial news. Your task is to analyze each headline and estimate how long its impact will last on the stock market.

                Consider:
                - **1-3 days**: Short-lived news (minor developments, non-urgent reactions).
                - **4-7 days**: Moderately impactful news (earnings reports, key regulatory actions).
                - **8-14 days**: Significant developments (corporate mergers, product announcements).
                - **15-30 days**: Major structural changes (industry shifts, global economic impacts).

                For Apple-specific news, consider historical precedents for how similar news has impacted Apple's stock in the past. For market-wide or industry news, estimate how long the ripple effects might last, considering Apple's role in the tech sector.

                Return only the result in JSON format like:
                {{ "PotentialImpactDays": Z }}
                """,
                max_output_tokens=50,
                temperature=0.5,
                top_k=5,
                top_p=0.7,
                response_mime_type='application/json',
                response_schema=DurationPrediction
            )
        )
        
        # Check if the response is valid
        response_text = response.text.strip()
        if not response_text:
            print("Error: No response text found.")
            return 'Error' if not return_json else {}

        # Try parsing the response as JSON
        try:
            response_json = json.loads(response_text)
            if return_json:
                return response_json

            return response_json

        except json.JSONDecodeError:
            print(f"Error parsing JSON. Response: {response_text}")
            return 'Error' if not return_json else {}

    except Exception as e:
        print(f"API Error: {e}")
        return 'Error' if not return_json else {}

In [26]:
def analyze_duration(text):
    prompt = f"""
    Analyze this financial headline and estimate how many days (1-30) this news might impact the market.
    Consider: 
    - 1-3 days for short-lived news
    - 4-7 days for moderately impactful news 
    - 8-14 days for significant developments
    - 15-30 days for major structural changes or significant corporate events
    Additionally, for Apple-specific news (e.g., product launches, earnings results), consider the historical reaction of Apple's stock and the potential market sentiment based on previous similar events.

    For market-wide news (e.g., interest rate changes, regulatory updates), estimate how it will ripple through the tech sector and its potential impact on Apple stock.

    Return result in JSON format like:
    {{ "PotentialImpactDays": Z }}

    Examples:

    Example 1: 
    Headline: "Company close to finalizing its 40 billion dollar funding." 
    {{ "PotentialImpactDays": 14 }}

    Example 2: 
    Headline: "Regulatory authorities block 10% of funds for key agency in US-China Tech Race."
    {{ "PotentialImpactDays": 7 }}

    Example 3: 
    Headline: "Why Company B could be a key to a Company C's Deal."
    {{ "PotentialImpactDays": 3 }}

    Example 4: 
    Headline: "Artificial intelligence boom might help mitigate some tariff pain."
    {{ "PotentialImpactDays": 5 }}

    Example 5: 
    Headline: "Major banks face regulatory hurdles, impacting earnings outlook."
    {{ "PotentialImpactDays": 10 }}

    Example 6: 
    Headline: "Company's $32 billion deal may signal a turning point for slow IPO, M&A markets."
    {{ "PotentialImpactDays": 10 }}

    Now analyze this headline:
    Headline: "{text}"
    """
    
    result = gemini_predict_duration(prompt)
    return result

## Joining data

In [31]:
from tqdm import tqdm

def process_headlines_with_separate_models(df, sentiment_column):
    # sentiment_list = []
    sentiment_score_list = []
    # trend_strength_list = []
    impact_days_list = []
    
    # Process with first model (sentiment and trend)
    for idx, headline in tqdm(enumerate(df[sentiment_column]), total=len(df), desc="Processing Sentiment", unit="headline"):
        # Get sentiment and trend predictions
        result = analyze_sentiment_trend(headline)
        
        if isinstance(result, dict) and 'SentimentScore' in result:
            sentiment_score = result['SentimentScore']
            
            # Store values
            sentiment_score_list.append(sentiment_score)
        #     # Determine categorical sentiment
        #     if sentiment_score == 1.0:
        #         sentiment = 'Extremely Positive'
        #     elif sentiment_score >= 0.8:
        #         sentiment = 'Very Positive'
        #     elif sentiment_score >= 0.5:
        #         sentiment = 'Moderately Positive'
        #     elif sentiment_score >= 0.2:
        #         sentiment = 'Slightly Positive'
        #     elif sentiment_score == 0.0:
        #         sentiment = 'Neutral'
        #     elif sentiment_score >= -0.2:
        #         sentiment = 'Slightly Negative'
        #     elif sentiment_score >= -0.5:
        #         sentiment = 'Moderately Negative'
        #     elif sentiment_score >= -0.8:
        #         sentiment = 'Very Negative'
        #     else:
        #         sentiment = 'Extremely Negative'
            
        #     sentiment_list.append(sentiment)
        # else:
        #     # Handle error case
        #     sentiment_list.append('Error')
        #     sentiment_score_list.append(None)
        #     trend_strength_list.append(None)
    
    # Add sentiment and trend to DataFrame
    # df['gemini_sentiment'] = sentiment_list
    df['sentiment_score'] = sentiment_score_list
    # df['trend_strength'] = trend_strength_list
    
    # Process with second model (duration)
    for idx, headline in tqdm(enumerate(df[sentiment_column]), total=len(df), desc="Processing Duration", unit="headline"):
        # Get duration prediction
        result = analyze_duration(headline)
        
        if isinstance(result, dict) and 'PotentialImpactDays' in result:
            impact_days = result['PotentialImpactDays']
            impact_days_list.append(impact_days)
        else:
            # Handle error case
            impact_days_list.append(None)
    
    # Add duration to DataFrame
    df['impact_days'] = impact_days_list
    
    return df

# Apply the separate models
df_analyzed = process_headlines_with_separate_models(df_new, 'headline')

# Display results
print(df_analyzed[['pub_date', 'headline', 'sentiment_score', 'impact_days']])

# Save to CSV
df_analyzed.to_csv('gemini_headline_features_predictions.csv', index= True)

Processing Sentiment: 100%|██████████| 2143/2143 [38:39<00:00,  1.08s/headline]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_score'] = sentiment_score_list
Processing Duration:   0%|          | 7/2143 [00:12<1:05:22,  1.84s/headline]


KeyboardInterrupt: 

In [29]:
df_analyzed.head()

Unnamed: 0,pub_date,headline,sentiment_score,impact_days
0,2015-04-07,mba programs that get you where you want to go,0.0,3
1,2015-04-14,what were reading,0.0,1
2,2015-04-13,ibm creates watson health to analyze medical data,0.4,8
3,2015-04-22,whats that on beyoncs wrist let me guess an ap...,0.1,1
4,2015-04-01,daily report tech leaders come together to opp...,-0.2,7


# Archive to be ignored

In [10]:
from pydantic import BaseModel

class SentimentTrendPrediction(BaseModel):
    SentimentScore: float
    TrendStrength: float    # Predicted price as a string (could be float)

In [11]:
## Setting up Gemini
#set up gemini to process textual templates
from google import genai
from dotenv import load_dotenv
from pydantic import BaseModel
import os
load_dotenv()
# Load environment variables from the .env file
API_KEY = os.getenv("API_KEY")
client = genai.Client(api_key=API_KEY)#insert api key
## Function to call Gemini API
from google.genai import types
import json

# Function to get sentiment based on model output
def gemini_predict_sentiment_trend(prompt, return_json=False):
    try:
        # Generate content from the model
        response = client.models.generate_content(
            model='gemini-2.0-flash-lite', 
            contents=prompt,
            config=types.GenerateContentConfig(
            system_instruction= f"""
            You are a financial analyst specializing in both quantitative market sentiment and trend analysis. Your task is to analyze each headline and provide two precise metrics:

            1. **Sentiment Score (-1.0 to 1.0)**:
            - -1.0: Extremely negative news (bankruptcy, massive fraud, severe regulatory action)
            - 0.0: Neutral or balanced news (mixed results, status quo maintained)
            - 1.0: Extremely positive news (breakthrough products, industry-changing acquisitions)
            - Consider both **direct effects on Apple’s stock price** and **general market or sector trends**.

            2. **Trend Strength (-1.0 to 1.0)**:
            - -1.0: Strong downward trend (systemic issues, prolonged negative impact)
            - 0.0: Neutral impact (unlikely to drive directional market movement)
            - 1.0: Strong upward trend (transformative positive developments)
            - Evaluate whether the headline indicates a short-term price reaction or a long-term trend based on Apple’s role in the tech sector**.

            CRITICAL INSTRUCTIONS:
            - Differentiate Sentiment (emotional tone) vs. Trend Strength (market momentum).
            - Evaluate Apple's stock movement within the broader tech industry context and the overall market sentiment.
            - For ambiguous headlines, default to moderate scores (-0.5 to +0.5 range).
            - Return ONLY the two values in JSON format like:
            {{ "SentimentScore": 0.5, "TrendStrength": 0.6 }}
            """
,
            max_output_tokens=75,  # Label only
            temperature=0.5,      # More flexibility
            top_k=5,              # Limit to top 5 choices
            top_p=0.7,            # Consider tokens covering 70% probability mass
            response_mime_type='application/json',
            response_schema=SentimentTrendPrediction
          # No stop sequence to avoid premature stops
            )
        )
        
        # Check if the response is valid
        response_text = response.text.strip()
        if not response_text:
            print("Error: No response text found.")
            return 'Error' if not return_json else {}

        # Try parsing the response as JSON
        try:
            response_json = json.loads(response_text)
            if return_json:
                return response_json

            return response_json

        except json.JSONDecodeError:
            print(f"Error parsing JSON. Response: {response_text}")
            return 'Error' if not return_json else {}

    except Exception as e:
        print(f"API Error: {e}")
        return 'Error' if not return_json else {}

## For Headlines

In [12]:
## Experiment with few-shot prompting
def analyze_sentiment_trend(text): #alter prompt for abstract
    prompt = f"""
    Analyze this financial headline and provide two metrics: 
    1. Sentiment Score: from -1.0 (extremely negative) to 1.0 (extremely positive)
    2. Trend Strength: from -1.0 (strong downward trend) to 1.0 (strong upward trend)

    Return results in JSON format like:
    {{ "SentimentScore": X.X, "TrendStrength": Y.Y }}

    Here are some examples with SentimentScore and TrendStrength:

    Example 1: 
    Headline: "Company Close to Finalizing Its 40 billion dollar funding." 
    {{ "SentimentScore": 0.9, "TrendStrength": 0.8 }}

    Example 2: 
    Headline: "Regulatory authorities block 10% of funds for key agency in US-China Tech Race."
    {{ "SentimentScore": -0.8, "TrendStrength": -0.6 }}

    Example 3: 
    Headline: "Why Company B could be a key to a Company C's Deal."
    {{ "SentimentScore": 0.2, "TrendStrength": 0.1 }}

    Example 4: 
    Headline: "Artificial intelligence boom might help mitigate some tariff pain."
    {{ "SentimentScore": 0.3, "TrendStrength": 0.2 }}

    Example 5: 
    Headline: "Major banks face regulatory hurdles, impacting earnings outlook."
    {{ "SentimentScore": -0.6, "TrendStrength": -0.5 }}

    Example 6: 
    Headline: "Company's $32 billion deal may signal a turning point for slow IPO, M&A markets."
    {{ "SentimentScore": 0.8, "TrendStrength": 0.7 }}

    Example 7: 
    Headline: "Company X enters a strategic partnership with Company Y to expand its operations in Asia."
    Sentiment Score: 0.7
    {{ "SentimentScore": 0.7, "TrendStrength": 0.6 }}

    Now, analyze this headline:

    Headline: "{text}"
    Response (in JSON format):
   {{ "SentimentScore": , "TrendStrength": }}"""
    
    result = gemini_predict_sentiment_trend(prompt)
    return result

### Function for news impact duration

In [13]:
class DurationPrediction(BaseModel):
    PotentialImpactDays: int

In [14]:
# Function for duration prediction
def gemini_predict_duration(prompt, return_json=False):
    try:
        # Generate content from the model
        response = client.models.generate_content(
            model='gemini-2.0-flash-lite', 
            contents=prompt,
            config=types.GenerateContentConfig(
                system_instruction="""
                You are a financial market expert specializing in estimating the impact duration of financial news. Your task is to analyze each headline and estimate how long its impact will last on the stock market.

                Consider:
                - **1-3 days**: Short-lived news (minor developments, non-urgent reactions).
                - **4-7 days**: Moderately impactful news (earnings reports, key regulatory actions).
                - **8-14 days**: Significant developments (corporate mergers, product announcements).
                - **15-30 days**: Major structural changes (industry shifts, global economic impacts).

                For Apple-specific news, consider historical precedents for how similar news has impacted Apple's stock in the past. For market-wide or industry news, estimate how long the ripple effects might last, considering Apple's role in the tech sector.

                Return only the result in JSON format like:
                {{ "PotentialImpactDays": Z }}
                """,
                max_output_tokens=50,
                temperature=0.5,
                top_k=5,
                top_p=0.7,
                response_mime_type='application/json',
                response_schema=DurationPrediction
            )
        )
        
        # Check if the response is valid
        response_text = response.text.strip()
        if not response_text:
            print("Error: No response text found.")
            return 'Error' if not return_json else {}

        # Try parsing the response as JSON
        try:
            response_json = json.loads(response_text)
            if return_json:
                return response_json

            return response_json

        except json.JSONDecodeError:
            print(f"Error parsing JSON. Response: {response_text}")
            return 'Error' if not return_json else {}

    except Exception as e:
        print(f"API Error: {e}")
        return 'Error' if not return_json else {}

In [15]:
def analyze_duration(text):
    prompt = f"""
    Analyze this financial headline and estimate how many days (1-30) this news might impact the market.
    Consider: 
    - 1-3 days for short-lived news
    - 4-7 days for moderately impactful news 
    - 8-14 days for significant developments
    - 15-30 days for major structural changes or significant corporate events
    Additionally, for Apple-specific news (e.g., product launches, earnings results), consider the **historical reaction of Apple's stock and the potential market sentiment based on previous similar events.

    For market-wide news (e.g., interest rate changes, regulatory updates), estimate how it will ripple through the tech sector and its potential impact on Apple stock.

    Return result in JSON format like:
    {{ "PotentialImpactDays": Z }}

    Examples:

    Example 1: 
    Headline: "Company close to finalizing its 40 billion dollar funding." 
    {{ "PotentialImpactDays": 14 }}

    Example 2: 
    Headline: "Regulatory authorities block 10% of funds for key agency in US-China Tech Race."
    {{ "PotentialImpactDays": 7 }}

    Example 3: 
    Headline: "Why Company B could be a key to a Company C's Deal."
    {{ "PotentialImpactDays": 3 }}

    Example 4: 
    Headline: "Artificial intelligence boom might help mitigate some tariff pain."
    {{ "PotentialImpactDays": 5 }}

    Example 5: 
    Headline: "Major banks face regulatory hurdles, impacting earnings outlook."
    {{ "PotentialImpactDays": 10 }}

    Example 6: 
    Headline: "Company's $32 billion deal may signal a turning point for slow IPO, M&A markets."
    {{ "PotentialImpactDays": 10 }}

    Now analyze this headline:
    Headline: "{text}"
    """
    
    result = gemini_predict_duration(prompt)
    return result

In [16]:
from tqdm import tqdm

def process_headlines_with_separate_models(df, sentiment_column):
    sentiment_list = []
    sentiment_score_list = []
    trend_strength_list = []
    impact_days_list = []
    
    # Process with first model (sentiment and trend)
    for idx, headline in tqdm(enumerate(df[sentiment_column]), total=len(df), desc="Processing Sentiment & Trend", unit="headline"):
        # Get sentiment and trend predictions
        result = analyze_sentiment_trend(headline)
        
        if isinstance(result, dict) and 'SentimentScore' in result:
            sentiment_score = result['SentimentScore']
            trend_strength = result['TrendStrength']
            
            # Store values
            sentiment_score_list.append(sentiment_score)
            trend_strength_list.append(trend_strength)
            
            # Determine categorical sentiment
            if sentiment_score == 1.0:
                sentiment = 'Extremely Positive'
            elif sentiment_score >= 0.8:
                sentiment = 'Very Positive'
            elif sentiment_score >= 0.5:
                sentiment = 'Moderately Positive'
            elif sentiment_score >= 0.2:
                sentiment = 'Slightly Positive'
            elif sentiment_score == 0.0:
                sentiment = 'Neutral'
            elif sentiment_score >= -0.2:
                sentiment = 'Slightly Negative'
            elif sentiment_score >= -0.5:
                sentiment = 'Moderately Negative'
            elif sentiment_score >= -0.8:
                sentiment = 'Very Negative'
            else:
                sentiment = 'Extremely Negative'
            
            sentiment_list.append(sentiment)
        else:
            # Handle error case
            sentiment_list.append('Error')
            sentiment_score_list.append(None)
            trend_strength_list.append(None)
    
    # Add sentiment and trend to DataFrame
    df['gemini_sentiment'] = sentiment_list
    df['sentiment_score'] = sentiment_score_list
    df['trend_strength'] = trend_strength_list
    
    # Process with second model (duration)
    for idx, headline in tqdm(enumerate(df[sentiment_column]), total=len(df), desc="Processing Duration", unit="headline"):
        # Get duration prediction
        result = analyze_duration(headline)
        
        if isinstance(result, dict) and 'PotentialImpactDays' in result:
            impact_days = result['PotentialImpactDays']
            impact_days_list.append(impact_days)
        else:
            # Handle error case
            impact_days_list.append(None)
    
    # Add duration to DataFrame
    df['impact_days'] = impact_days_list
    
    return df

# Apply the separate models
df_analyzed = process_headlines_with_separate_models(df_new, 'headline')

# Display results
print(df_analyzed[['pub_date', 'headline', 'gemini_sentiment', 'sentiment_score', 'trend_strength', 'impact_days']])

# Save to CSV
df_analyzed.to_csv('gemini_headline_features_predictions.csv')

Processing Sentiment & Trend: 100%|██████████| 10/10 [00:10<00:00,  1.00s/headline]
Processing Duration:  20%|██        | 2/10 [00:02<00:11,  1.38s/headline]


KeyboardInterrupt: 