In [1]:
#>=50% of annotators agreed on the financial sentiment 
from pathlib import Path
import pandas as pd

file_path = r'C:/Users/Tammy/Documents/GitHub/multimodal_stockprice_prediction/data/clean/stock_text_data/Apple_Inc_text_data.csv'
# Load the CSV file with the specified encoding and column names
df = pd.read_csv(file_path,encoding='utf-8')

import re
# Cleaning function
def clean_text(text):
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and special characters (except spaces)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [2]:
# Check the first few rows of the data
df.head()

Unnamed: 0,pub_date,abstract,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
0,2015-04-07T09:17:59+0000,"Want to work at Amazon, Apple or McKinsey? Som...","With some 13,000 graduate schools of business ...",M.B.A. Programs That Get You Where You Want to Go,article,Education,News,7,https://www.nytimes.com/2015/04/12/education/e...
1,2015-04-14T20:46:01+0000,Get recommendations from New York Times report...,Get recommendations from New York Times report...,What We’re Reading,article,Blogs,News,13,https://news.blogs.nytimes.com/2015/04/14/what...
2,2015-04-13T22:00:31+0000,The business unit will partner with companies ...,IBM is taking its Watson artificial-intelligen...,IBM Creates Watson Health to Analyze Medical Data,article,Technology,News,8,https://bits.blogs.nytimes.com/2015/04/13/ibm-...
3,2015-04-22T19:42:26+0000,"With superstars first in line, Apple appears t...","Two weeks ago, Pharrell Williams posted an Ins...",What’s That on Beyoncé’s Wrist? Let Me Guess ....,article,Style,News,1,https://www.nytimes.com/2015/04/23/style/whats...
4,2015-04-01T10:21:53+0000,"In an industry that avoids controversy, the he...",The technology industry’s leaders have found t...,Daily Report: Tech Leaders Come Together to Op...,article,Technology,News,3,https://bits.blogs.nytimes.com/2015/04/01/dail...


In [3]:
#reformat date
# format datetime again
df['pub_date'] = pd.to_datetime(df['pub_date']) 
df['pub_date'] = df['pub_date'].dt.date

In [4]:
#cleaning
df['abstract'] = df['abstract'].apply(clean_text) # change this line for abstract

In [5]:
# Select only the 'pub_date' and 'headline' columns
df_new = df[['pub_date', 'abstract']] #change this line for abstract
  
# Check the new DataFrame
df_new.head()


Unnamed: 0,pub_date,abstract
0,2015-04-07,want to work at amazon apple or mckinsey some ...
1,2015-04-14,get recommendations from new york times report...
2,2015-04-13,the business unit will partner with companies ...
3,2015-04-22,with superstars first in line apple appears to...
4,2015-04-01,in an industry that avoids controversy the hea...


In [6]:
#set up gemini to process textual templates
from google import genai
from dotenv import load_dotenv
import os
load_dotenv()
# Load environment variables from the .env file
API_KEY = os.getenv("API_KEY")
client = genai.Client(api_key=API_KEY)#insert api key

In [7]:
system_instruction = (
    "You are a financial analyst and an expert in detecting sentiment in financial news."
)



## Function to call Gemini API

In [8]:
from google.genai import types
import json
# Function to get sentiment based on model output
def gemini_predict(prompt):
    try:
        # Generate content from the model
        response = client.models.generate_content(
            model='gemini-2.0-flash-lite', 
            contents=prompt,
            config=types.GenerateContentConfig(
            system_instruction=system_instruction,
            max_output_tokens=60,  # Label only
            temperature=0.5,      # More flexibility
            top_k=5,              # Limit to top 5 choices
            top_p=0.7,            # Consider tokens covering 70% probability mass
            response_mime_type='application/json',
            stop_sequences=['}']   # No stop sequence to avoid premature stops
            )
        )
        
        response_text = response.text.strip()
        
        # Attempt to fix incomplete JSON by appending a missing closing brace
        if not response_text.endswith('}'):
            response_text += '}'
        
        # Attempt to parse JSON
        try:
            response_json = json.loads(response_text)  # Parse JSON
            sentiment = response_json.get("Sentiment", "").strip()
            
            if sentiment not in ['Positive', 'Negative', 'Neutral']:
                sentiment = 'Neutral'
            
            return sentiment
        
        except json.JSONDecodeError:
            print(f"JSON Parsing Error. Response: {response_text}")
            return 'Error'
    
    except Exception as e:
        print(f"API Error: {e}")
        return 'Error'

## Experiment with few-shot prompting

In [9]:

def find_sentiment_few_shot(text):
    prompt = f"""
    Classify the sentiment of the financial abstract as 'Positive', 'Negative', or 'Neutral'. 
    Use 'Neutral' only if truly unclear. Be decisive—choose 'Positive' or 'Negative' whenever possible.

    Return the result in JSON format:

    Example:
    {{ "Sentiment": "Positive" }}

    Example 1:
    Abstract: "Company Close to Finalizing Its 40 billion dollar funding." 
    Sentiment: Positive

    Example 2:
    Abstract: "Trump blocks 10% of funds for key agency in US-China Tech Race."
    Sentiment: Negative

    Example 3:
    Abstracte: "Why Company B could be a key to a Company C's Deal."
    Sentiment: Neutral

    Example 4:
    Abstract: "Artificial intelligence boom might help mitigate some tariff pain."
    Sentiment: Neutral

    Example 5:
    Abstract: "Major banks face regulatory hurdles, impacting earnings outlook."
    Sentiment: Negative

    Example 6:
    Abstract: "Company’s $32 billion deal may signal a turning point for slow IPO, M&A markets."
    Sentiment: Positive

    Example 7:
    Abstract: "Company X enters a strategic partnership with Company Y to expand its operations in Asia."
    Sentiment: Positive

    Abstract: "{text}"
    Response (in JSON format):
    {{ "Sentiment":"""
    
    sentiment = gemini_predict(prompt)
    return sentiment


In [10]:
def apply_without_delay(df, sentiment_column):
    sentiment_list = []
    
    for idx, abstract in enumerate(df[sentiment_column]):
        print(f"Processing {idx + 1}/{len(df)}: {abstract}")  # Progress indicator
        
        sentiment = find_sentiment_few_shot(abstract).lower()  # Gemini prediction in lowercase
        sentiment_list.append(sentiment)
    
    # Add Gemini's predictions while keeping original labels intact
    df['gemini_sentiment'] = sentiment_list
    return df


In [11]:
df_new =  apply_without_delay(df_new, 'abstract')
print(df_new[['pub_date', 'abstract', 'gemini_sentiment']])

Processing 1/2143: want to work at amazon apple or mckinsey some business schools have impressive records placing graduates in certain fields and even companies
Processing 2/2143: get recommendations from new york times reporters and editors highlighting interesting stories from around the web in this installment great reads from stacy cowley quentin hardy and others
Processing 3/2143: the business unit will partner with companies including apple medtronic and johnson johnson offering ibms watson technology as a cloudbased tool at many levels of the health care industry
Processing 4/2143: with superstars first in line apple appears to be seeking exclusivity by limiting where its watches are sold and who wears them
Processing 5/2143: in an industry that avoids controversy the heads of several prominent companies including apple and salesforcecom have chosen to pick a battle
Processing 6/2143: spending seven days learning to use and getting used to the apple watch proved to be a mostly r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gemini_sentiment'] = sentiment_list


In [13]:
df_new['gemini_sentiment'].value_counts()
## Save results to csv

# df_new.to_csv('gemini_sentiment_analysis_results_AAPL_abstract.csv', index=False)


gemini_sentiment
neutral     1073
negative     766
positive     304
Name: count, dtype: int64