In [13]:
import yfinance as yf
import pandas as pd
import numpy as np
import jsonlines
import nltk
from datetime import datetime, timedelta
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm  # Progress bar for loops

In [None]:
# SentimentIntensityAnalyzer relies on the VADER lexicon; nltk.download fetches
# it into the local NLTK data directory.
nltk.download("vader_lexicon")

# Shared analyzer instance for the notebook.
sia = SentimentIntensityAnalyzer()


In [None]:
# Illustrative example of one training record: a dict with a single "text" key
# holding a "User: ..." prompt followed by an "Assistant: ..." answer.
# NOTE(review): `\<` is not a recognised Python escape sequence and the opening
# tag appears twice ("<think><think>") - presumably "<think></think>" was
# intended; confirm before relying on this example.
{
    "text": "User: Here is stock data for AAPL. Last 10 days price changes: [-0.52, 1.21, -0.75, 2.50, -1.02, ...]%. Volume changes: [3.45, -1.23, 0.98, -2.45, 5.67, ...]%.  Recent news sentiment: [{'headline': 'None', 'days_ago': 'None', 'sentiment': 'None'}]. What is the expected price change in 3 days?\n\<think><think>\n\nAssistant: The stock price is expected to change by 2.12% in 3 days."
}


In [1]:
import logging
import time
from datetime import datetime, timedelta
from typing import Optional, Dict, Any, List
from urllib.parse import quote_plus

import pandas as pd
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm  # Progress bar for loops

# Log file location
LOG_FILE = "search_log.txt"

# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object for the same name, so the
# handlers attached by an earlier run of this cell are still present. Detach and
# close them first; otherwise every re-run adds another console/file pair and
# each message is emitted several times.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create console handler and set level to info
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

# Create file handler and set level to info
file_handler = logging.FileHandler(LOG_FILE, encoding='utf-8')
file_handler.setLevel(logging.INFO)

# Create formatter and add it to the handlers
formatter = logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Add the handlers to the logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)

# List of search sources in priority order
SEARCH_SOURCES: List[str] = [
    "finance.yahoo.com",
    "investing.com",
    "bloomberg.com"
]

def generate_bing_search_url(stock_symbol: str, target_date: datetime, search_source: str) -> str:
    """
    Generates a Bing search URL that matches the stock news date format for a specific source.

    The query has the form ``<symbol> stock news site:<source> "<Mon DD, YYYY>"``.
    The whole query is percent-encoded in one step, so spaces become ``+`` and
    reserved characters in any component (``:``, ``"``, ``,``, ``&``, ...) are
    escaped and cannot break the query string.

    Args:
        stock_symbol (str): The stock symbol to search for.
        target_date (datetime): The date of the news.
        search_source (str): The news source (domain) to restrict the search to.

    Returns:
        str: A URL string for the Bing search.
    """
    # Date as a quoted phrase, e.g. "Jan 23, 2025" (day is zero-padded by %d).
    date_str: str = target_date.strftime('%b %d, %Y')

    # Build the human-readable query first, then encode it as a single value.
    raw_query: str = f'{stock_symbol} stock news site:{search_source} "{date_str}"'
    encoded_query: str = quote_plus(raw_query)

    return f"https://www.bing.com/search?q={encoded_query}&setlang=en"

def log_search_attempt(search_source: str, search_url: str, attempt: int, news_found: bool = False, 
                       news_data: Optional[Dict[str, Any]] = None) -> None:
    """
    Logs search attempts and whether news was found.

    The message is written through the module ``logger`` at INFO level. The
    success message is used only when ``news_found`` is true and ``news_data``
    is provided; every other combination logs the "no news" message.

    Args:
        search_source (str): The news source used in the search.
        search_url (str): The Bing search URL that was queried.
        attempt (int): The current attempt number (0-indexed).
        news_found (bool, optional): Flag indicating if news was found. Defaults to False.
        news_data (Optional[Dict[str, Any]], optional): The data of the found news
            (its 'title' and 'link' entries are logged). Defaults to None.
    """
    timestamp: str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    log_message: str
    if news_found and news_data is not None:
        log_message = (
            f"[{timestamp}] ✅ News Found on {search_source}:\n"
            f"Title: {news_data.get('title')}\n"
            f"URL: {news_data.get('link')}\n"
        )
    else:
        # The denominator follows SEARCH_SOURCES so the message stays correct
        # when sources are added or removed (it was previously hard-coded to 3).
        log_message = (
            f"[{timestamp}] ❌ No news found in Attempt {attempt+1}/{len(SEARCH_SOURCES)} - {search_source}:\n"
            f"{search_url}\n"
        )

    logger.info(log_message)

def fetch_stock_news(stock_symbol: str, target_date: datetime) -> pd.DataFrame:
    """
    Fetches stock news from Yahoo Finance, Investing.com, and Bloomberg using Bing search.
    Stops at the first successful match and logs results.

    The sources in SEARCH_SOURCES are tried in order. A Bing result is accepted
    when its date text starts with the target date formatted as "Nov 01, 2024"
    and the upper-cased stock symbol occurs in the result's title or snippet.

    Args:
        stock_symbol (str): The stock symbol to search for.
        target_date (datetime): The date for which to fetch the news.

    Returns:
        pd.DataFrame: A single-row DataFrame with the columns
        stock_symbol, date, source, title, link, snippet. When nothing matches,
        source/title/link/snippet are None.
    """
    result_columns: List[str] = ['stock_symbol', 'date', 'source', 'title', 'link', 'snippet']

    # These values do not change between results, so compute them once.
    expected_date: str = target_date.strftime('%b %d, %Y')  # e.g. "Nov 01, 2024"
    iso_date: str = target_date.strftime('%Y-%m-%d')
    symbol_upper: str = stock_symbol.upper()

    # Initialize the undetected ChromeDriver
    options = uc.ChromeOptions()
    options.headless = True  # Required for cloud environments
    driver = uc.Chrome(options=options)

    # try/finally guarantees the browser is shut down even if navigation or
    # parsing raises; otherwise every failure would leave a Chrome process behind.
    try:
        for attempt, search_source in enumerate(SEARCH_SOURCES):
            # Generate the Bing search URL for the current source
            search_url: str = generate_bing_search_url(stock_symbol, target_date, search_source)

            # Navigate to Bing Search
            driver.get(search_url)

            # Wait for search results to load
            time.sleep(5)

            # Parse the page source with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract search results from Bing
            for item in soup.find_all('li', class_='b_algo'):
                title_element = item.find('h2')
                link_element = item.find('a', href=True)
                snippet_element = item.find('p', class_='b_lineclamp2')  # Check for date inside <p>
                date_element = item.find('span', class_='news_dt')  # Existing date check

                # Extract title, link, and snippet text
                title: Optional[str] = title_element.get_text() if title_element else None
                snippet: Optional[str] = snippet_element.get_text() if snippet_element else None
                link: Optional[str] = link_element['href'] if link_element else None

                # Double check date (first from <span>, then from <p>)
                extracted_date: Optional[str] = None
                if date_element:
                    extracted_date = date_element.text.strip()
                elif snippet:
                    # Example: "ByReuters • Nov 01, 2024 Earnings call..."
                    words = snippet.split("•")
                    if len(words) > 1:
                        extracted_date = words[1].strip()

                # The text after "•" continues with the article snippet, so an
                # equality test could never succeed for that branch; compare the
                # leading characters against the expected date instead.
                date_matches: bool = extracted_date is not None and extracted_date.startswith(expected_date)
                symbol_present: bool = symbol_upper in ((title or "") + (snippet or ""))

                if date_matches and symbol_present:
                    news_data: Dict[str, Any] = {
                        'stock_symbol': stock_symbol,
                        'date': iso_date,
                        'source': search_source,
                        'title': title,
                        'link': link,
                        'snippet': snippet
                    }
                    log_search_attempt(search_source, search_url, attempt, news_found=True, news_data=news_data)
                    return pd.DataFrame([news_data], columns=result_columns)

            # If no news found for this source, log it
            log_search_attempt(search_source, search_url, attempt, news_found=False)
    finally:
        driver.quit()

    # No source produced a match: return a placeholder row so that every
    # requested date is still represented in the combined output.
    return pd.DataFrame([{
        'stock_symbol': stock_symbol,
        'date': iso_date,
        'source': None,
        'title': None,
        'link': None,
        'snippet': None
    }], columns=result_columns)

def collect_stock_news_for_range(stock_symbol: str, start_date: datetime, end_date: datetime) -> pd.DataFrame:
    """
    Loops through a date range to collect stock news for each day in the correct order.

    One call to fetch_stock_news is made per calendar day between start_date and
    end_date (both inclusive), in ascending date order.

    Args:
        stock_symbol (str): The stock symbol to search for.
        start_date (datetime): The start date of the range.
        end_date (datetime): The end date of the range.

    Returns:
        pd.DataFrame: A DataFrame containing all collected news data. When the
        range is empty (start_date after end_date) an empty DataFrame with the
        news columns is returned.
    """
    # pd.date_range already yields the days in ascending order.
    date_range: List[pd.Timestamp] = pd.date_range(start=start_date, end=end_date).to_list()

    # pd.concat raises ValueError on an empty list, so handle the empty range
    # explicitly instead of crashing.
    if not date_range:
        return pd.DataFrame(columns=['stock_symbol', 'date', 'source', 'title', 'link', 'snippet'])

    all_news: List[pd.DataFrame] = [
        fetch_stock_news(stock_symbol, single_date)
        for single_date in tqdm(date_range, desc="Collecting News")
    ]

    # Combine all collected DataFrames
    return pd.concat(all_news, ignore_index=True)


## Fetching News Data for a Single Stock


In [None]:
# Collect AAPL news for every calendar day in the range below.
stock_symbol = "AAPL"
start_date, end_date = datetime(2024, 12, 1), datetime(2025, 2, 1)

df_all_news = collect_stock_news_for_range(
    stock_symbol=stock_symbol,
    start_date=start_date,
    end_date=end_date,
)
df_all_news


## Fetching and Saving Multiple Stock News Datasets

In [4]:
# (ticker, company name) pairs; only the tickers end up in TOP_TECH_STOCKS.
_TECH_COMPANIES = (
    ("AAPL", "Apple Inc."),
    ("MSFT", "Microsoft Corp."),
    ("NVDA", "Nvidia Corp."),
    ("TSLA", "Tesla Inc."),
    ("AMD", "Advanced Micro Devices"),
    ("ADBE", "Adobe Inc."),
    ("CRM", "Salesforce Inc."),
    ("NOW", "ServiceNow Inc."),
    ("INTU", "Intuit Inc."),
    ("TXN", "Texas Instruments Inc."),
)

# Ticker symbols, in the order listed above.
TOP_TECH_STOCKS = [ticker for ticker, _company in _TECH_COMPANIES]


In [None]:
import os
from datetime import datetime
import logging
from typing import List

# Module logger (same named logger object wherever getLogger(__name__) is called
# within this notebook).
logger = logging.getLogger(__name__)

# Directory to save CSV files
SAVE_DIR: str = "tech_stock_news"

# Ensure the directory exists. exist_ok=True makes this a single call that is
# safe to re-run and avoids the gap between a separate existence check and the
# creation.
os.makedirs(SAVE_DIR, exist_ok=True)

def collect_and_save_news_for_tech_stocks(stock_list: List[str], start_date: datetime, end_date: datetime) -> None:
    """
    Collect news for each symbol in turn and write one CSV per symbol.

    Each file is named "<symbol>_news.csv" and is written into SAVE_DIR without
    the DataFrame index, encoded as UTF-8. Progress is reported through the
    module logger.

    Args:
        stock_list (List[str]): Stock symbols to process, in order.
        start_date (datetime): First date of the collection range.
        end_date (datetime): Last date of the collection range.
    """
    for ticker in stock_list:
        logger.info(f"📊 Fetching news for {ticker}...")

        # Gather the news rows for this ticker over the whole range.
        ticker_news = collect_stock_news_for_range(ticker, start_date, end_date)

        # Persist them next to the other tickers' files.
        target_path: str = os.path.join(SAVE_DIR, f"{ticker}_news.csv")
        ticker_news.to_csv(target_path, index=False, encoding="utf-8")

        logger.info(f"✅ {ticker} news saved to {target_path}")

# Collection window for the multi-stock run.
start_date: datetime = datetime(2023, 1, 1)
end_date: datetime = datetime(2025, 2, 1)

# One CSV per ticker in TOP_TECH_STOCKS.
collect_and_save_news_for_tech_stocks(
    stock_list=TOP_TECH_STOCKS,
    start_date=start_date,
    end_date=end_date,
)


## Historical Price Data Preparation


In [35]:
import yfinance as yf
import pandas as pd
from typing import Any

class StockDataPipeline:
    """
    Prepare daily price data for one stock and derive technical indicators.

    The pipeline keeps a single DataFrame in ``self.data``. Every step adds a
    column to that frame in place and returns the same frame, so the steps can
    be called individually or all together through ``run_pipeline``.
    """

    def __init__(self, symbol: str, start_date: str, end_date: str):
        """
        Initialize the pipeline with the stock symbol and date range.

        Args:
            symbol (str): Stock symbol (e.g., 'AAPL').
            start_date (str): Start date in 'YYYY-MM-DD' format.
            end_date (str): End date in 'YYYY-MM-DD' format.
        """
        self.symbol = symbol
        self.start_date = start_date
        self.end_date = end_date
        # Empty until fetch_daily_stock_data() (or the caller) fills it.
        self.data: pd.DataFrame = pd.DataFrame()

    def fetch_daily_stock_data(self) -> pd.DataFrame:
        """
        Fetch daily stock data (High, Low, Close, Volume) from yfinance.

        Returns:
            pd.DataFrame: DataFrame containing the stock data.
        """
        ticker: Any = yf.Ticker(self.symbol)
        history: pd.DataFrame = ticker.history(start=self.start_date, end=self.end_date)
        # Copy so that later column assignments do not act on a slice of `history`.
        self.data = history[['High', 'Low', 'Close', 'Volume']].copy()
        return self.data

    def add_daily_percentage_change(self, price_col: str = "Close", new_col: str = "Pct_Change") -> pd.DataFrame:
        """
        Add a column for daily percentage change based on the specified price column.

        Args:
            price_col (str): Column name for prices (default 'Close').
            new_col (str): Name for the new percentage change column (default 'Pct_Change').

        Returns:
            pd.DataFrame: DataFrame with the new percentage change column.
        """
        # pct_change() is a fraction; scale to percent. The first row has no
        # predecessor and is therefore NaN.
        self.data[new_col] = self.data[price_col].pct_change() * 100
        return self.data

    def add_volume_change(self, volume_col: str = "Volume", new_col: str = "Volume_Change") -> pd.DataFrame:
        """
        Add a column for daily volume percentage change based on the specified volume column.

        Args:
            volume_col (str): Column name for volume (default 'Volume').
            new_col (str): Name for the new volume change column (default 'Volume_Change').

        Returns:
            pd.DataFrame: DataFrame with the new volume change column.
        """
        self.data[new_col] = self.data[volume_col].pct_change() * 100
        return self.data

    def calculate_rsi(self, price_col: str = "Close", window: int = 14, new_col: str = "RSI") -> pd.DataFrame:
        """
        Calculate the Relative Strength Index (RSI) for the given price column.

        Average gain and average loss are simple rolling means over `window`
        rows. Rows without a full window of price differences are NaN.

        Args:
            price_col (str): Column name for prices (default 'Close').
            window (int): Look-back period for RSI (default 14).
            new_col (str): Name for the RSI column (default 'RSI').

        Returns:
            pd.DataFrame: DataFrame with the RSI column added.
        """
        delta = self.data[price_col].diff()
        # Split the differences into non-negative gains and non-negative losses.
        gain = delta.clip(lower=0)
        loss = -delta.clip(upper=0)
        avg_gain = gain.rolling(window=window, min_periods=window).mean()
        avg_loss = loss.rolling(window=window, min_periods=window).mean()
        rs = avg_gain / avg_loss
        rsi = 100 - (100 / (1 + rs))
        self.data[new_col] = rsi
        return self.data

    def calculate_mfi(self, window: int = 14, new_col: str = "MFI") -> pd.DataFrame:
        """
        Calculate the Money Flow Index (MFI) for the stock data.

        Uses the 'High', 'Low', 'Close' and 'Volume' columns. A row whose
        typical-price change is undefined (the first row, or rows next to
        missing prices) has no flow direction; it is kept as NaN so that a
        value is only produced once `window` real money flows are available.

        Args:
            window (int): Look-back period for the MFI (default 14).
            new_col (str): Name for the MFI column (default 'MFI').

        Returns:
            pd.DataFrame: DataFrame with the MFI column added.
        """
        # Calculate the Typical Price
        typical_price = (self.data['High'] + self.data['Low'] + self.data['Close']) / 3.0

        # Calculate Raw Money Flow
        raw_money_flow = typical_price * self.data['Volume']

        # Compute change in Typical Price
        tp_diff = typical_price.diff()

        # Rows with an undefined price change must not be counted as "zero
        # flow": that would let the first window contain only window-1 real flows.
        direction_known = tp_diff.notna()

        # Separate positive and negative money flows
        positive_flow = raw_money_flow.where(tp_diff > 0, 0).where(direction_known)
        negative_flow = raw_money_flow.where(tp_diff < 0, 0).where(direction_known)

        # Rolling sums over the specified window (NaN until the window is full)
        pos_mf = positive_flow.rolling(window=window, min_periods=window).sum()
        neg_mf = negative_flow.rolling(window=window, min_periods=window).sum()

        # Money Flow Ratio and MFI calculation
        mfr = pos_mf / neg_mf
        mfi = 100 - (100 / (1 + mfr))

        # Add the MFI column to the DataFrame
        self.data[new_col] = mfi
        return self.data

    def run_pipeline(self) -> pd.DataFrame:
        """
        Run the complete data preparation pipeline:
          1. Fetch daily stock data.
          2. Add daily percentage change for the price.
          3. Add daily volume percentage change.
          4. Calculate RSI.
          5. Calculate MFI.

        Returns:
            pd.DataFrame: The final DataFrame with all computed metrics
            (this is ``self.data`` itself, not a copy).
        """
        self.fetch_daily_stock_data()
        self.add_daily_percentage_change()
        self.add_volume_change()
        self.calculate_rsi()
        self.calculate_mfi()
        return self.data


# Build the AAPL feature frame (prices plus derived indicators) for the range.
pipeline = StockDataPipeline("AAPL", "2022-12-01", "2025-02-01")
prepared_data = pipeline.run_pipeline()


In [36]:
# Drop every row that has a missing value in any column (the leading rows for
# which the percentage changes and rolling-window indicators are undefined).
# Note: run_pipeline() returns pipeline.data itself, so this in-place drop also
# modifies the frame held by the pipeline object.
prepared_data.dropna(inplace=True)

In [48]:
# Work on an independent copy so the index changes made in later cells do not
# touch prepared_data.
processed_stock_data = prepared_data.copy()

In [49]:
# Preview the first rows of the cleaned price/indicator frame.
processed_stock_data.head()

Unnamed: 0_level_0,High,Low,Close,Volume,Pct_Change,Volume_Change,RSI,MFI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-12-21 00:00:00-05:00,135.386284,131.368536,134.040436,85928000,2.380951,10.971061,27.97944,40.313135
2022-12-22 00:00:00-05:00,133.159713,128.94405,130.853958,77852100,-2.377251,-9.39845,25.595225,40.146233
2022-12-23 00:00:00-05:00,131.041978,128.290909,130.487808,63814900,-0.279816,-18.030599,26.261642,34.275576
2022-12-27 00:00:00-05:00,130.042493,127.380484,128.676849,69007800,-1.387838,8.137441,27.96026,34.350002
2022-12-28 00:00:00-05:00,129.666432,124.560134,124.728363,85438400,-3.068529,23.809772,26.152372,34.128042


## Merging Price Data with News Data


In [24]:
# Load the AAPL news file; the path follows the "<SAVE_DIR>/<symbol>_news.csv"
# layout written by collect_and_save_news_for_tech_stocks.
news_df = pd.read_csv("tech_stock_news/AAPL_news.csv")

In [28]:
# Parse the ISO date strings and move them into the index so the news rows can
# be aligned with the price data by date.
# The guard makes the cell safe to re-run: once 'date' has become the index it
# is no longer a column, and an unguarded second run would raise KeyError.
if 'date' in news_df.columns:
    news_df['date'] = pd.to_datetime(news_df['date'], format='%Y-%m-%d')
    news_df.set_index('date', inplace=True)

In [46]:
# Preview the first rows of the date-indexed news frame.
news_df.head()

Unnamed: 0_level_0,stock_symbol,source,title,link,snippet
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,AAPL,,,,
2023-01-02,AAPL,,,,
2023-01-03,AAPL,,,,
2023-01-04,AAPL,,,,
2023-01-05,AAPL,,,,


In [50]:
# Reduce the index to plain 'YYYY-MM-DD' strings (drops the time-of-day and UTC
# offset shown in the preview above) so it can be matched against the news dates.
# Going through pd.to_datetime first keeps the cell re-runnable: on a second run
# the index already holds strings, which have no .strftime of their own.
processed_stock_data.index = pd.to_datetime(processed_stock_data.index).strftime('%Y-%m-%d')
processed_stock_data

Unnamed: 0_level_0,High,Low,Close,Volume,Pct_Change,Volume_Change,RSI,MFI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-12-21,135.386284,131.368536,134.040436,85928000,2.380951,10.971061,27.979440,40.313135
2022-12-22,133.159713,128.944050,130.853958,77852100,-2.377251,-9.398450,25.595225,40.146233
2022-12-23,131.041978,128.290909,130.487808,63814900,-0.279816,-18.030599,26.261642,34.275576
2022-12-27,130.042493,127.380484,128.676849,69007800,-1.387838,8.137441,27.960260,34.350002
2022-12-28,129.666432,124.560134,124.728363,85438400,-3.068529,23.809772,26.152372,34.128042
...,...,...,...,...,...,...,...,...
2025-01-27,232.149994,223.979996,229.860001,94863400,3.178024,73.431521,35.613806,41.510611
2025-01-28,240.190002,230.809998,238.259995,75707600,3.654396,-20.193035,43.722049,43.539289
2025-01-29,239.860001,234.009995,239.360001,45486100,0.461683,-39.918714,47.259082,48.792798
2025-01-30,240.789993,237.210007,237.589996,55658300,-0.739474,22.363315,45.203679,54.340699


In [None]:
# Put both frames on a datetime index so rows line up by date.
processed_stock_data.index = pd.to_datetime(processed_stock_data.index)
news_df.index = pd.to_datetime(news_df.index)

# Left join keeps every price row; news columns are NaN where no news row exists.
# The target is the percentage change of Close three rows ahead of each row,
# computed before any rows are filtered out.
merged_stock_data = (
    processed_stock_data
    .merge(news_df, left_index=True, right_index=True, how='left')
    .assign(**{
        "3D_Future_Change": lambda frame: (
            (frame["Close"].shift(-3) - frame["Close"]) / frame["Close"]
        ) * 100
    })
)

merged_stock_data


Unnamed: 0_level_0,High,Low,Close,Volume,Pct_Change,Volume_Change,RSI,MFI,stock_symbol,source,title,link,snippet
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-12-21,135.386284,131.368536,134.040436,85928000,2.380951,10.971061,27.979440,40.313135,,,,,
2022-12-22,133.159713,128.944050,130.853958,77852100,-2.377251,-9.398450,25.595225,40.146233,,,,,
2022-12-23,131.041978,128.290909,130.487808,63814900,-0.279816,-18.030599,26.261642,34.275576,,,,,
2022-12-27,130.042493,127.380484,128.676849,69007800,-1.387838,8.137441,27.960260,34.350002,,,,,
2022-12-28,129.666432,124.560134,124.728363,85438400,-3.068529,23.809772,26.152372,34.128042,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-27,232.149994,223.979996,229.860001,94863400,3.178024,73.431521,35.613806,41.510611,AAPL,,,,
2025-01-28,240.190002,230.809998,238.259995,75707600,3.654396,-20.193035,43.722049,43.539289,AAPL,,,,
2025-01-29,239.860001,234.009995,239.360001,45486100,0.461683,-39.918714,47.259082,48.792798,AAPL,,,,
2025-01-30,240.789993,237.210007,237.589996,55658300,-0.739474,22.363315,45.203679,54.340699,AAPL,,,,


In [59]:
# Keep only rows that have a closing price, a matching news row (stock_symbol is
# filled on every news row) and a known 3-row-ahead target.
required_columns = ['Close', 'stock_symbol', '3D_Future_Change']
merged_stock_data = merged_stock_data.dropna(subset=required_columns)

merged_stock_data.head()

Unnamed: 0_level_0,High,Low,Close,Volume,Pct_Change,Volume_Change,RSI,MFI,stock_symbol,source,title,link,snippet,3D_Future_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023-01-03,129.537788,122.877827,123.768463,112117500,-3.740471,45.542499,22.617052,21.197288,AAPL,,,,,3.637942
2023-01-04,127.321112,123.778365,125.045044,89113600,1.031427,-20.517671,23.287665,13.093595,AAPL,,,,,2.999346
2023-01-05,126.440369,123.461698,123.718987,80962700,-1.060464,-9.14664,23.902441,13.225917,AAPL,,,,,4.567259
2023-01-06,128.934113,123.590315,128.271088,87754700,3.679388,8.389048,39.492949,20.491803,AAPL,,,,,2.985651
2023-01-09,132.021662,128.538289,128.795578,70790800,0.408892,-19.331044,43.030669,28.538724,AAPL,,,,,2.504814


## Preparing Data for Fine-Tuning the DeepSeek-R1 Model


In [60]:
import pandas as pd

# Function to generate "think" reasoning using last 10 days of stock data & news
def _interpret_oscillator(value, overbought_above, oversold_below):
    """Label an oscillator reading as 'overbought', 'oversold' or 'neutral'."""
    if value > overbought_above:
        return "overbought"
    if value < oversold_below:
        return "oversold"
    # Also reached for NaN, because both comparisons above are then False.
    return "neutral"


def generate_think_section(index, df, window=10):
    """
    Build the "<think>...</think>" reasoning text for one row of `df`.

    The text summarizes the `window` most recent rows ending at `index`
    (fewer when `index` is near the start of the frame): mean price and volume
    change direction, the latest RSI and MFI readings, and any news rows that
    have both a title and a snippet.

    Args:
        index (int): Positional index of the current row in `df`.
        df (pd.DataFrame): Frame with the columns Pct_Change, Volume_Change,
            RSI, MFI, stock_symbol, title and snippet.
        window (int): Number of trailing rows to summarize (default 10).

    Returns:
        str: The reasoning text wrapped in <think> tags.

    Raises:
        ValueError: If `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    # Trailing rows up to and including the current one; max() keeps the start
    # from going negative for the first rows of the frame.
    recent_rows = df.iloc[max(0, index - window + 1): index + 1]

    # A mean of exactly zero (or NaN) falls into the second label.
    price_trend = "upward" if recent_rows["Pct_Change"].mean() > 0 else "downward"
    volume_trend = "increasing" if recent_rows["Volume_Change"].mean() > 0 else "decreasing"

    # Most recent indicator readings and their interpretation.
    latest_rsi = recent_rows["RSI"].iloc[-1]
    latest_mfi = recent_rows["MFI"].iloc[-1]
    rsi_interpretation = _interpret_oscillator(latest_rsi, 70, 30)
    mfi_interpretation = _interpret_oscillator(latest_mfi, 80, 20)

    stock_symbol = df.iloc[index]["stock_symbol"]

    # News from the same rows, skipping rows without a title or snippet.
    news_list = recent_rows.dropna(subset=["title", "snippet"])[["title", "snippet"]].values.tolist()
    news_summary = ""
    if news_list:
        news_summary = "Recent news headlines: " + "; ".join(
            f"'{title}' - {snippet}" for title, snippet in news_list
        ) + ". "

    return (
        f"<think>Analyzing {stock_symbol}: Over the last {window} days, the price trend has been {price_trend}. "
        f"Volume has been {volume_trend}. RSI is {latest_rsi:.2f}, indicating {rsi_interpretation} conditions. "
        f"MFI is {latest_mfi:.2f}, suggesting {mfi_interpretation} market behavior. "
        f"{news_summary}"
        "Considering these factors, my prediction is: </think>"
    )

# Apply function to each row using the last 10 days of data
# Build the reasoning text row by row; each call looks back over the rows that
# end at the given position.
think_sections = []
for row_position in range(len(merged_stock_data)):
    think_sections.append(generate_think_section(row_position, merged_stock_data))

merged_stock_data["think_section"] = think_sections


In [62]:
# Preview the first rows, now including the generated think_section column.
merged_stock_data.head()

Unnamed: 0_level_0,High,Low,Close,Volume,Pct_Change,Volume_Change,RSI,MFI,stock_symbol,source,title,link,snippet,3D_Future_Change,think_section
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-01-03,129.537788,122.877827,123.768463,112117500,-3.740471,45.542499,22.617052,21.197288,AAPL,,,,,3.637942,"<think>Analyzing AAPL: Over the last 10 days, ..."
2023-01-04,127.321112,123.778365,125.045044,89113600,1.031427,-20.517671,23.287665,13.093595,AAPL,,,,,2.999346,"<think>Analyzing AAPL: Over the last 10 days, ..."
2023-01-05,126.440369,123.461698,123.718987,80962700,-1.060464,-9.14664,23.902441,13.225917,AAPL,,,,,4.567259,"<think>Analyzing AAPL: Over the last 10 days, ..."
2023-01-06,128.934113,123.590315,128.271088,87754700,3.679388,8.389048,39.492949,20.491803,AAPL,,,,,2.985651,"<think>Analyzing AAPL: Over the last 10 days, ..."
2023-01-09,132.021662,128.538289,128.795578,70790800,0.408892,-19.331044,43.030669,28.538724,AAPL,,,,,2.504814,"<think>Analyzing AAPL: Over the last 10 days, ..."


In [64]:
import json
import pandas as pd

jsonl_data = []

# Every record needs 10 rows of context (the current row plus the 9 before it),
# so the first usable position is 9.
for i in range(9, len(merged_stock_data)):
    window = merged_stock_data.iloc[i - 9:i + 1]
    current_row = merged_stock_data.iloc[i]
    current_date = merged_stock_data.index[i]

    # Numeric context, rounded to two decimals for readability.
    price_changes, volume_changes, rsi_values, mfi_values = (
        window[column].round(2).tolist()
        for column in ("Pct_Change", "Volume_Change", "RSI", "MFI")
    )

    # Values taken from the current row only.
    stock_symbol = current_row["stock_symbol"]
    think_section = current_row["think_section"]
    future_change = current_row["3D_Future_Change"]

    # News within the window that has both a title and a snippet; days_ago is
    # measured in calendar days back from the current row's date.
    news_list = [
        {
            "headline": row["title"],
            "days_ago": (current_date - row_date).days,
            "snippet": row["snippet"],  # Using snippet instead of sentiment
        }
        for row_date, row in window.iterrows()
        if pd.notna(row["title"]) and pd.notna(row["snippet"])
    ]

    # Prompt part of the record.
    user_text = (
        f"User: Here is stock data for {stock_symbol}. "
        f"Last 10 days price changes: {price_changes}%. "
        f"Volume changes: {volume_changes}%. "
        f"RSI values: {rsi_values}. "
        f"MFI values: {mfi_values}. "
        f"Recent news: {news_list}. "
        "What is the expected price change in 3 days?\n\n"
    )

    # Answer part: reasoning text followed by the realized 3-row-ahead change.
    assistant_text = (
        f"Assistant: {think_section} "
        f"The stock price is expected to change by {future_change:.2f}% in 3 days."
    )

    # One record per row, stored under the single key "text".
    jsonl_data.append({"text": user_text + assistant_text})

# Write one JSON document per line.
output_filename = "stock_data_finetune.jsonl"
with open(output_filename, "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in jsonl_data)

print(f"JSONL file '{output_filename}' created with {len(jsonl_data)} entries.")


JSONL file 'stock_data_finetune.jsonl' created with 510 entries.


## Train Test Split

In [65]:
import json

# Filenames
# Source file and the two files produced by the split.
input_filename = "stock_data_finetune.jsonl"
train_filename = "stock_data_train.jsonl"
test_filename = "stock_data_test.jsonl"

# Load every record (one JSON document per line), keeping the line endings.
with open(input_filename, "r") as infile:
    lines = list(infile)

# Split in file order: the first 90% goes to training, the remainder to testing.
total_entries = len(lines)
split_index = int(total_entries * 0.9)
train_lines, test_lines = lines[:split_index], lines[split_index:]

# Write each part to its own file.
for target_filename, target_lines in ((train_filename, train_lines), (test_filename, test_lines)):
    with open(target_filename, "w") as target_file:
        target_file.writelines(target_lines)

print(
    "JSONL file split completed:",
    f"  Total entries: {total_entries}",
    f"  Train entries: {len(train_lines)}",
    f"  Test entries: {len(test_lines)}",
    sep="\n",
)


JSONL file split completed:
  Total entries: 510
  Train entries: 459
  Test entries: 51
