<a href="https://colab.research.google.com/github/xanderrp2/StockAI/blob/main/DataCollection_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [233]:
import requests
import numpy as np
from datetime import datetime
from datetime import datetime, timedelta
import pandas as pd
import yfinance as yf
from ast import literal_eval


In [234]:
# Replace with your News API key

BASE_URL = 'https://newsapi.org/v2/everything'
WordListLength = 10000
yesturday = '2025-01-13'

# Define the function to fetch today's news headlines
def fetch_news(query='*', language='en', page_size=10,day=datetime.now().strftime('%Y-%m-%d')):
    params = {
        'q': query,
        'language': language,
        'from': day,
        'to': day,
        'sortBy': 'publishedAt',
        'pageSize': page_size,
        'apiKey': API_KEY,
    }
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        return response.json().get('articles', [])
    else:
        raise Exception(f"Error fetching news: {response.status_code}, {response.text}")

def vectorize_text(text, word_vectors):
    # Assuming all word vectors have the same dimension
    vector = np.zeros(WordListLength)
    words = text.lower().split()
    for word in words:
        if word in word_vectors:
          vector[word_vectors.index(word)] += 1
    return vector



# Organize news into lists (no tensors for strings)
def organize_news(articles):
    data = {
        "sources": [],
        "headlines": [],
        "contents": []
    }

    for article in articles:
        data["sources"].append(article.get('source', {}).get('name', 'Unknown'))
        data["headlines"].append(article.get('title', 'No Title'))
        data["contents"].append(article.get('content', 'No Content'))

    return data


In [235]:
def file_to_list(file_path):
    """
    Reads a file and creates a dictionary with each line as a key and 0 as its value.

    Args:
        file_path (str): Path to the text file.

    Returns:
        dict: Dictionary with lines from the file as keys and 0 as their values.
    """
    try:
        with open(file_path, 'r') as file:
            lines = file.read().splitlines()  # Read all lines and strip newlines
        return [line.rstrip('\\').rstrip("  ") for line in lines if line.strip()]  # Ignore empty lines
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return {}
    except Exception as e:
        print(f"Error: {e}")
        return {}

englishPath = '/content/drive/My Drive/Allwords.txt'
companyPath = '/content/drive/My Drive/companyTickers.txt'
words = file_to_list(englishPath)
companies = file_to_list(companyPath)

In [236]:
def vectorizeNews(data):
  # Vectorize headlines
  for i in range(len(data["headlines"])):
    data["headlines"][i] = vectorize_text(data["headlines"][i], words)

  # Vectorize contents
  for i in range(len(data["contents"])):
    data["contents"][i] = vectorize_text(data["contents"][i], words)

  return data

In [237]:
def getNews(company,day):
    # Fetch news articles
    articles = fetch_news(query=company,day=day)

    # Organize into lists
    news_data = organize_news(articles)

    # Vectorize data into 10000d vectors
    vectorizedNews = vectorizeNews(news_data)

    return [company,
            np.sum(vectorizedNews["headlines"], axis=0),
            np.sum(vectorizedNews["contents"], axis=0)]

getNews('NVDIA','2025-01-13')

['NVDIA',
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([2., 0., 2., ..., 0., 0., 0.])]

In [238]:
def get_percentage_change(ticker):
    # Get today's and yesterday's date
    today = datetime.today()
    yesterday = today - timedelta(days=1)

    # Format dates as 'YYYY-MM-DD' for yfinance
    today_str = today.strftime('%Y-%m-%d')
    yesterday_str = yesterday.strftime('%Y-%m-%d')

    # Fetch historical data for yesterday
    stock_data = yf.download(ticker, start=yesterday_str, end=today_str)

    # Ensure we have the data for yesterday
    if stock_data.empty:
        print(f"Not enough data for {ticker}.")
        return None

    # Get the opening and closing prices for yesterday
    yesterday_open = stock_data['Open'].iloc[0]
    yesterday_close = stock_data['Close'].iloc[0]

    # Calculate percentage change
    percentage_change = ((yesterday_close - yesterday_open) / yesterday_open) * 100

    return percentage_change.iloc[0]

# Example usage
ticker = "AAPL"  # Replace with any stock ticker symbol
percentage_change = get_percentage_change(ticker)
print(percentage_change)

[*********************100%***********************]  1 of 1 completed

0.3725410532844261





In [245]:
companyPath = '/content/drive/My Drive/companyTickers.txt'
companies = [yf.Ticker(name) for name in file_to_list(companyPath)]
newsData = []
for ticker in companies:
  name = (ticker.info['shortName']).rstrip("Corporation").rstrip("Inc.").replace(",","")
  newsData.append(getNews(name,yesturday))


In [246]:
companies = [name for name in file_to_list(companyPath)] # Changed this line
for i in range(len(newsData)):
  newsData[i].append(get_percentage_change(companies[i]))

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [247]:
newsFrame = pd.DataFrame(newsData, columns=['Company','headlines','contents','percent_change'])
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
newsFrame['headlines'] = newsFrame['headlines'].apply(lambda x: x.tolist())
newsFrame['contents'] = newsFrame['contents'].apply(lambda x: x.tolist())
newsFrame.to_csv('/content/drive/My Drive/stockNews.csv', index=False)
newsFrame

Unnamed: 0,Company,headlines,contents,percent_change
0,Apple,"[3.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 0.0, 1.0, ...","[15.0, 3.0, 5.0, 9.0, 7.0, 4.0, 3.0, 2.0, 1.0,...",0.372541
1,Microsoft,"[1.0, 0.0, 2.0, 3.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[5.0, 6.0, 8.0, 9.0, 17.0, 1.0, 0.0, 4.0, 0.0,...",0.469611
2,NVIDIA,"[1.0, 2.0, 0.0, 2.0, 2.0, 0.0, 0.0, 0.0, 3.0, ...","[13.0, 8.0, 6.0, 4.0, 9.0, 7.0, 3.0, 1.0, 6.0,...",2.492492
3,Alphabet,"[3.0, 3.0, 1.0, 3.0, 2.0, 1.0, 1.0, 0.0, 1.0, ...","[19.0, 6.0, 4.0, 10.0, 11.0, 3.0, 1.0, 0.0, 5....",0.494548
4,Amazon.com,"[1.0, 0.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0, ...","[10.0, 1.0, 12.0, 18.0, 5.0, 2.0, 2.0, 1.0, 1....",0.18344
5,Meta Platforms,"[3.0, 2.0, 1.0, 3.0, 2.0, 1.0, 1.0, 1.0, 0.0, ...","[16.0, 5.0, 7.0, 7.0, 7.0, 3.0, 3.0, 4.0, 5.0,...",0.202609
6,Tesla,"[3.0, 4.0, 2.0, 3.0, 1.0, 2.0, 0.0, 2.0, 1.0, ...","[8.0, 13.0, 5.0, 8.0, 14.0, 7.0, 4.0, 9.0, 6.0...",5.245168
7,Broadcom,"[2.0, 0.0, 4.0, 3.0, 0.0, 0.0, 1.0, 0.0, 3.0, ...","[16.0, 5.0, 3.0, 5.0, 5.0, 8.0, 3.0, 1.0, 6.0,...",2.409195
8,Oracle,"[4.0, 0.0, 4.0, 5.0, 1.0, 2.0, 2.0, 1.0, 1.0, ...","[21.0, 9.0, 5.0, 9.0, 11.0, 8.0, 2.0, 2.0, 2.0...",0.117076
9,Advanced Micro Devices,"[2.0, 0.0, 3.0, 3.0, 1.0, 1.0, 0.0, 0.0, 6.0, ...","[17.0, 2.0, 5.0, 10.0, 4.0, 4.0, 4.0, 1.0, 8.0...",1.769605


In [252]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate
from sklearn.model_selection import train_test_split

headlines = newsFrame['headlines'].values.tolist()
contents = newsFrame['contents'].values.tolist()
percent_changes = newsFrame['percent_change'].values.tolist()

# Assuming corrected data is loaded in variables: headlines, contents, percent_changes
# Parse the data into numpy arrays (if not already done)
X_headlines = np.array(headlines)
X_contents = np.array(contents)
y = np.array(percent_changes)

# Split the data into training and test sets
X_headlines_train, X_headlines_test, X_contents_train, X_contents_test, y_train, y_test = train_test_split(
    X_headlines, X_contents, y, test_size=0.2, random_state=42
)

# Define the model architecture
# Input for headlines
input_headlines = Input(shape=(X_headlines.shape[1],), name="headlines_input")
headlines_dense = Dense(64, activation="relu")(input_headlines)

# Input for contents
input_contents = Input(shape=(X_contents.shape[1],), name="contents_input")
contents_dense = Dense(64, activation="relu")(input_contents)

# Combine the two inputs
merged = Concatenate()([headlines_dense, contents_dense])
combined_dense = Dense(64, activation="relu")(merged)
out = Dense(1, activation="linear", name="output_layer")(combined_dense)

# Define the model
model = Model(inputs=[input_headlines, input_contents], outputs=out)

# Compile the model
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Train the model
history = model.fit(
    [X_headlines_train, X_contents_train], y_train,
    validation_split=0.2,
    epochs=8,
    batch_size=32,
    verbose=1
)

# Evaluate the model on the test set
test_loss, test_mae = model.evaluate([X_headlines_test, X_contents_test], y_test, verbose=0)

# Print results
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")


Epoch 1/8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 1.9282 - mae: 1.1636 - val_loss: 6.1682 - val_mae: 1.6868
Epoch 2/8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - loss: 0.9342 - mae: 0.6936 - val_loss: 4.9206 - val_mae: 1.4573
Epoch 3/8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - loss: 0.6421 - mae: 0.6184 - val_loss: 4.3678 - val_mae: 1.4556
Epoch 4/8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - loss: 0.5342 - mae: 0.6067 - val_loss: 4.1986 - val_mae: 1.4606
Epoch 5/8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - loss: 0.4293 - mae: 0.5518 - val_loss: 4.1959 - val_mae: 1.4615
Epoch 6/8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - loss: 0.3270 - mae: 0.4373 - val_loss: 4.2704 - val_mae: 1.4560
Epoch 7/8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - loss: 0.2510 - mae: 0.3061 -

In [261]:
# prompt: use the getnews function to collect news on TSLA for today. then use the model we just trained to predict todays change

# Assuming the model and necessary functions from the previous code are defined.
# Assuming 'model' is the trained Keras model.
# Assuming 'getNews' and 'vectorize_text' are defined.


def predict_todays_change(company, day):
    """Predicts today's stock price change for a given company using the trained model."""

    # Get the news data for the given company and date.
    news_data = getNews(company, day)

    if news_data is None:
        return None  # Handle cases where news data couldn't be retrieved.

    newsFrame = pd.DataFrame(newsData, columns=['Company','headlines','contents','percent_change'])

    headlines = newsFrame['headlines'].values.tolist()
    contents = newsFrame['contents'].values.tolist()
    percent_changes = newsFrame['percent_change'].values.tolist()

    # Assuming corrected data is loaded in variables: headlines, contents, percent_changes
    # Parse the data into numpy arrays (if not already done)
    headlines_input = np.array(headlines)
    contents_input = np.array(contents)

    # Make the prediction
    predicted_change = model.predict([headlines_input, contents_input])

    return predicted_change[0][0]


# Example usage (replace 'TSLA' with the actual company ticker symbol):
company = "TSLA"
today_date = datetime.now().strftime("%Y-%m-%d") # Get today's date

predicted_change = predict_todays_change(company, today_date)

if predicted_change is not None:
    print(f"Predicted percentage change for {company} on {today_date}: {predicted_change:.2f}%")
else:
    print(f"Could not predict the price change for {company} on {today_date}.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predicted percentage change for TSLA on 2025-01-14: 0.46%
