<a href="https://colab.research.google.com/github/xanderrp2/StockAI/blob/main/DataCollection_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import numpy as np
from collections import Counter
from datetime import datetime

# Placeholder News API key; replace it with your own key before running.
API_KEY = '---newskey---'
# Endpoint that fetch_news() sends its GET request to.
BASE_URL = 'https://newsapi.org/v2/everything'
# Length of the zero vector that vectorize_text() starts accumulating from.
WordListLength = 10000

# Define the function to fetch today's news headlines
def fetch_news(query='*', language='en', page_size=10, timeout=10):
    """Fetch articles dated today from the News API endpoint at ``BASE_URL``.

    Args:
        query: Search expression sent as the ``q`` parameter.
        language: Language code sent as the ``language`` parameter.
        page_size: Number of articles requested, sent as ``pageSize``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        list: The ``articles`` entry of the JSON response, or an empty list
        when that entry is missing or null.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Both ends of the date window are today's local date.
    today = datetime.now().strftime('%Y-%m-%d')
    params = {
        'q': query,
        'language': language,
        'from': today,
        'to': today,
        'sortBy': 'publishedAt',
        'pageSize': page_size,
        'apiKey': API_KEY,
    }
    response = requests.get(BASE_URL, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error fetching news: {response.status_code}, {response.text}")
    # ``or []`` also covers a JSON null, which ``.get(key, [])`` would pass
    # through as None and break callers that iterate the result.
    return response.json().get('articles') or []

def vectorize_text(text, word_vectors):
    """Sum the vectors of every known word in *text*.

    The text is lower-cased and split on whitespace; each token that is a
    key of *word_vectors* adds its mapped value to the running total.

    Args:
        text: String to vectorize. ``None`` or an empty string yields the
            zero vector (article fields can be null, and calling
            ``.lower()`` on ``None`` would raise ``AttributeError``).
        word_vectors: Mapping from word to a value that can be added to a
            NumPy array of length ``WordListLength``.

    Returns:
        numpy.ndarray: Array of length ``WordListLength``.
    """
    vector = np.zeros(WordListLength)
    if not text:
        return vector
    for word in text.lower().split():
        if word in word_vectors:
            vector += word_vectors[word]
    return vector


# Organize news into lists (no tensors for strings)
def organize_news(articles):
    """Split a list of article dicts into parallel lists of fields.

    Args:
        articles: Iterable of article dicts with optional ``source``
            (a dict holding ``name``), ``title``, ``publishedAt`` and
            ``content`` keys. ``None`` is treated as an empty list.

    Returns:
        dict: Keys ``"sources"``, ``"headlines"``, ``"dates"`` and
        ``"contents"``, each a list with one entry per article. A field
        that is missing, null or empty is replaced by its placeholder
        (``'Unknown'``, ``'No Title'``, ``'No Date'``, ``'No Content'``).
    """
    data = {
        "sources": [],
        "headlines": [],
        "dates": [],
        "contents": []
    }

    for article in articles or []:
        # ``or`` instead of a ``.get`` default: a key that is present but
        # null would otherwise bypass the default and yield None.
        source = article.get('source') or {}
        data["sources"].append(source.get('name') or 'Unknown')
        data["headlines"].append(article.get('title') or 'No Title')
        data["dates"].append(article.get('publishedAt') or 'No Date')
        data["contents"].append(article.get('content') or 'No Content')

    return data


In [None]:
def file_to_dict(file_path):
    """
    Build a ``{line: 0}`` dictionary from the lines of a text file.

    Lines that are empty or whitespace-only are skipped, and trailing
    backslashes are removed from each remaining line before it is used
    as a key.

    Args:
        file_path (str): Path to the text file.

    Returns:
        dict: One key per kept line, every value 0. An empty dict is
        returned (after printing an error message) when the file cannot
        be read.
    """
    try:
        with open(file_path, 'r') as handle:
            raw_lines = handle.read().splitlines()
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return {}
    except Exception as err:
        print(f"Error: {err}")
        return {}

    # Pure string work from here on, so it needs no error handling.
    kept = (entry.rstrip('\\') for entry in raw_lines if entry.strip())
    return dict.fromkeys(kept, 0)

# Load the word list when this cell runs; result_dict is later passed to
# vectorizeNews() as its `words` argument. If the file cannot be read,
# file_to_dict() prints an error and result_dict is an empty dict.
file_path = 'Allwords.txt'
result_dict = file_to_dict(file_path)

In [None]:
def vectorizeNews(data, words):
    """Replace every headline and content string in *data* with its vector.

    Each entry of ``data["headlines"]`` and then ``data["contents"]`` is
    overwritten in place with ``vectorize_text(entry, words)``.

    Args:
        data: Dict holding ``"headlines"`` and ``"contents"`` lists.
        words: Word mapping handed to ``vectorize_text``.

    Returns:
        The same *data* object, after the in-place replacement.
    """
    for field in ("headlines", "contents"):
        texts = data[field]
        for position, text in enumerate(texts):
            texts[position] = vectorize_text(text, words)
    return data

In [None]:
# Script entry point: fetch, organize, vectorize and print today's news.
if __name__ == "__main__":
    try:
        # Fetch today's articles matching the query 'technology'
        articles = fetch_news(query='technology')

        # Split the article dicts into parallel lists
        # (sources, headlines, dates, contents)
        news_data = organize_news(articles)

        # Replace headline and content strings with vectors of length
        # WordListLength, using the word list loaded into result_dict
        news_data = vectorizeNews(news_data, result_dict)

        # Print the organized data; headlines and contents are now arrays
        print("Sources:", news_data["sources"])
        print("Headlines:", news_data["headlines"])
        print("Dates:", news_data["dates"])
        print("Contents:", news_data["contents"])


    # Top-level boundary: report any failure from the steps above instead
    # of letting the traceback propagate.
    except Exception as e:
        print("An error occurred:", e)