In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers[torch]
!pip install evaluate
!pip install newsapi-python
!pip install langdetect
!pip install pymongo

Looking in indexes: https://download.pytorch.org/whl/cu121


#**Dont Run The Below**


In [None]:
from newsapi import NewsApiClient
import pymongo
from langdetect import detect, LangDetectException


class NewsArticleManager:
    def __init__(self, api_key, mongodb_uri, db_name='stocks_news', collection_name='articles'):
        self.newsapi = NewsApiClient(api_key=api_key)
        self.client = pymongo.MongoClient(mongodb_uri)
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]

    def is_english(self, text):
        try:
            return detect(text) == 'en'
        except LangDetectException:
            return False

    def fetch_and_save_articles(self, keyword, from_date, to_date):
        query = keyword
        articles = self.newsapi.get_everything(q=query, from_param=from_date, to=to_date, language='en',
                                               sort_by='popularity')
        for article in articles['articles']:
            text_to_check = article.get('content') or article.get('description')
            if "[Removed]" not in article.get('title', '') and text_to_check and self.is_english(text_to_check):
                if not self.collection.find_one({'url': article['url']}):
                    self.collection.insert_one(article)

    def cleanup_articles(self):
        articles = self.collection.find()
        for article in articles:
            if "[Removed]" in article.get('title', '') or (
            not self.is_english(article.get('content') or article.get('description', ''))):
                self.collection.delete_one({'_id': article['_id']})

    def cleanup_duplicate_articles(self):
        pipeline = [
            {"$group": {"_id": "$url", "uniqueIds": {"$addToSet": "$_id"}, "count": {"$sum": 1}}},
            {"$match": {"count": {"$gt": 1}}}
        ]
        duplicates = self.collection.aggregate(pipeline)
        for duplicate in duplicates:
            ids_to_remove = duplicate['uniqueIds'][1:]
            for id_to_remove in ids_to_remove:
                self.collection.delete_one({"_id": id_to_remove})

    def print_articles_from_mongodb(self):
        articles = self.collection.find()
        for article in articles:
            print(f"Title: {article['title']}")
            print(f"Description: {article.get('description', 'No description available')}")
            print(f"URL: {article['url']}\n")
            print("--------------------------------------------------\n")


if __name__ == '__main__':
    api_key = 'cbaf7f1f50ab40f6915bcb91db00ae1c'
    mongodb_uri = 'mongodb+srv://bsolimanhanna:K123456789@newsapitwoweeks.y1s7pil.mongodb.net/?retryWrites=true&w=majority&appName=NewsAPITwoWeeks'
    manager = NewsArticleManager(api_key, mongodb_uri)

    keywords = ["AAPL", "MSFT", "GOOGL", "AMZN", "META", "IBM", "NVDA",
                "BIDU", "CRM", "TSLA", "TWLO", "PLTR", "AI", "INTC",
                "QCOM", "AMD", "ORCL", "SAP", "SIEGY", "HON",
                "GE", "MU", "ROBO", "PATH", "ZM", "DOCU", "SQ", "SHOP",
                "SPLK", "TTD", "CRWD", "ZS", "SNOW", "FTNT", "ADSK",
                "ADBE", "ASML", "SNPS", "CDNS", "ANSS", "TER", "KYCCF",
                "OMRNY", "0020.HK", "002230.SZ", "Apple",
                "Microsoft Corporation", "Alphabet","Amazon", "Meta", "International Business Machines Corporation",
                "NVIDIA", "Baidu", "Salesforce", "Tesla","Twilio","Palantir Technologies","C3.ai",  "Intel",
                "Qualcomm", "Advanced Micro Devices", "Oracle", "SAP", "Siemens", "Honeywell International",
                "General Electric","Micron Technology","Exchange Traded Concepts Trust - ROBO Global Robotics and Automation Index ETF",
                "UiPath","Zoom Video Communications", "DocuSign", "Block","Shopify", "Splunk", "The Trade Desk",
                "CrowdStrike Holdings", "Zscaler","Snowflake", "Fortinet", "Autodesk", "Adobe", "ASML Holding",
                "Synopsys", "Cadence Design Systems", "ANSYS", "Teradyne","Keyence","Omron", "Wheelock and Company",
                "iFlytek"]
    for keyword in keywords:
        manager.fetch_and_save_articles(keyword, '2024-03-03', '2024-04-02')

    print("Finished fetching and saving articles. Now cleaning up...")
    manager.cleanup_articles()
    manager.cleanup_duplicate_articles()

    print("Now printing articles from MongoDB:\n")
    manager.print_articles_from_mongodb()


#**MongoDB**

In [2]:
! pip install pymongo dnspython




In [3]:
!pip install "pymongo[srv]"



In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pymongo import MongoClient
import torch

In [11]:
from os import truncate


def sentiment_analysis(text, tokinizer, model):
  inputs = tokinizer(text, return_tensors = "pt", padding = True, truncation = True, max_length = 512)
  outputs = model(**inputs)
  probs = torch.softmax(outputs.logits, dim = -1)
  return probs

In [13]:
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import pymongo
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Connects to the MongoDB server running on
# localhost:27017 by default
client = pymongo.MongoClient("mongodb+srv://bsolimanhanna:K123456789@newsapitwoweeks.y1s7pil.mongodb.net/?retryWrites=true&w=majority&appName=NewsAPITwoWeeks")
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
db = client['stocks_news']
collection = db['articles']
all_articles = collection.find()
df = pd.DataFrame(list(all_articles))

print(f"Dataframe Length: {len(df)}")

probs = []
for idx, row in df.iterrows():
  article_text = row['content']
  article_id = row['_id']

  sentiment_probs = sentiment_analysis(article_text, tokenizer, model)
  # sentiment_score = sentiment_probs.argmax(dim=-1).item()  # Assuming that index 1 corresponds to the positive sentiment class
  sentiment_probs = sentiment_probs.detach().cpu().numpy()
  sentiment_probs = np.squeeze(sentiment_probs, axis=0).tolist()
  collection.update_one({'_id': article_id},
                        {'$set': {'sentiment_probs': sentiment_probs}})
  probs.append(sentiment_probs)

print(len(probs))

Dataframe Length: 5604


In [12]:
def display_sentiment_score(collection):
  print("Sentiment Scores:")
  for article in all_articles:
    print(f"Article ID: {article['_id']} - Sentiment Score: {article.get('sentiment_score', 'Not available')}")

In [None]:
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
df.drop('_id', axis =1, inplace = True)
csv_file_path = "/content/gdrive/MyDrive/NewsAPI.csv"
df.to_csv(csv_file_path, index=False)
print(f"Data Saved to {csv_file_path}")

In [None]:
import yfinance as yf  # Import yfinance module


# Define the list of desired stock symbols
stocks = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "META", "IBM", "NVDA",
    "BIDU", "CRM", "TSLA", "TWLO", "PLTR", "AI", "INTC",
    "QCOM", "AMD", "ORCL", "SAP", "SIEGY", "HON",
    "GE", "MU", "ROBO", "PATH", "ZM", "DOCU", "SQ", "SHOP",
    "SPLK", "CRWD", "ZS", "SNOW", "FTNT", "ADSK",
    "ADBE", "ASML", "SNPS", "CDNS", "ANSS", "TER", "KYCCF",
    "OMRNY", "0020.HK", "002230.SZ"
]

# Define the time period
start_date = "2024-03-03"  # Start date for fetching data
end_date = "2024-04-02"  # End date for fetching data

# Fetch historical data
data = yf.download(stocks, start=start_date, end=end_date)  # Download stock data

# Structure the data for easier analysis
summary_df = data.stack(level=1).reset_index().rename(columns={"level_1": "Ticker"})  # Stack and reset index

# Calculate average closing prices by date and add it as a new column
average_closing_by_date = data['Close'].mean(axis=1)  # Calculate average closing price
summary_df['Avg Closing Price'] = summary_df.index.map(average_closing_by_date)  # Map average closing price to new column

# Define the short and long windows for moving averages
short_window = 12  # Short window for moving average
long_window = 26  # Long window for moving average

# Calculate the short and long window moving averages
summary_df['SMA_12'] = summary_df.groupby('Ticker')['Close'].transform(lambda x: x.rolling(window=short_window).mean())  # Calculate short window moving average
summary_df['SMA_26'] = summary_df.groupby('Ticker')['Close'].transform(lambda x: x.rolling(window=long_window).mean())  # Calculate long window moving average

# Calculate MACD and MACD Signal
summary_df['MACD'] = summary_df['SMA_12'] - summary_df['SMA_26']  # Calculate MACD
summary_df['MACD_Signal'] = summary_df.groupby('Ticker')['MACD'].transform(lambda x: x.rolling(window=9).mean())  # Calculate MACD Signal

# Calculate RSI for each stock
change = summary_df.groupby('Ticker')['Close'].transform(lambda x: x.diff())  # Calculate price change
gain = change.where(change > 0, 0)  # Separate gains
loss = -change.where(change < 0, 0)  # Separate losses
avg_gain = gain.rolling(window=14).mean()  # Calculate average gain
avg_loss = loss.rolling(window=14).mean()  # Calculate average loss
rs = avg_gain / avg_loss  # Calculate relative strength
summary_df['RSI'] = 100 - (100 / (1 + rs))  # Calculate RSI

summary_df['Avg Closing Price'] = summary_df['Date'].map(data['Close'].mean(axis=1))

# Print the data types
print(summary_df.dtypes)  # Print data types of DataFrame

# Print the summary dataframe
print(summary_df)  # Print DataFrame

#Moving Averages: Help identify trends. A rising moving average indicates an uptrend,
# while a falling moving average indicates a downtrend.

#MACD: Used to catch trends early and can also indicate the end of a trend.
# A crossover of the MACD line above the signal line is a bullish signal, while a crossover below is a bearish signal.

#RSI: Identifies overbought or oversold conditions.
# Values over 70 suggest an overbought condition (potentially overvalued),
# and values under 30 suggest an oversold condition (potentially undervalued).

[*********************100%%**********************]  44 of 44 completed


Price
Date                 datetime64[ns]
Ticker                       object
Adj Close                   float64
Close                       float64
High                        float64
Low                         float64
Open                        float64
Volume                      float64
Avg Closing Price           float64
SMA_12                      float64
SMA_26                      float64
MACD                        float64
MACD_Signal                 float64
RSI                         float64
dtype: object
Price       Date     Ticker   Adj Close       Close        High         Low  \
0     2024-03-04    0020.HK    0.890000    0.890000    0.930000    0.890000   
1     2024-03-04  002230.SZ   52.330002   52.330002   53.220001   51.509998   
2     2024-03-04       AAPL  175.100006  175.100006  176.899994  173.789993   
3     2024-03-04       ADBE  567.940002  567.940002  576.250000  564.099976   
4     2024-03-04       ADSK  260.700012  260.700012  263.850006  259.660004   
..

In [None]:
min_required_data_points = max(short_window,long_window, 14)
stocks_counts = summary_df['Ticker'].value_counts()
suffiecient_data_stocks = stocks_counts[stocks_counts >= min_required_data_points].index.tolist()

filtered_df = summary_df[summary_df['Ticker'].isin(suffiecient_data_stocks)]

In [None]:
filtered_df.describe()

Price,Date,Adj Close,Close,High,Low,Open,Volume,Avg Closing Price,SMA_12,SMA_26,MACD,MACD_Signal,RSI
count,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,NaT,,,,,,,,,,,,
min,NaT,,,,,,,,,,,,
25%,NaT,,,,,,,,,,,,
50%,NaT,,,,,,,,,,,,
75%,NaT,,,,,,,,,,,,
max,NaT,,,,,,,,,,,,
std,,,,,,,,,,,,,


In [None]:
summary_df.head(200)

Price,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,Avg Closing Price,SMA_12,SMA_26,MACD,MACD_Signal,RSI
0,2024-03-04,0020.HK,0.890000,0.890000,0.930000,0.890000,0.900000,230878307.0,215.063628,,,,,
1,2024-03-04,002230.SZ,52.330002,52.330002,53.220001,51.509998,52.570000,83211122.0,215.063628,,,,,
2,2024-03-04,AAPL,175.100006,175.100006,176.899994,173.789993,176.149994,81510100.0,215.063628,,,,,
3,2024-03-04,ADBE,567.940002,567.940002,576.250000,564.099976,572.849976,2556400.0,215.063628,,,,,
4,2024-03-04,ADSK,260.700012,260.700012,263.850006,259.660004,263.010010,1945000.0,215.063628,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2024-03-08,IBM,195.949997,195.949997,197.770004,194.380005,196.059998,3942500.0,213.445599,,,,,8.695127
196,2024-03-08,INTC,44.000000,44.000000,46.630001,44.000000,46.439999,54793800.0,213.445599,,,,,8.874277
197,2024-03-08,KYCCF,484.933990,486.000000,501.000000,482.000000,490.000000,1400.0,213.445599,,,,,8.634497
198,2024-03-08,META,505.950012,505.950012,523.570007,499.350006,514.190002,18575200.0,213.445599,,,,,8.452168


In [None]:
summary_df.describe()

Price,Date,Adj Close,Close,High,Low,Open,Volume,Avg Closing Price,SMA_12,SMA_26,MACD,MACD_Signal,RSI
count,871,871.0,871.0,871.0,871.0,871.0,871.0,871.0,387.0,0.0,0.0,0.0,827.0
mean,2024-03-16 13:35:03.788748544,213.986728,214.004053,217.21696,211.245113,214.532403,23358970.0,214.004053,214.487286,,,,49.381161
min,2024-03-04 00:00:00,0.7,0.7,0.74,0.7,0.7,0.0,48.720001,0.805833,,,,0.0
25%,2024-03-08 00:00:00,67.359997,67.359997,68.233002,66.77,67.690002,1639400.0,211.659332,67.330417,,,,16.007791
50%,2024-03-15 00:00:00,158.139999,158.139999,160.300003,157.210007,158.389999,5221100.0,214.122041,159.787497,,,,41.930849
75%,2024-03-22 00:00:00,297.960129,298.0,301.369995,294.894989,299.289993,22246600.0,216.043947,303.314166,,,,85.960655
max,2024-04-01 00:00:00,1047.390015,1047.390015,1056.339966,1014.820007,1038.880005,528954200.0,220.381481,976.792506,,,,100.0
std,,214.125696,214.141112,217.80782,210.83055,214.680297,51506310.0,6.33006,214.828903,,,,35.286579


In [None]:
summary_df.count()

Price
Date                 871
Ticker               871
Adj Close            871
Close                871
High                 871
Low                  871
Open                 871
Volume               871
Avg Closing Price      0
SMA_12               387
SMA_26                 0
MACD                   0
MACD_Signal            0
RSI                  827
dtype: int64

In [None]:
summary_df.isna().sum()

Price
Date                   0
Ticker                 0
Adj Close              0
Close                  0
High                   0
Low                    0
Open                   0
Volume                 0
Avg Closing Price    871
SMA_12               484
SMA_26               871
MACD                 871
MACD_Signal          871
RSI                   44
dtype: int64

In [None]:
csv_file_path = "/content/gdrive/MyDrive/stocks.csv"
summary_df.to_csv(csv_file_path, index=False)
print(f"Data Saved to {csv_file_path}")

Data Saved to /content/gdrive/MyDrive/stocks.csv
