<a href="https://colab.research.google.com/github/vikrammitra/prometheus/blob/main/AlgoTradingTest01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sourcing Alpaca Credentials

In [1]:
credentials = open("/content/drive/MyDrive/Colab Notebooks/alpaca.txt","r")
lines = credentials.readlines()
consumer_key = lines[0].rstrip().replace("alpaca_key=","")
consumer_secret = lines[1].rstrip().replace("alpaca_secret=","")
#print(consumer_secret)


In [2]:
!pip install alpaca-trade-api requests beautifulsoup4 transformers

Collecting alpaca-trade-api
  Downloading alpaca_trade_api-3.0.2-py3-none-any.whl (34 kB)
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Collecting websockets<11,>=9.0 (from alpaca-trade-api)
  Downloading websockets-10.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.8/106.8 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting msgpack==1.0.3 (from alpaca-trade-api)
  Downloading msgpack-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (323 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.7/323.7 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp==3.8.2 (from alpaca-trade-api)
  Downloading aiohttp-3.8.2-cp310-cp310-manylinux_2_17_x86_64.man

In [3]:
import pandas as pd
from alpaca_trade_api import REST, Stream, TimeFrame,TimeFrameUnit
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import timedelta

API_KEY = consumer_key
API_SECRET = consumer_secret

def fetch_news_summary(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        summary = soup.find('p').text
        return summary.strip()
    except Exception as e:
        print(f"Error fetching news summary: {e}")
        return None

def get_news_data(ALPACA_API_KEY,ALPACA_SECRET_KEY,ticker):
    # Get News for last 24 hrs
    today = datetime.today() - timedelta(1)
    yesterday = today - timedelta(15)
    today = today.strftime('%Y-%m-%d')
    yesterday = yesterday.strftime('%Y-%m-%d')

    # Initialize Alpaca API
    # Get the news data
    rest_client = REST(ALPACA_API_KEY, ALPACA_SECRET_KEY)
    news_list = rest_client.get_news(ticker,yesterday, today,limit=100)
    rest_client2 = REST(ALPACA_API_KEY, ALPACA_SECRET_KEY)
    bars = rest_client2.get_bars(ticker, TimeFrame(59, TimeFrameUnit.Minute), yesterday,today, adjustment='raw').df
    bars['time'] = bars.index.strftime('%Y-%m-%d %H:%M')
    bars['Ticker'] = ticker

    # Convert the news data into a pandas DataFrame
    data = {
        'Title': [],
        'time': [],
        'Source': [],
        'Url': [],
        'Summary': [],
        'Ticker' : []
    }

    for news in news_list:
        data['Title'].append(news.headline)
        data['time'].append(news.created_at.strftime('%Y-%m-%d %H:%M'))
        data['Source'].append(news.source)
        data['Url'].append(news.url)
        data['Summary'].append(fetch_news_summary(news.url))
        data['Ticker'].append(ticker)

    df = pd.DataFrame(data)
    df["time"] = pd.to_datetime(df["time"])
    bars["time"] = pd.to_datetime(bars["time"])

    return df,bars

if __name__ == "__main__":
    news_df,bars_data = get_news_data(API_KEY,API_SECRET,"TSLA")

In [4]:
from transformers import pipeline,AutoModel, AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch

def news_sentiment_handler(news):

      df_array = np.array(news[["Title"]])
      df_list = list(df_array[:,0])

      tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
      model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
      inputs = tokenizer(df_list, padding = True, truncation = True, return_tensors='pt') #tokenize text to be sent to model
      outputs = model(**inputs)
      predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

      model.config.id2label
      positive = predictions[:, 0].tolist()
      negative = predictions[:, 1].tolist()
      neutral = predictions[:, 2].tolist()
      table = {'Headline':df_list, "Positive":positive, "Negative":negative, "Neutral":neutral}
      sentiment_data = pd.DataFrame(table, columns = ["Headline", "Positive", "Negative", "Neutral"])

      return(sentiment_data)

sentiment_df = news_sentiment_handler(news_df)



Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
news_sentiment_df = news_df.merge(sentiment_df, left_on='Title', right_on='Headline')


In [53]:
merged_dataframe = pd.merge_asof(bars_data,news_sentiment_df.sort_values('time'), on="time", by="Ticker",
                                 tolerance=pd.Timedelta("60m"))
merged_dataframe = merged_dataframe.dropna()
merged_dataframe.count()


open           44
high           44
low            44
close          44
volume         44
trade_count    44
vwap           44
time           44
Ticker         44
Title          44
Source         44
Url            44
Summary        44
Headline       44
Positive       44
Negative       44
Neutral        44
dtype: int64

In [54]:
merged_dataframe['return'] = np.log(merged_dataframe['close']/merged_dataframe['close'].shift(1))
merged_dataframe['target'] = np.where(merged_dataframe['return'] > 0, 1, 0)
merged_dataframe=merged_dataframe.dropna()

In [55]:
model_input = merged_dataframe.drop(['time','Url','Summary','Headline','Title','Ticker','Source'], axis=1)
X = model_input.drop(['target'],axis = 1)
Y = model_input.target

In [58]:
from sklearn.model_selection import train_test_split

#splitting the data and building the training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1, stratify=Y)

In [59]:
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def model_selection(X,Y):
 seed = 7
 models = []
 models.append(('LogisticRegression', LogisticRegression(random_state=seed)))
 models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
 models.append(('KNeighborsClassifier', KNeighborsClassifier()))
 models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
 models.append(('RandomForestClassifier', RandomForestClassifier()))
 models.append(('ExtraTreesClassifier',ExtraTreesClassifier(random_state=seed)))
 models.append(('AdaBoostClassifier',AdaBoostClassifier(DecisionTreeClassifier(random_state=seed),random_state=seed,learning_rate=0.1)))
 models.append(('SVM',svm.SVC(random_state=seed)))
 models.append(('GradientBoostingClassifier',GradientBoostingClassifier(random_state=seed)))
 models.append(('MLPClassifier',MLPClassifier(random_state=seed)))
# evaluate each model in turn
 results = []
 names = []
 scoring = 'accuracy'
 for name, model in models:
  kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
  cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
  results.append(cv_results)
  names.append(name)
  msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg)
 return results, names

results, names = model_selection(X_train,y_train)

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)




LogisticRegression: 0.233333 (0.222985)
LinearDiscriminantAnalysis: 0.825000 (0.146487)
KNeighborsClassifier: 0.258333 (0.208999)
DecisionTreeClassifier: 0.975000 (0.075000)
RandomForestClassifier: 0.950000 (0.100000)
ExtraTreesClassifier: 0.841667 (0.205649)
AdaBoostClassifier: 0.975000 (0.075000)
SVM: 0.200000 (0.175594)
GradientBoostingClassifier: 0.975000 (0.075000)
MLPClassifier: 0.508333 (0.319396)


1.0