In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import plotly.express as px

# Data Exploration

In [None]:
# Load the processed analyst-ratings headlines; the first CSV column becomes the index.
data_1 = pd.read_csv(
    "data/daily_financial_news/analyst_ratings_processed.csv",
    index_col=0,
)
data_1.head()

In [None]:
# Keep only the part of each 'date' string that precedes the first space.
data_1['date'] = data_1['date'].str.split(' ').str[0]

In [None]:
data_1.head()

In [None]:
data_1['stock'].nunique()

In [None]:
data_1.info()

## Drop Null Values

In [None]:
print(data_1.isna().sum())

In [None]:
print(data_1.shape)

In [None]:
data_1[data_1['date'].isna()]

In [None]:
# Remove, in place, every row that is missing its date or its ticker symbol.
data_1.dropna(subset=['date', 'stock'], inplace=True)

In [None]:
print(data_1.shape)

In [None]:
print(data_1.isna().sum())

In [None]:
data_1.info()

## Filter stocks

In [None]:
# Count headlines per ticker, rank the tickers by that count (largest first)
# and keep the first 100 rows of the ranking.
top_100_stocks_by_headlines = (
    data_1.groupby('stock')
    .size()
    .reset_index(name='size')
    .sort_values('size', ascending=False)
    .reset_index(drop=True)
    .head(100)
)
top_100_stocks_by_headlines.head()

In [None]:
# Keep only the headlines that belong to the selected tickers. The explicit .copy()
# turns the filtered result into an independent frame, so the column assignments made
# in the following cells do not trigger pandas' SettingWithCopyWarning.
data_1 = data_1[data_1['stock'].isin(top_100_stocks_by_headlines['stock'])].copy()

In [None]:
data_1.shape

## Clean Text Data

In [None]:
# Lower-case every headline before the text is cleaned and tokenised.
data_1['title'] = data_1['title'].str.lower()

In [None]:
data_1.head()

### Remove Punctuation and Digits

In [None]:
import string

def remove_punctuation(text):
    """Return ``text`` with punctuation and digit characters removed.

    A character is discarded when it appears in ``string.punctuation`` or when
    ``str.isdigit`` is true for it; every other character (including
    whitespace) is kept in its original order.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text.
    """
    kept = []
    for ch in text:
        if ch.isdigit() or ch in string.punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

# Strip punctuation and digit characters from every headline.
data_1['title'] = data_1['title'].apply(remove_punctuation)

In [None]:
data_1.head()

In [None]:
# Break each cleaned headline into whitespace-delimited tokens.
data_1['tokens'] = data_1['title'].map(str.split)
data_1.head()

### Remove stop words

In [None]:
from nltk.corpus import stopwords
import nltk
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

stop = stopwords.words('english')
# Testing membership against a list scans the list for every token; a frozenset gives
# constant-time lookups and produces exactly the same filtering. `stop` is left as-is.
stop_lookup = frozenset(stop)

# Drop every token that is an English stop word.
data_1['tokens'] = data_1['tokens'].apply(lambda x: [word for word in x if word not in stop_lookup])

In [None]:
data_1.shape

### Lemmatize tokens

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
# Make sure the WordNet data is available locally before lemmatising.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Replace every token by the result of lemmatizer.lemmatize (default arguments).
data_1['tokens'] = data_1['tokens'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)


In [None]:
data_1.head()

In [None]:
# Re-assemble the processed tokens into one space-separated string per headline.
data_1['preprocessed_text'] = data_1['tokens'].map(lambda tokens: ' '.join(tokens))
data_1.head()

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
# VADER (NLTK's SentimentIntensityAnalyzer) is a lexicon- and rule-based scorer: words
# found in its lexicon contribute a valence and the result is summarised in a
# 'compound' score.
# NOTE(review): as far as the NLTK documentation describes it, VADER does not remove
# stop words itself, and its rules use negations, capitalisation and punctuation —
# which the preprocessing cells above strip. Confirm that scoring the preprocessed
# text rather than the raw headline is intended.

In [None]:
# Score every pre-processed headline and keep the 'compound' polarity value.
# assign() returns a new frame, so data_1 itself does not gain the column.
data_duplicate = data_1.assign(
    sentiment_score=data_1['preprocessed_text'].apply(
        lambda text: sia.polarity_scores(text)['compound']
    )
)
data_duplicate

In [None]:
data_duplicate['sentiment_score'].describe()

In [None]:
data_duplicate.info()

In [None]:
data_duplicate.head()

In [None]:
# Independent copy holding only the columns needed for the sentiment signal.
signal_columns = ['date', 'stock', 'sentiment_score']
signals_df = data_duplicate.loc[:, signal_columns].copy()

In [None]:
signals_df.dtypes

In [None]:
# A ticker can have several headlines on the same day. Collapse them into a single
# row per (date, stock) pair by averaging the sentiment scores.
non_dup_signals_df = (
    signals_df
    .groupby(['date', 'stock'], as_index=False)['sentiment_score']
    .mean()
)

# Pull yfinance data

In [None]:
# Distinct ticker symbols present in the scored headlines.
tickers = data_duplicate['stock'].unique()

In [None]:
# Earliest and latest headline date. min()/max() give the same values as sorting the
# column and taking the first/last element (rows with a missing date were dropped
# earlier), without sorting the whole column twice.
# NOTE(review): the dates are compared as strings here — presumably ISO 'YYYY-MM-DD',
# where string order equals chronological order; confirm against the CSV.
# NOTE(review): yfinance documents `end` as exclusive, so prices for end_date itself
# are not downloaded — confirm this is acceptable.
start_date, end_date = data_duplicate['date'].min(), data_duplicate['date'].max()

In [None]:
# Download daily prices for all tickers over the headline date range.
# auto_adjust=False is passed explicitly because the 'Adj Close' column read in the
# next cell is only returned when prices are not auto-adjusted, and newer yfinance
# releases switched the default to auto_adjust=True.
data = yf.download(list(tickers), start=start_date, end=end_date, auto_adjust=False)

In [None]:
# Keep only the adjusted closing prices from the downloaded price table.
adj_close_data = data['Adj Close']

In [None]:
# Tickers whose price column is NaN on every row (share of missing values equals 1)
missing_data_tickers = adj_close_data.columns[adj_close_data.isna().mean().eq(1)]

# Remove those tickers from the price table
adj_close_data = adj_close_data.drop(columns=missing_data_tickers)

In [None]:
# Also discard every ticker that has at least one missing price.
adj_close_data = adj_close_data.dropna(axis='columns', how='any')

In [None]:
# Sanity check: True if any remaining ticker column still contains a missing price.
bool((adj_close_data.isna().sum() > 0).any())

In [None]:
# Row-over-row percentage change per ticker. Rows containing NaN (such as the first
# row, which has no previous price) are dropped, then the index is turned into a
# regular column and renamed from 'Date' to 'date'.
returns_df = (
    adj_close_data
    .pct_change()
    .dropna()
    .reset_index()
    .rename(columns={'Date': 'date'})
)

In [None]:
# Reshape from wide (one column per ticker) to long (one row per date/ticker pair).
returns_df_melt = pd.melt(
    returns_df,
    id_vars='date',
    var_name='stock',
    value_name='daily_returns',
)
# Make sure 'date' is a datetime column.
returns_df_melt['date'] = pd.to_datetime(returns_df_melt['date'])
returns_df_melt.head()

In [None]:
non_dup_signals_df.sort_values('date').head()

In [None]:
returns_df_melt.dtypes

In [None]:
# Parse the signal dates to datetime so they can be matched against
# returns_df_melt['date'] (also converted with pd.to_datetime) in the merge below.
non_dup_signals_df['date'] = pd.to_datetime(non_dup_signals_df['date'])

In [None]:
# Attach each (date, stock) sentiment score to the matching daily return. Rows left
# with any NaN after the left join (e.g. no headline for that ticker on that date)
# are discarded.
ml_df = returns_df_melt.merge(
    non_dup_signals_df,
    how='left',
    on=['date', 'stock'],
).dropna()

# Build Machine Learning Models

In [None]:
ml_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Order the observations by date once, renumber the rows, then separate the
# feature columns from the target column.
ml_sorted = ml_df.sort_values("date").reset_index(drop=True)
X = ml_sorted.drop(columns='daily_returns')
y = ml_sorted['daily_returns']

In [None]:
def ts_train_test_split(data, test_size):
    """Split ordered data into a leading train part and a trailing test part.

    The split is purely positional: the first ``1 - test_size`` share of the
    rows (rounded with ``round``) becomes the train set and the remaining rows
    become the test set. No shuffling or sorting is done here, so the caller is
    responsible for passing data in the desired (e.g. chronological) order.

    Args:
        data (pd.DataFrame or pd.Series): Data to split into train and test
        test_size (float): Fraction of rows, between 0 and 1, for the test set

    Returns:
        tuple: train set, test set

    Raises:
        ValueError: If ``test_size`` is not between 0 and 1.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")
    train_size = 1 - test_size
    # Size the split from the object being split rather than from a notebook-level
    # variable, so the function works for any input and outside the notebook.
    train_idx = round(len(data) * train_size)
    return data.iloc[:train_idx], data.iloc[train_idx:]

In [None]:
# Split Data into Training and Validation Sets
# X and y were built from the same date-sorted frame, so applying the same positional
# 80/20 cut to both keeps features and targets aligned and places the last rows
# (the most recent dates) in the validation set.
X_train, X_valid = ts_train_test_split(X, test_size=0.2)
y_train, y_valid = ts_train_test_split(y, test_size=0.2)

In [None]:
# Encoder for the 'stock' column; it returns a dense array and ignores tickers
# that were not present when it was fitted.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit on the training tickers only, then reuse the fitted encoder for validation.
# Column labels and the original row index are attached while each frame is built.
OH_cols_train = pd.DataFrame(
    one_hot_encoder.fit_transform(X_train[['stock']]),
    index=X_train.index,
    columns=one_hot_encoder.get_feature_names_out(['stock']),
)
OH_cols_valid = pd.DataFrame(
    one_hot_encoder.transform(X_valid[['stock']]),
    index=X_valid.index,
    columns=one_hot_encoder.get_feature_names_out(['stock']),
)

# Everything except the raw 'stock' column
num_X_train = X_train.drop(columns='stock')
num_X_valid = X_valid.drop(columns='stock')

# Place the remaining columns and the indicator columns side by side, and drop
# 'date' so it is not passed to the model as a feature.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1).drop(columns=['date'])
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1).drop(columns=['date'])

# Normalise all column labels to str
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [None]:
def score_model(model, X_t, X_v, y_t, y_v):
    """Fit ``model`` on the training data and return its validation MAE.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place by this call.
        X_t: Training features.
        X_v: Validation features.
        y_t: Training targets.
        y_v: Validation targets.

    Returns:
        The mean absolute error between ``y_v`` and the predictions for ``X_v``.
    """
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    # scikit-learn documents the argument order as (y_true, y_pred).
    return mean_absolute_error(y_v, preds)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
linear_model = LinearRegression()

In [None]:
# Fit the linear model on the training split and display its mean absolute error
# on the validation split.
score_model(linear_model, OH_X_train, OH_X_valid, y_train, y_valid)