In [88]:
import yfinance as yf 
import pandas as pd 
import numpy as np 
import datetime as dt

## Defining Helper Functions

In [None]:
def download_data(ticker, _start =dt.date(2000,1,1), _end = dt.date(2023,5,31)):
    return yf.download(ticker, start=_start, end = _end)
    
def calculate_monthly_rsi(df):
    df['diff'] = df['Adj Close'].diff(1)
    df['gain'] = df['diff'].clip(lower=0).round(2).fillna(0)
    df['loss'] = df['diff'].clip(upper=0).abs().round(2).fillna(0)
    window_length = 12
    df['avg_gain'] = df['gain'].rolling(window=window_length, min_periods=1).mean()[:window_length+1]
    df['avg_loss'] = df['loss'].rolling(window=window_length, min_periods=1).mean()[:window_length+1]
    for i, row in enumerate(df['avg_gain'].iloc[window_length+1:]):
        df['avg_gain'].iloc[i + window_length + 1] =(df['avg_gain'].iloc[i + window_length] *(window_length - 1) +df['gain'].iloc[i + window_length + 1])/ window_length
    for i, row in enumerate(df['avg_loss'].iloc[window_length+1:]):
        df['avg_loss'].iloc[i + window_length + 1] =(df['avg_loss'].iloc[i + window_length] * (window_length - 1) + df['loss'].iloc[i + window_length + 1])/window_length
        df['rs'] = df['avg_gain'] / df['avg_loss']
        df['rsi'] = 100 - (100 / (1.0 + df['rs']))
    return df

def post_Processing(df, cri):
    df1 = df.drop(columns=['tic'])
    df = df1.reset_index().dropna()
    df = df.rename(columns={'yyyy':'year', 'mm':'month'})
    cri1 = cri.merge(rsi_data, on=['year','month','tic'], how='inner')
    return cri1
def calculate_monthly_MACD(df):
    k = df['Close'].ewm(span=12, adjust=False, min_periods=0).mean()
    # Get the 12-day EMA of the closing price
    d = df['Close'].ewm(span=26, adjust=False, min_periods=0).mean()
    # Subtract the 26-day EMA from the 12-Day EMA to get the MACD
    macd = k - d
    # Get the 9-Day EMA of the MACD for the Trigger line
    macd_s = macd.ewm(span=9, adjust=False, min_periods=0).mean()
    # Calculate the difference between the MACD - Trigger for the Convergence/Divergence value
    macd_h = macd - macd_s
    # Add all of our new values for the MACD to the dataframe
    df['macd'] = df.index.map(macd)
    df['macd_h'] = df.index.map(macd_h)
    df['macd_s'] = df.index.map(macd_s)
    pd.set_option("display.max_columns", None)
    return df

def calculate_sortino_ratio(df):
    window = 12  # 12-month window
    df['Returns'] = df['Adj Close'].pct_change()
    df['Returns'].fillna(0, inplace=True)
    def sortino_ratio(returns):
        returns.apply(lambda x: 0 if x<0 else x )
        downside_deviation = np.std(returns)

        average_return = returns.mean()
        sortino = (average_return ) / downside_deviation if downside_deviation != 0 else 0
        return sortino

    df['Rolling_Sortino'] = df['Returns'].rolling(window).apply(sortino_ratio)

    return df

## Initial Data-Pull for all Unique Tickers from the CRI Dataset


In [None]:
company_PV_data = pd.DataFrame()
unique_Tickers = compustat_data_ALL['tic'].unique()
for i in unique_Tickers:
    try:
        temp_df = download_data(i) 
        temp_df.reset_index(inplace=True)
        temp_df['yyyy'] = temp_df.Date.apply(lambda x: x.year)
        temp_df['mm'] = temp_df.Date.apply(lambda x: x.month)
        temp_df.drop(['Date'],axis=1, inplace=True)
        grouped = temp_df.groupby(by = ['yyyy','mm']).mean()
        grouped.reset_index(inplace=True)
        grouped['tic'] = i
        company_PV_data = pd.concat([company_PV_data, grouped])
    except Exception as e:
        print(e)


# Loading in the values from YFinance data, which was aggregated monthly, then calculating the rolling mean of the volume

In [None]:
from pandas_gbq import read_gbq
df = read_gbq(f"""
SELECT *
FROM capstone-402105.raw_data.yfinance_cleaned
""",project_id='capstone-402105', dialect='standard')

In [366]:

df = pd.read_csv("YFinance_data_cleaned.csv")
df['date'] = pd.to_datetime(df['yyyy'].astype(str) + df['mm'].astype(str), format='%Y%m')
df.sort_values(by=['tic', 'date'], inplace=True)
results = pd.DataFrame()
for ticker, group in df.groupby('tic'):
    rolling_window = group.rolling('365D', on='date')
    group['rolling_mean'] = rolling_window['Volume'].mean()

    results = pd.concat([results, group])

results['1YearVolumeRatio'] = results['Volume']/results['rolling_mean']
MACD_data = df.groupby('tic').apply(calculate_monthly_MACD)
rsi_data = df.groupby('tic').apply(calculate_monthly_rsi)
rsi_data1 = rsi_data.drop(columns=['tic'])
rsi_data= rsi_data1.reset_index().dropna()
result = df.groupby('tic').apply(calculate_sortino_ratio)



Downloading:   0%|[32m                                                                                                                                                                                                                                                                               [0m|[0m


In [None]:


MACD_data = MACD_data[['yyyy', 'mm', 'tic', 'macd', 'macd_h', 'macd_s']]
rsi_data = rsi_data1[['yyyy', 'mm','tic','rsi']]
result.reset_index(inplace=True)
result = result[['yyyy', 'mm','tic','Rolling_Sortino']]
MACD_data.drop(columns=['tic'], inplace=True)
MACD_data.reset_index(inplace=True)
final_df = result.merge(MACD_data, on = ['yyyy', 'mm', 'tic'], how = 'inner')
final_df = final_df.merge(rsi_data,  on = ['yyyy', 'mm', 'tic'], how = 'inner')
final_df = final_df[['tic', 'year', 'month', 'Rolling_Sortino', 'macd', 'macd_h', 'macd_s', 'rsi']]
final_df

## Pushing the Data to BigQuery

In [349]:
import pandas as pd
from google.oauth2 import service_account
import pandas_gbq as gbq

In [351]:
credentials_path = '../token.json'

# Authenticate with your credentials
credentials = service_account.Credentials.from_service_account_file(
    credentials_path, scopes=['https://www.googleapis.com/auth/bigquery'])

# Set the credentials for pandas_gbq
gbq.context.credentials = credentials

In [352]:
import json 
from pandas_gbq import to_gbq
with open('../token.json', 'r') as token_file:
    token_data = json.load(token_file)
    project_id = token_data.get('project_id', 'default-project-id')

dataset_id = "raw_data"
table_id = 'yfinance'

to_gbq(yf_cleaned, destination_table=f'{project_id}.{dataset_id}.{table_id}', project_id=project_id, if_exists='replace')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1663.75it/s]


In [353]:
final_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


In [354]:
import json 
from pandas_gbq import to_gbq
with open('../token.json', 'r') as token_file:
    token_data = json.load(token_file)
    project_id = token_data.get('project_id', 'default-project-id')

dataset_id = "capstone"
table_id = 'yfinance_cleaned'

to_gbq(final_df, destination_table=f'{project_id}.{dataset_id}.{table_id}', project_id=project_id, if_exists='replace')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1430.53it/s]
