# Quarterly
## make sure you're only running the quarterly prediction component and pulling the existing model from the db

In [1]:
import pandas as pd
from database.market import Market
from database.sec import SEC
from modeler.modeler import Modeler as m
from preprocessor.model_preprocessor import ModelPreprocessor as mp
from strategy.quarterly_financial_categorical import QuarterlyFinancialCategorical
from datetime import datetime, timedelta
import numpy as np
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime, timezone
import pickle

In [2]:
market = Market()
sec = SEC("sec")
qfc = QuarterlyFinancialCategorical()

In [3]:
market.connect()
sp5 = market.retrieve_data("sp500")
prices = market.retrieve_data("portfolio_prices")
market.close()

In [4]:
prices["year"]  = [x.year for x in prices["date"]]
prices["quarter"]  = [x.quarter for x in prices["date"]]

In [5]:
quarterly_grouped = prices.groupby(["year","quarter","ticker"]).max()
quarterly_grouped["category"] = [math.ceil(x / 100 ) * 100 for x in quarterly_grouped["adjclose"]]
quarterly_grouped["category"] = [250 if x > 100 else x for x in quarterly_grouped["category"]]
quarterly_grouped["category"] = [500 if x > 250 else x for x in quarterly_grouped["category"]]
quarterly_grouped["category"] = [1000 if x > 500 else x for x in quarterly_grouped["category"]]
quarterly_grouped["category"] = [2000 if x > 1000 else x for x in quarterly_grouped["category"]]
quarterly_grouped["category"] = [3000 if x > 2000 else x for x in quarterly_grouped["category"]]

In [6]:
quarterly_grouped.reset_index(inplace=True)
groups = quarterly_grouped.merge(sp5.rename(columns={"Symbol":"ticker"}), on = "ticker",how="left")

In [7]:
g = groups[["year","quarter","ticker","adjclose","category","GICS Sector","CIK"]]
g["string_category"] = [str(x) for x in g["category"]] 
g["classification"] = g["string_category"] + g["GICS Sector"]

In [8]:
enc = OneHotEncoder(handle_unknown='ignore')
transformed = [[x] for x in g["classification"]]
encoding = enc.fit_transform(transformed)

In [9]:
df_encoding = pd.DataFrame(encoding.toarray())

In [10]:
for col in df_encoding.columns:
    g[col] = df_encoding[col]

In [11]:
yearly_gap = 1
training_years = 1
fails = []
filings = []
columns = []
sec.connect()
for cik in tqdm(list(g["CIK"].unique())):
    try:
        filing = sec.retrieve_filing_data(int(cik))
        symbols = sp5[sp5["CIK"]==cik]["Symbol"]
        if symbols.index.size > 1:
            ticker = str(list(symbols)[0])
        else:
            ticker = symbols.item()
        drop_columns = ["cik","filed","_id","adsh"]
        funds = filing.copy()
        for column in funds.columns:
            if str(column).islower() and str(column) != "filed":
                drop_columns.append(column)
        funds["filed"] = [datetime.strptime(str(x),"%Y%m%d").replace(tzinfo=timezone.utc) if "-" not in str(x) else \
                            datetime.strptime(str(x).split(" ")[0],"%Y-%m-%d").replace(tzinfo=timezone.utc) for x in funds["filed"]]
        funds["quarter"] = [x.quarter for x in funds["filed"]]
        funds["year"] = [x.year + yearly_gap for x in funds["filed"]]
        funds["ticker"] = ticker
        funds.drop(drop_columns,axis=1,inplace=True)
        qa = funds.copy()
        for col in qa.columns:
            test = qa[col].fillna(-99999)
            availability = 1 - (len([x for x in test if x == -99999]) / qa.index.size)
            if availability < 0.95:
                funds.drop(col,inplace=True,axis=1)
        filings.append(funds)
        columns.append(list(funds.columns))
    except Exception as e:
        print("prep",ticker,str(e))
        fails.append([ticker,str(e)])
sec.close()
try:
    f = pd.concat(filings)
    for col in tqdm(f.columns):
        test = f[col].fillna(-99999)
        availability = len([x for x in test != -99999 if x == True]) / test.index.size
        if availability < 0.7:
            f.drop(col,axis=1,inplace=True)
except Exception as e:
    print("mid",str(e))
try:
    data = f.merge(g.drop(["string_category","classification","adjclose","category","GICS Sector","CIK"],axis=1),on=["year","quarter","ticker"],how="left")
    factors = list(data.columns)
    factors = [x for x in factors if x not in ["quarter","year","ticker"]]
    for i in range(22):
        factors.remove(i)
    for col in factors:
        data[col].replace([np.inf, -np.inf,np.nan,np.NaN],f[col].mean(),inplace=True)
except Exception as e:
    print(str(e))


 12%|████████████████▌                                                                                                                       | 60/494 [00:25<03:57,  1.83it/s]

prep BEN cannot convert float NaN to integer


 37%|██████████████████████████████████████████████████▌                                                                                    | 185/494 [01:19<01:46,  2.90it/s]

prep FRC 'filed'


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 494/494 [03:29<00:00,  2.36it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2406/2406 [03:40<00:00, 10.92it/s]

22





In [14]:
data[data["year"]==2020]["quarter"].max()

4

In [15]:
year_range = range(2021,2022)
year_gap = 1
for year in tqdm(year_range):
    try:
        training_data = data[(data["year"] < year) & (data["year"] >= year - year_gap)]
        factors = list(data.columns)
        factors = [x for x in factors if x not in ["quarter","year","ticker"]]
        for i in range(23):
            factors.remove(i)
        for col in factors:
            training_data[col].replace([np.inf, -np.inf,np.nan,np.NaN],training_data[col].mean(),inplace=True)
        training_data.dropna(inplace=True)
        x = training_data[factors]
        y = training_data[[x for x in range(23)]]
        prediction_data = data[(data["year"]==year)]
        refined_data = {"X":x.reset_index(drop=True),"y":y.reset_index(drop=True)}
        classification_models = m.classification(refined_data.copy(),deep=False,tf=False,sk=False,xgb=True,multioutput=True)
        models = pd.DataFrame([classification_models])
        model = models["model"].item()
        for col in factors:
            prediction_data[col].replace([np.inf, -np.inf,np.nan,np.NaN],prediction_data[col].mean(),inplace=True)
        prediction_data.dropna(inplace=True)
        predictions = enc.inverse_transform(model.predict(prediction_data[factors]))
        prediction_data["prediction"] = [x[0] for x in predictions]
        sim = prediction_data[["year","quarter","ticker","prediction"]]
    except Exception as e:
        print(str(e))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.21s/it]


In [22]:
qfc.db.connect()
qfc.db.store_data("portfolio_stock_category_sim",sim[sim["quarter"]==3])
qfc.db.close()

In [24]:
models["model"] = [pickle.dumps(x) for x in models["model"]]

In [26]:
qfc.db.connect()
qfc.db.store_data("portfolio_stock_category_model",models)
qfc.db.close()