In [33]:
from database.market import Market
from database.sec import SEC
from database.adatabase import ADatabase
from modeler.modeler import Modeler as m
from processor.processor import Processor as p
from datetime import datetime, timedelta
import pytz
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [34]:
# Create the two database handles used by the rest of the notebook.
# Connections are opened and closed explicitly in the cells that query them.
market, sec = Market(), SEC()

In [35]:
# Load the "sp500" table; later cells read its "Symbol" and "CIK" columns.
market.connect()
try:
    sp5 = market.retrieve("sp500")
finally:
    # Always release the connection, even if the retrieval raises.
    market.disconnect()

In [None]:
# Column names that are filtered out when the `factors` list is built from the
# most-populated filing columns.
# NOTE(review): these look like submission-level metadata fields rather than
# financial values — confirm against the filing schema.
unrequired = [
    "adsh", "cik", "name", "sic",
    "countryba", "stprba", "cityba", "zipba", "bas1", "bas2", "baph",
    "countryma", "stprma", "cityma", "zipma", "mas1", "mas2",
    "countryinc", "stprinc",
    "ein", "former", "changed", "afs", "wksi",
    "fye", "form", "period", "fy", "fp",
    "date", "accepted", "prevrpt", "detail", "instance",
    "nciks", "aciks",
]

In [53]:
# For every ticker in sp5, pull its filing data and accumulate, per column,
# the number of non-null values seen across all tickers. The totals in
# `training_columns` are what the `factors` cell ranks columns by.
sec.connect()
training_data = []  # never filled in this cell; kept so the name stays defined
training_columns = {}
try:
    for ticker in tqdm(list(sp5["Symbol"].unique())):
        try:
            # .item() raises unless exactly one sp5 row matches the ticker.
            cik = sp5[sp5["Symbol"]==ticker]["CIK"].item()
            filing = sec.retrieve_filing_data(cik)
            ticker_financials = filing.rename(columns={"filed":"date"})
            # A filing without a "filed" column fails here with KeyError 'date'.
            ticker_financials["date"] = [datetime.strptime(str(x),"%Y%m%d") for x in ticker_financials["date"]]
            ticker_financials["year"] = [x.year for x in ticker_financials["date"]]
            ticker_financials["quarter"] = [x.quarter for x in ticker_financials["date"]]
            ticker_financials = p.column_date_processing(ticker_financials)
            for col in ticker_financials.columns:
                non_null = ticker_financials[col].dropna().index.size
                training_columns[col] = training_columns.get(col, 0) + non_null
            # Added after counting, so "ticker" never enters training_columns.
            ticker_financials["ticker"] = ticker
        except Exception as e:
            # Best effort: report the failing ticker and move on to the next one.
            print(ticker,str(e))
finally:
    # Release the connection even if the long-running loop is interrupted.
    sec.disconnect()

 39%|██████████████████████████████████████████████████████████▋                                                                                            | 193/497 [01:25<02:43,  1.85it/s]

FRC 'date'


 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 410/497 [03:00<00:50,  1.71it/s]

SBNY 'date'


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 497/497 [03:37<00:00,  2.28it/s]


In [62]:
# Rank columns by their accumulated non-null count (highest first), keep the
# top 48, then drop every name listed in `unrequired`.
ranked_columns = [
    name
    for name, _count in sorted(training_columns.items(), key=lambda item: item[1], reverse=True)
]
factors = []
for candidate in ranked_columns[:48]:
    if candidate in unrequired:
        continue
    factors.append(candidate)
factors

['year',
 'quarter',
 'week',
 'assets',
 'liabilitiesandstockholdersequity',
 'incometaxexpensebenefit',
 'accumulatedothercomprehensiveincomelossnetoftax',
 'retainedearningsaccumulateddeficit',
 'earningspersharediluted',
 'earningspersharebasic',
 'weightedaveragenumberofsharesoutstandingbasic',
 'weightedaveragenumberofdilutedsharesoutstanding',
 'cashandcashequivalentsatcarryingvalue',
 'entitycommonstocksharesoutstanding',
 'stockholdersequity',
 'propertyplantandequipmentnet',
 'netincomeloss']

In [63]:
# Build the list of tickers whose filing data contains every column in
# `factors`; only those tickers are kept in `plausible_pool`.
sec.connect()
plausible_pool = []
try:
    for ticker in tqdm(list(sp5["Symbol"].unique())):
        try:
            # .item() raises unless exactly one sp5 row matches the ticker.
            cik = sp5[sp5["Symbol"]==ticker]["CIK"].item()
            filing = sec.retrieve_filing_data(cik)
            ticker_financials = filing.rename(columns={"filed":"date"})
            ticker_financials["date"] = [datetime.strptime(str(x),"%Y%m%d") for x in ticker_financials["date"]]
            ticker_financials["year"] = [x.year for x in ticker_financials["date"]]
            ticker_financials["quarter"] = [x.quarter for x in ticker_financials["date"]]
            ticker_financials = p.column_date_processing(ticker_financials)
            ticker_financials["ticker"] = ticker
            # Keep the ticker only when no factor column is missing.
            if all(col in ticker_financials.columns for col in factors):
                plausible_pool.append(ticker)
        except Exception as e:
            # Include the ticker so a failure can be attributed, matching the
            # message format used by the column-counting cell.
            print(ticker,str(e))
finally:
    # Release the connection even if the loop is interrupted.
    sec.disconnect()

 39%|██████████████████████████████████████████████████████████▋                                                                                            | 193/497 [00:53<01:39,  3.06it/s]

'date'


 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 412/497 [01:54<00:21,  3.95it/s]

'date'


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 497/497 [02:17<00:00,  3.62it/s]


In [64]:
# Number of tickers whose filing data contained every column in `factors`.
len(plausible_pool)

391

In [41]:
# For each ticker in `plausible_pool`, pull its filing data, keep only the
# `factors` columns plus a "ticker" label, and collect the rows as records.
sec.connect()
financial_data = []
try:
    for ticker in tqdm(plausible_pool):
        try:
            # .item() raises unless exactly one sp5 row matches the ticker.
            cik = sp5[sp5["Symbol"]==ticker]["CIK"].item()
            filing = sec.retrieve_filing_data(cik)
            ticker_financials = filing.rename(columns={"filed":"date"})
            ticker_financials["date"] = [datetime.strptime(str(x),"%Y%m%d") for x in ticker_financials["date"]]
            ticker_financials["year"] = [x.year for x in ticker_financials["date"]]
            ticker_financials["quarter"] = [x.quarter for x in ticker_financials["date"]]
            ticker_financials = p.column_date_processing(ticker_financials)
            # .assign returns a new frame, so the label is not written into a
            # slice of ticker_financials (avoids SettingWithCopyWarning).
            final_set = ticker_financials[factors].assign(ticker=ticker)
            financial_data.extend(final_set.to_dict("records"))
        except Exception as e:
            # Best effort: report which ticker failed and continue.
            print(ticker,str(e))
finally:
    # Release the connection even if the loop is interrupted.
    sec.disconnect()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 367/367 [01:43<00:00,  3.56it/s]


In [42]:
# One row per collected filing record, one column per factor plus "ticker".
standardized_set = pd.DataFrame(data=financial_data)

In [43]:
# Fraction of missing values per column of standardized_set.
# isna().mean() is the vectorised form of "count of NaNs / row count" and
# yields NaN instead of raising ZeroDivisionError when the frame has no rows.
dq = [
    {"col": col, "rate": standardized_set[col].isna().mean()}
    for col in standardized_set.columns
]

In [44]:
# Tabulate the per-column missing rates, lowest rate first.
missing_rates = pd.DataFrame(data=dq)
column_rates = missing_rates.sort_values(by="rate")

In [45]:
# Keep only the columns whose missing-value rate is below 10%.
low_missing_columns = column_rates.loc[column_rates["rate"] < 0.10, "col"]
parse_two = standardized_set[low_missing_columns]

In [46]:
# Impute missing values per ticker: within each ticker's rows, every numeric
# column has its NaNs replaced by that ticker's mean for the column.
final_parse = []
for ticker in parse_two["ticker"].unique():
    # Work on an explicit copy; filling a filtered slice in place is not
    # guaranteed to write back (and is a no-op under pandas copy-on-write).
    ticker_data = parse_two[parse_two["ticker"]==ticker].copy()
    # Restrict to numeric columns instead of swallowing every error from
    # calling .mean() on non-numeric columns such as "ticker".
    for col in ticker_data.select_dtypes(include="number").columns:
        # A column that is entirely NaN for this ticker has a NaN mean and
        # therefore stays NaN.
        ticker_data[col] = ticker_data[col].fillna(ticker_data[col].mean())
    final_parse.extend(ticker_data.to_dict("records"))

In [47]:
# Reassemble the per-ticker imputed records into a single frame.
final_set = pd.DataFrame(data=final_parse)

In [48]:
# Fraction of missing values per column of final_set (after imputation).
# isna().mean() is the vectorised form of "count of NaNs / row count" and
# yields NaN instead of raising ZeroDivisionError when the frame has no rows.
dq = [
    {"col": col, "rate": final_set[col].isna().mean()}
    for col in final_set.columns
]

In [50]:
# Tabulate the recomputed missing rates, lowest first, and display the table.
rates_after_fill = pd.DataFrame(data=dq)
column_rates = rates_after_fill.sort_values(by="rate")
column_rates

Unnamed: 0,col,rate
0,year,0.0
17,netincomeloss,0.0
16,goodwill,0.0
15,propertyplantandequipmentnet,0.0
14,weightedaveragenumberofdilutedsharesoutstanding,0.0
13,weightedaveragenumberofsharesoutstandingbasic,0.0
12,cashandcashequivalentsatcarryingvalue,0.0
11,entitycommonstocksharesoutstanding,0.0
10,earningspersharediluted,0.0
9,earningspersharebasic,0.0
