# Neural Net vs. Goldman
Can an LSTM neural net trained on fundamentals extracted from edgar XBRL pick the same long and short list as Goldman's hedgefund meta list

## Data Sources
Download price and fundamental data using [pystock-crawler](https://github.com/eliangcs/pystock-crawler):  

```
pystock-crawler symbols NYSE,NASDAQ,AMEX --sort -w data -o data/symbols.csv -l data/symbols.log
pystock-crawler reports data/symbols.csv -w data -o data/reports.csv -l data/reports.log
pystock-crawler prices data/symbols.csv -w data -o data/prices.csv -l data/prices.log
```

In [81]:
import numpy as np
import pandas
reports = pandas.read_csv('data/reports.csv', parse_dates=['end_date'], index_col=1).groupby('symbol')
prices = pandas.read_csv('data/prices.csv', parse_dates=['date'], index_col=1).groupby('symbol')
print "Loaded", len(reports), "reports and", len(prices), "prices"

Loaded 4514 reports and 6103 prices


In [83]:
WINDOW_SIZE = 4
data = [{"symbol": s, 
         "reports": reports.get_group(s).sort(ascending=True),
         "prices": prices.get_group(s).sort(ascending=True)} 
        for s, r in reports.filter(lambda r: r['symbol'] in prices.groups)]
# data[0]["symbol"]

# reports.filter(lambda g: len(g) >= WINDOW_SIZE)

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [69]:
pandas.notnull(prices.get_group('23'))




KeyError: '23'

In [24]:
# Debugging to verify start and end values
symbol = 'A'
r = reports[reports.symbol == symbol].sort(ascending=True)
p = prices[prices.symbol == symbol].sort(ascending=True)
print "Found", len(r), "reports for", symbol

# Add adjusted closing stock price start days after the report and end days after the report
r['start'] = r.index.map(lambda x: p.ix[p.index[p.index.searchsorted(x, side='right')]]['adj_close'])
r['end'] = r.index.map(lambda x: p.ix[p.index[p.index.searchsorted(x, side='right') + 2]]['adj_close'])
r[['start', 'end']]

Found 25 reports for A


Unnamed: 0_level_0,start,end
end_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-07-31,16.548454,16.258251
2009-10-31,17.322323,17.308499
2010-01-31,20.127615,20.410911
2010-04-30,25.406537,24.017708
2010-07-31,19.913413,19.547208
2010-10-31,24.010801,24.259546
2011-01-31,29.054792,28.322381
2011-04-30,34.85193,34.202439
2011-07-31,28.184192,27.085567
2011-10-31,24.383923,25.65528


In [145]:
import sys
def features(symbol, reports, prices, window_size=4, overlap=1, start_days=1, end_days=10):
    """
    Return an X vector consisting a time series of finanical metric vectors
    for the given stock split into multiple normalized overlapping windows 
    and y, the % change in the stock price from start to end days after 
    the last report in the window.
    """
    
    # Get the data for just this symbol
    r = reports[reports.symbol == symbol].sort(ascending=True)
    p = prices[prices.symbol == symbol].sort(ascending=True)
    
    if len(r) < 4 or len(p) == 0:
        print "Skipping {}: {} reports and {} prices".format(symbol, len(r), len(p))
        return None, None
       
    # Add adjusted closing price start and end days after the report
    r['start'] = r.index.map(lambda x: p.ix[p.index[p.index.searchsorted(x, side='right') + start_days]]['adj_close'])
    r['end'] = r.index.map(lambda x: p.ix[p.index[p.index.searchsorted(x, side='right') + end_days]]['adj_close'])

    # Fixup annual 10-k numbers by subtracting the prior 3 quarters
    # REMIND: Go back and verify the adjustments
    for c in ['revenues', 'op_income', 'net_income',
     'eps_basic', 'eps_diluted', 'dividend',
     'cash_flow_op', 'cash_flow_inv', u'cash_flow_fin']:
        r[c + '_adj'] = r[c] - r[c].shift(1) - r[c].shift(2) - r[c].shift(3)
        r.ix[r.period_focus == 'FY', c] = r[r.period_focus == 'FY'][c + '_adj']
    
    # Delete all non-numeric columns
    r = r.ix[:,5:-9]
        
    # Change any nan to -1
    r.fillna(-1, inplace=True)
    
    # Divide into overlapping windows
    num_windows = len(reports[reports.symbol == symbol]) - window_size + 1
    X = [r[i:i + window_size] for i in range(len(r) - window_size + 1 - num_windows,len(r) - window_size + 1, overlap)]

    # Calculate +/- % change in stock price n days after the last report in the window
    y = [(x.end[-1] / x.start[-1]) - 1 for x in X]
    
    print "\r{}:{}".format(symbol, len(r)),
    sys.stdout.flush()
    
    if x.shape != (4,16):
        print "Problems with", symbol
        assert False
        
    # Return X with the end price redacted so we don't train on the answer!
    return [x.values[:,0:-1] for x in X], y

In [156]:
data = {s: features(s, reports, prices) for s in symbols['symbol'][0:10]}
# data = {s: features(s, reports, prices) for s in symbols['symbol']}
data = {k: v for k,v in data.items() if v[0]}
print "Generated features vectors for {} stocks".format(len(data))

AA:26 Skipping AAC: 3 reports and 234 prices
Skipping AAL: 3 reports and 2503 prices
AAP:22 Generated features vectors for 8 stocks


KeyboardInterrupt: 

['A',
 'AA',
 'AAMC',
 'AAME',
 'AAN',
 'AAOI',
 'AAON',
 'AAP',
 'AAPL',
 'AAT',
 'AAVL',
 'AAWW',
 'AB',
 'ABAX',
 'ABBV',
 'ABC',
 'ABCB',
 'ABCD',
 'ABCO',
 'ABCW',
 'ABG',
 'ABIO',
 'ABM',
 'ABMD',
 'ABR',
 'ABT',
 'ABTL',
 'ACAD',
 'ACAT',
 'ACC',
 'ACE',
 'ACET',
 'ACFC',
 'ACGL',
 'ACHC',
 'ACHN',
 'ACI',
 'ACIW',
 'ACLS',
 'ACM',
 'ACN',
 'ACNB',
 'ACOR',
 'ACPW',
 'ACRE',
 'ACRX',
 'ACTA',
 'ACTG',
 'ACU',
 'ACUR',
 'ACV',
 'ACW',
 'ACXM',
 'ACY',
 'ADAT',
 'ADBE',
 'ADC',
 'ADEP',
 'ADGE',
 'ADI',
 'ADK',
 'ADM',
 'ADMA',
 'ADMP',
 'ADMS',
 'ADP',
 'ADPT',
 'ADS',
 'ADSK',
 'ADT',
 'ADTN',
 'ADUS',
 'ADXS',
 'AE',
 'AEE',
 'AEGN',
 'AEGR',
 'AEHR',
 'AEIS',
 'AEL',
 'AEO',
 'AEP',
 'AEPI',
 'AERI',
 'AES',
 'AET',
 'AETI',
 'AEY',
 'AF',
 'AFAM',
 'AFCB',
 'AFFX',
 'AFG',
 'AFH',
 'AFL',
 'AFOP',
 'AFSI',
 'AGCO',
 'AGEN',
 'AGII',
 'AGIO',
 'AGM',
 'AGN',
 'AGNC',
 'AGO',
 'AGRX',
 'AGTC',
 'AGX',
 'AGYS',
 'AHC',
 'AHGP',
 'AHH',
 'AHL',
 'AHP',
 'AHPI',
 'AHS',
 'AHT',
 '

In [2]:
import os
import sys
from ipyparallel import Client

def foo(symbol):
    return symbol + " stuff"
    
try:
    symbols = [f.split(".")[0] for f in os.listdir("data/reports") if f not in ".DS_Store"]
    client = Client()
    view = client.load_balanced_view()
    r = view.map(foo, symbols[0:100])
    r.wait_interactive()
except KeyboardInterrupt:
    print "\nCaught KeyboardInterrupt, terminating workers"
    r.abort()
else:
    print "\nFinished computing features"

 100/100 tasks finished after    0 s
done

Finished computing features
['A stuff', 'AA stuff', 'AAMC stuff', 'AAME stuff', 'AAN stuff', 'AAOI stuff', 'AAON stuff', 'AAP stuff', 'AAPL stuff', 'AAT stuff']


In [2]:
import pandas as pd
import multiprocessing as mp

LARGE_FILE = "data/reports.csv"
CHUNKSIZE = 100000 # processing 100,000 rows at a time

def process_frame(df):
        # process data frame
        return len(df)

if __name__ == '__main__':
        reader = pd.read_table(LARGE_FILE, chunksize=CHUNKSIZE)
        pool = mp.Pool(4) # use 4 processes

        funclist = []
        for df in reader:
                # process each data frame
                f = pool.apply_async(process_frame,[df])
                funclist.append(f)

        result = 0
        for f in funclist:
                result += f.get(timeout=10) # timeout in 10 seconds

        print "There are %d rows of data"%(result)

There are 69430 rows of data


In [157]:
# Check that window columns are equal ie 2nd time slice is equal to 1st in successive
assert(np.array_equal(data['A'][0][0][1], data['A'][0][1][0]))

In [158]:
# Split into train/test sets
# Should use sklearn.cross_validation.StratifiedShuffleSplit to try and maintain industry sector % in each
# Or bin by financial size http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/
# s = XY['symbol'].unique()
from sklearn.cross_validation import train_test_split
train_symbols, test_symbols = train_test_split(data.keys(), test_size = 0.17)
print "num train symbols:", len(train_symbols), "num test symbols:", len(test_symbols)
X_train = [data[s][0] for s in train_symbols]
y_train = [data[s][1] for s in train_symbols]
X_test = [data[s][0] for s in test_symbols]
y_test = [data[s][1] for s in test_symbols]
print "test/train %", 1.0 * len(X_test)/len(X_train)

num train symbols: 6 num test symbols: 2
test/train % 0.333333333333


In [176]:
for item in X_train:
    for l in item:
        assert l.shape == (4, 15)

In [163]:
def norm(a):
    return a / a.sum()

def flatten(l):
    return [item.astype(np.float32) for sublist in l for item in sublist]

def flatten_and_norm(l):
    return [norm(item.astype(np.float32)) for sublist in l for item in sublist]

In [175]:
Xtr = flatten_and_norm(X_train)
ytr = flatten(y_train)
Xte = flatten_and_norm(X_test)
yte = flatten(y_test)
assert(Xte[0].sum() - 1.0) < 0.0000000005
assert np.max(ytr) < 1
assert np.min(ytr) > -1
print "Max min stock price change:", np.max(ytr), np.min(ytr)

Max min stock price change: 0.28097 -0.160023


In [36]:
import numpy
with open("train_test.npz", "wb") as f:
    numpy.savez(f, X_train=Xtr, y_train=ytr, X_test=Xte, y_test=yte)