# Neural Net vs. Goldman
Can an LSTM neural net trained on fundamentals extracted from EDGAR XBRL filings pick the same long and short lists as Goldman's hedge fund meta list?

## Data Sources
Download price and fundamental data using [pystock-crawler](https://github.com/eliangcs/pystock-crawler):  

```
pystock-crawler symbols NYSE,NASDAQ,AMEX --sort -w data -o data/symbols.csv -l data/symbols.log
pystock-crawler reports data/symbols.csv -w data -o data/reports.csv -l data/reports.log
pystock-crawler prices data/symbols.csv -w data -o data/prices.csv -l data/prices.log
```

In [174]:
import numpy as np
import pandas
# Load both CSVs, index them by column 1 (features() treats that index as
# dates), sort chronologically, then group by ticker symbol.  Sorting happens
# *before* grouping so that every per-symbol group comes out in date order,
# which features() relies on when it calls searchsorted() on the price index.
# sort_index() is used instead of DataFrame.sort(): with no column argument
# the two do the same thing, but sort() no longer exists in current pandas.
reports = (pandas.read_csv('data/reports.csv', parse_dates=['end_date'], index_col=1)
           .sort_index()
           .groupby('symbol'))
prices = (pandas.read_csv('data/prices.csv', parse_dates=['date'], index_col=1)
          .sort_index()
          .groupby('symbol'))
# len() of a groupby object is its number of groups, i.e. distinct symbols.
print("Found {} symbols in reports and {} symbols in prices".format(len(reports), len(prices)))

 Found 4514 symbols in reports and 6103 symbols in prices


In [266]:
import sys

def norm(a):
    """
    Return `a` as float32, divided element-wise by the sum of all its entries.

    The sum is taken over the array as passed in (before the float32 cast).
    No guard is applied for a zero sum.
    """
    total = a.sum()
    as_float = a.astype(np.float32)
    return as_float / total

def features(symbol, reports, prices, window_size=4, overlap=1, start_days=1, end_days=5, pct_threshold=0.10):
    """
    Build training windows and price-change targets for one stock.

    The symbol's reports are cut into sliding windows of `window_size`
    consecutive rows.  Each window yields one X sample (passed through
    norm()) and one y value: the fractional change in adjusted close between
    two price rows that follow the date of the window's last report.

    Parameters
    ----------
    symbol : key handed to get_group() on both `reports` and `prices`.
    reports : grouped report table; each group is indexed by report date, in
        ascending order, and has a 'period_focus' column plus the financial
        columns listed in the body.
    prices : grouped price table; each group is indexed by date, in ascending
        order, and has an 'adj_close' column.
    window_size : number of consecutive reports per window.
    overlap : step between the first rows of successive windows (despite the
        name it is used as a stride; 1 gives maximally overlapping windows).
    start_days, end_days : row offsets added to the position of the first
        price row dated strictly after a report.  With the defaults the
        change is measured from the 2nd to the 6th such price row.
    pct_threshold : not used by this function; kept so existing calls work.

    Returns
    -------
    (X, y) : X is a list of 2-D arrays, one per window, holding every retained
        column except the final 'end' price (the 'start' price stays in);
        y is a list of floats, end / start - 1 for each window's last report.

    Raises
    ------
    KeyError : get_group() does not know `symbol`.
    ValueError : fewer than `window_size` reports, or fewer than
        window_size + start_days + end_days price rows.
    IndexError : a start/end lookup runs past the last available price row.
    """
    r = reports.get_group(symbol).copy()  # copied: columns are added/overwritten below
    p = prices.get_group(symbol)          # read-only

    # One-line progress indicator; '\r' makes each symbol overwrite the last.
    sys.stdout.write("\r{}:{} ".format(symbol, len(r)))
    sys.stdout.flush()

    if len(r) < window_size:
        raise ValueError("{} only has {} reports".format(symbol, len(r)))

    if len(p) < window_size + start_days + end_days:
        raise ValueError("{} only has {} prices".format(symbol, len(p)))

    # side='right' gives, for every report date, the position of the first
    # price row dated strictly after it.  The offsets are then applied
    # positionally; running off the end of the price history raises IndexError.
    first_after = p.index.searchsorted(r.index, side='right')
    closes = p['adj_close'].values
    r['start'] = closes[first_after + start_days]
    r['end'] = closes[first_after + end_days]

    # Fix up annual (10-K) figures by subtracting the prior 3 rows (quarters)
    # REMIND: Go back and verify the adjustments
    is_fy = r['period_focus'] == 'FY'
    for c in ['revenues', 'op_income', 'net_income',
              'eps_basic', 'eps_diluted', 'dividend',
              'cash_flow_op', 'cash_flow_inv', 'cash_flow_fin']:
        quarter_only = r[c] - r[c].shift(1) - r[c].shift(2) - r[c].shift(3)
        # Keep the reported value on non-FY rows, use the adjusted one on FY
        # rows.  where() also upcasts integer columns when NaN is introduced.
        r[c] = r[c].where(~is_fy, quarter_only)

    # Drop the first five columns (the non-numeric ones) and change any nan
    # to -1.  'start' and 'end' were appended above, so they are the last two
    # columns that remain.
    r = r.iloc[:, 5:].fillna(-1)

    # Divide into overlapping windows (explicit positional slicing)
    num_windows = len(r) - window_size + 1
    X = [r.iloc[i:i + window_size] for i in range(0, num_windows, overlap)]

    # +/- fractional change in price after the last report in each window.
    # .iloc[-1] because the frame is date-indexed, so [-1] is not a label.
    y = [(x['end'].iloc[-1] / x['start'].iloc[-1]) - 1 for x in X]

    # Return X with the end price redacted so we don't train on the answer!
    return [norm(x.values[:, 0:-1]) for x in X], y




In [271]:
def compute_all_features(reports, prices, limit=100):
    """
    Lazily yield (symbol, features(symbol, reports, prices)) pairs.

    Parameters
    ----------
    reports, prices : grouped tables forwarded unchanged to features().
    limit : maximum number of symbols taken from `reports.groups`
        (default 100, as before); pass None to process every symbol.

    Symbols for which features() raises are reported on stdout and skipped,
    so the generator always runs to completion.
    """
    # list() so the keys can be sliced on Python 3 as well as Python 2.
    symbols = list(reports.groups)
    if limit is not None:
        symbols = symbols[:limit]

    for s in symbols:
        try:
            yield s, features(s, reports, prices)
        except ValueError as e:
            print("Problems with {} {}".format(s, e))
        except Exception as e:
            # Deliberately not a bare `except:`.  Because the `yield` sits
            # inside this try block, a bare clause would also swallow
            # GeneratorExit (breaking generator shutdown) and
            # KeyboardInterrupt.  The exception type is printed so failures
            # such as a symbol missing from `prices` can be told apart.
            print("Unknown problem with {} ({}: {})".format(s, type(e).__name__, e))
            
# Materialise the generator: symbol -> (X windows, y changes) for every
# symbol that features() could process.
data = dict(compute_all_features(reports, prices))

ISDR:25 Unknown problem with AGL
GTIM:19 Unknown problem with NYB
NRCIB:2 Problems with NRCIB NRCIB only has 2 reports
Unknown problem with SPX
BWA:25 Unknown problem with BWC
SPN:21 Unknown problem with BSDM
MEIL:1 Problems with MEIL MEIL only has 1 reports
EARN:10 Unknown problem with GY
MXPT:2 Problems with MXPT MXPT only has 2 reports
Unknown problem with HANS
GI:1 Problems with GI GI only has 1 reports
ACLS:17 Unknown problem with AMCH
JBSS:16 Unknown problem with VIGS
BEL:3 Problems with BEL BEL only has 3 reports
IRG:13 Unknown problem with NTKS
GPIAU:2 Problems with GPIAU GPIAU only has 2 reports
VLY:22 Unknown problem with ICMC


In [285]:
import multiprocessing as mp

def call_features(data):
    """
    Single-argument adapter around features().

    `data` is an indexable whose items 0, 1 and 2 are passed to features()
    as its first three positional arguments (symbol, reports, prices).
    """
    symbol = data[0]
    report_groups = data[1]
    price_groups = data[2]
    return features(symbol, report_groups, price_groups)

# NOTE(review): `cube` was not defined anywhere in the visible notebook, so
# this cell raised NameError on a fresh kernel.  It is restored here as the
# usual x**3 toy workload, assumed from its name -- confirm.  It must be
# defined before the pool is created so the worker processes can resolve it.
def cube(x):
    """Toy CPU-bound task used only to exercise the worker pool."""
    return x ** 3

pool = mp.Pool(processes=4)
try:
    results = pool.map(cube, range(1, 10000000))
finally:
    # Always release the worker processes, even if the map fails.
    pool.close()
    pool.join()
print("Done")

Done


In [259]:
# Sanity-check every window produced by features().
# An earlier version of this cell asserted that the 2nd time slice of one
# window equals the 1st time slice of the next.  That no longer holds because
# each window is normalized on its own, so shared rows are scaled differently.
for k, v in data.items():
    for window in v[0]:
        # 4 rows per window (features()'s default window_size) x 15 columns
        assert window.shape == (4, 15), \
            "{}: unexpected window shape {}".format(k, window.shape)
        # norm() should make each window sum to 1.  abs() is required here:
        # without it any sum *below* 1 would pass the check unnoticed.
        assert abs(window.sum() - 1.0) < 0.001, \
            "{}: window sums to {}".format(k, window.sum())

In [258]:
# Split into train/test sets
# Should use sklearn.cross_validation.StratifiedShuffleSplit to try and maintain industry sector % in each
# Or bin by financial size http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/
# s = XY['symbol'].unique()
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the function here.
    from sklearn.cross_validation import train_test_split

# Fixed seed and sorted symbol list so the split is the same on every run.
SPLIT_SEED = 42
train_symbols, test_symbols = train_test_split(sorted(data), test_size=0.20,
                                               random_state=SPLIT_SEED)
print("num train symbols: {} num test symbols: {}".format(len(train_symbols), len(test_symbols)))
# Splitting is done per symbol, so all windows of one stock land on one side.
X_train = [data[s][0] for s in train_symbols]
y_train = [data[s][1] for s in train_symbols]
X_test = [data[s][0] for s in test_symbols]
y_test = [data[s][1] for s in test_symbols]
print("test/train % {}".format(1.0 * len(X_test) / len(X_train)))
# y_train is a list of per-symbol lists of differing lengths, so take the
# min/max over the flattened values; np.min() on the ragged nesting does not
# give the global extreme.
train_changes = [change for changes in y_train for change in changes]
print("stock change min/max {}/{}".format(min(train_changes), max(train_changes)))

num train symbols: 66 num test symbols: 17
test/train % 0.257575757576
stock change min/max -0.138187264082/0.133333285848


In [223]:
def flatten(l):
    """Concatenate the items of every iterable in `l` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

import numpy
# Persist the flattened splits: one entry per window rather than per symbol.
with open("train_test.npz", "wb") as npz_file:
    splits = {
        "X_train": flatten(X_train),
        "y_train": flatten(y_train),
        "X_test": flatten(X_test),
        "y_test": flatten(y_test),
    }
    numpy.savez(npz_file, **splits)