# Neural Net vs. Goldman
Can an LSTM neural net trained on fundamentals extracted from EDGAR XBRL for the S&P 500 pick the same long and short list as Goldman's hedge fund meta list? (Apologies to my high school English teacher for the run-on sentence.)

## Data Sources
Download price and fundamental data for the S&P 500 using [pystock-crawler](https://github.com/eliangcs/pystock-crawler):  

`pystock-crawler reports ../tickers.csv -o ../reports.csv -l ../reports.log`  
`pystock-crawler prices ../tickers.csv -o ../prices.csv -l ../prices.log`

REMIND: Use pystock-crawler symbols to get symbols for training input...

X = 4 quarters of fundamental data and whether the stock was up or down from the prior quarter.

y = whether the stock was up or down x days after the period that X covers.

In [1]:
import pandas
# Load daily prices and company reports. Column 1 of each CSV becomes the index
# (presumably the 'date' column, which is parsed as datetimes -- confirm against the CSVs).
prices = pandas.read_csv('prices.csv', parse_dates=['date'], index_col=1)
reports = pandas.read_csv('reports.csv', parse_dates=['date'], index_col=1)
# Keep only the rows whose 'amend' flag is False.
reports = reports[reports.amend == False]
# Sort the ticker list by symbol; one sort is enough.
# NOTE: DataFrame.sort(column) is the legacy pandas API this notebook targets;
# on pandas >= 0.17 the equivalent call is sort_values('symbol').
symbols = pandas.read_csv('symbols.csv').sort('symbol')
print("Loaded %d symbols and %d reports" % (len(symbols), len(reports)))

Loaded 502 symbols and 10851 reports


In [2]:
# Debugging to verify start and end values
# symbol = 'A'
# r = reports[reports.symbol == symbol].sort(ascending=True)
# p = prices[prices.symbol == symbol].sort(ascending=True)
# print "Found", len(r), "reports for", symbol

# # Add adjusted closing stock price start days after the report and end days after the report
# r['start'] = r.index.map(lambda x: p.ix[p.index[p.index.searchsorted(x, side='right')]]['adj_close'])
# r['end'] = r.index.map(lambda x: p.ix[p.index[p.index.searchsorted(x, side='right') + 2]]['adj_close'])
# r[['start', 'end']]

In [3]:
def features(symbol, reports, prices, window_size=4, overlap=1, days_after=10):
    """
    Build sliding-window training samples for a single stock.

    Parameters
    ----------
    symbol : ticker to select, matched against the ``symbol`` column of both frames.
    reports : DataFrame of fundamentals indexed by report date. Must contain the
        ``symbol`` and ``period_focus`` columns plus the fundamental columns
        adjusted below. The first 5 columns are dropped as non-numeric
        (assumes the reports.csv layout -- TODO confirm).
    prices : DataFrame indexed by date with ``symbol`` and ``adj_close`` columns.
    window_size : number of consecutive reports in each window.
    overlap : step, in reports, between the starts of consecutive windows
        (1 means each window shares all but one report with the next).
    days_after : currently unused. The end price is always taken 2 price rows
        after the start price. NOTE(review): wire this in once the intended
        horizon is confirmed; doing so changes the default labels.

    Returns
    -------
    (X, y) where X is a list of 2d arrays (one per window, rows = reports,
    columns = numeric fundamentals followed by the start price) and y is a list
    with one value per window: the fractional change ``end / start - 1`` of the
    adjusted close for the last report in the window (positive = price rose).

    Raises
    ------
    IndexError if a report has no price row after it (or none 2 rows later).
    """

    # Get the data for just this symbol, oldest first
    r = reports[reports.symbol == symbol].sort_index()
    p = prices[prices.symbol == symbol].sort_index()
    print("Found %d reports for %s" % (len(r), symbol))

    # Position of the first price row strictly after each report date
    first_after = p.index.searchsorted(r.index, side='right')
    closes = p['adj_close'].values

    # Add adjusted closing price right after the report and 2 price rows later
    r['start'] = closes[first_after]
    r['end'] = closes[first_after + 2]

    # Fixup annual 10-k numbers by subtracting the prior 3 quarters
    # REMIND: Go back and verify the adjustments
    is_fy = r['period_focus'] == 'FY'
    for c in ['revenues', 'op_income', 'net_income',
              'eps_basic', 'eps_diluted', 'dividend',
              'cash_flow_op', 'cash_flow_inv', 'cash_flow_fin']:
        # Computed from the unmodified column, then written back to FY rows only.
        # FY rows with fewer than 3 earlier reports become NaN (-1 after fillna).
        fourth_quarter = r[c] - r[c].shift(1) - r[c].shift(2) - r[c].shift(3)
        r.loc[is_fy, c] = fourth_quarter[is_fy]

    # Delete the leading non-numeric columns and change any nan to -1
    r = r.iloc[:, 5:].fillna(-1)

    # Divide into overlapping windows; empty when there are fewer than
    # window_size reports
    X = [r.iloc[i:i + window_size]
         for i in range(0, len(r) - window_size + 1, overlap)]

    # Fractional rise in stock price after the last report in the window
    y = [x['end'].iloc[-1] / x['start'].iloc[-1] - 1 for x in X]

    # Return X with the end price redacted so we don't train on the answer!
    return [x.values[:, :-1] for x in X], y

In [4]:
# Build {symbol: (X, y)} for every stock with at least 4 reports, i.e. enough to
# fill one window at the default window_size of features().
# Reports are counted per symbol once up front rather than re-filtering the whole
# frame for every symbol.
# (For a quick subset run, iterate symbols['symbol'][0:500:20] instead.)
report_counts = reports.symbol.value_counts()
data = {s: features(s, reports, prices)
        for s in symbols['symbol'] if report_counts.get(s, 0) >= 4}
print("Generated features vectors for %d stocks" % len(data))

Found 22 reports for A
Found 25 reports for AA
Found 24 reports for AAPL
Found 9 reports for ABBV
Found 24 reports for ABC
Found 24 reports for ABT
Found 24 reports for ACE
Found 22 reports for ACN
Found 4 reports for ACT
Found 25 reports for ADBE
Found 24 reports for ADI
Found 20 reports for ADM
Found 23 reports for ADP
Found 20 reports for ADS
Found 25 reports for ADSK
Found 10 reports for ADT
Found 24 reports for AEE
Found 24 reports for AEP
Found 24 reports for AES
Found 24 reports for AET
Found 24 reports for AFL
Found 24 reports for AIG
Found 20 reports for AIV
Found 25 reports for AIZ
Found 23 reports for AKAM
Found 24 reports for ALL
Found 5 reports for ALLE
Found 24 reports for ALTR
Found 20 reports for ALXN
Found 25 reports for AMAT
Found 24 reports for AME
Found 20 reports for AMG
Found 24 reports for AMGN
Found 24 reports for AMP
Found 24 reports for AMT
Found 24 reports for AMZN
Found 20 reports for AN
Found 22 reports for AON
Found 24 reports for APA
Found 24 reports for 

In [5]:
import numpy as np
# Sanity check on ticker 'A': row 1 of its first window must equal row 0 of its
# second window, i.e. consecutive windows are shifted by exactly one report.
windows_a = data['A'][0]
assert np.array_equal(windows_a[0][1], windows_a[1][0])

In [6]:
# Split into train/test sets by symbol, so all windows of one stock land on the same side
# Should use sklearn.cross_validation.StratifiedShuffleSplit to try and maintain industry sector % in each
# Or bin by financial size http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/
# s = XY['symbol'].unique()
try:
    # Current scikit-learn location of train_test_split
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only provide the cross_validation module
    from sklearn.cross_validation import train_test_split
# list(...) because dict.keys() is a non-indexable view on Python 3.
# NOTE: no random_state is passed, so the split differs from run to run.
train_symbols, test_symbols = train_test_split(list(data.keys()), test_size=0.2)
print("num train symbols: %d num test symbols: %d" % (len(train_symbols), len(test_symbols)))
# data[s] is the (X, y) pair for symbol s: element 0 holds the windows, element 1 the labels
X_train = [data[s][0] for s in train_symbols]
y_train = [data[s][1] for s in train_symbols]
X_test = [data[s][0] for s in test_symbols]
y_test = [data[s][1] for s in test_symbols]
# Ratio of test symbols to train symbols (a fraction, despite the '%' label)
print("test/train %% %s" % (1.0 * len(X_test) / len(X_train)))

num train symbols: 378 num test symbols: 95
test/train % 0.251322751323


In [7]:
def flatten(l):
    """
    Collapse one level of nesting and cast every element to float32.

    ``l`` is an iterable of iterables; each inner element must provide
    ``astype`` (numpy arrays and numpy scalars do). Returns a flat list of the
    converted elements in their original order.
    """
    flat = []
    for group in l:
        for element in group:
            flat.append(element.astype(np.float32))
    return flat

In [9]:
import numpy
# Write the four flattened float32 sample lists into a single .npz archive,
# each stored under its own name.
with open("train_test.npz", "wb") as f:
    arrays = {
        "X_train": flatten(X_train),
        "y_train": flatten(y_train),
        "X_test": flatten(X_test),
        "y_test": flatten(y_test),
    }
    numpy.savez(f, **arrays)