### Part 8
Here we continue with the python for finance series and visualize the big dataframe we just created.

In [1]:
%matplotlib qt
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import pandas as pd

style.use('ggplot')

In [2]:
df = pd.read_csv('data/sp500_joined_closes.csv', index_col=0)

In [3]:
def visualize_data(df):
    df_corr = df.corr()
#     print(df_corr.head())
    
    data = df_corr.values
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    
    heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
    fig.colorbar(heatmap)
    ax.set_xticks(np.arange(data.shape[0]) + 0.5, minor=False)
    ax.set_yticks(np.arange(data.shape[1]) + 0.5, minor=False)
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    
    column_labels = df_corr.columns
    row_labels = df_corr.index
    
    ax.set_xticklabels(column_labels)
    ax.set_yticklabels(row_labels)
    plt.xticks(rotation=90)
    heatmap.set_clim(-1, 1)
    plt.tight_layout()
    
#     df['AAPL'].plot()

In [4]:
# Careful! This will produce a BIG graph.
visualize_data(df)

### Part 9
Here we start preparing the data for ML

In [5]:
import pickle

In [6]:
def process_data_for_labels(ticker, df):
    hm_days = 7 # How many days in the future are we looking
    tickers = df.columns.values
    df.fillna(0, inplace=True)
    
    for i in range(1, hm_days+1):
        df['{}_{}d'.format(ticker, i)] =\
        (df[ticker].shift(-i) - df[ticker]) / df[ticker]
        
    df.fillna(0, inplace=True)
    return tickers, df

In [7]:
process_data_for_labels('XOM', df)

(array(['MMM', 'ABT', 'ABBV', 'ACN', 'ATVI', 'AYI', 'ADBE', 'AAP', 'AES',
        'AET', 'AMG', 'AFL', 'A', 'APD', 'AKAM', 'ALK', 'ALB', 'AGN', 'LNT',
        'ALXN', 'ALLE', 'ADS', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AEE',
        'AAL', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'ABC', 'AME',
        'AMGN', 'APH', 'APC', 'ADI', 'ANTM', 'AON', 'APA', 'AIV', 'AAPL',
        'AMAT', 'ADM', 'ARNC', 'AJG', 'AIZ', 'T', 'ADSK', 'ADP', 'AN',
        'AZO', 'AVB', 'AVY', 'BHI', 'BLL', 'BAC', 'BK', 'BCR', 'BAX', 'BBT',
        'BDX', 'BBBY', 'BRK-B', 'BBY', 'BIIB', 'BLK', 'HRB', 'BA', 'BWA',
        'BXP', 'BSX', 'BMY', 'AVGO', 'BF-B', 'CHRW', 'CA', 'COG', 'CPB',
        'COF', 'CAH', 'HSIC', 'KMX', 'CCL', 'CAT', 'CBG', 'CBS', 'CELG',
        'CNC', 'CNP', 'CTL', 'CERN', 'CF', 'SCHW', 'CHTR', 'CHK', 'CVX',
        'CMG', 'CB', 'CHD', 'CI', 'XEC', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG',
        'CTXS', 'CLX', 'CME', 'CMS', 'COH', 'KO', 'CTSH', 'CL', 'CMCSA',
        'CMA', 'CAG', 'CXO', 'COP', '

### Part 10
Next we start creating the labels for future supervised learning by creating a helper function.

In [23]:
def buy_sell_hold(*args):
    cols = [c for c in args]
    requirement = 0.028 # This is the threshold for buying/selling
    for col in cols:
        if col > requirement:
            return 1
        elif col < -requirement:
            return -1
        else:
            return 0    

### Part 11
Now we use our helper function to mapaour data to buy/sell/hold accordingly.

In [20]:
from collections import Counter
def extract_featuresets(ticker, df):
    tickers, df = process_data_for_labels(ticker, df)
    
    hm_days = 7 # How many days in the future are we looking
#     for i in range(1, hm_days+1):
#         df['{}_{}d'.format(ticker, i)] =\
#         (df[ticker].shift(-i) - df[ticker]) / df[ticker]
    df['{}_target'.format(ticker)] = list(
        map(buy_sell_hold, *[df['{}_{}d'.format(ticker, i)] for i in range(1, hm_days+1)]))
    
    vals = df['{}_target'.format(ticker)].values.tolist()
    str_vals = [str(i) for i in vals]
    print('Data spread: {}'.format(Counter(str_vals)))
    
    df.fillna(0, inplace=True)
    df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)
    
    df_vals = df[[ticker for ticker in tickers]].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)
    
    X, y = df_vals.values, df['{}_target'.format(ticker)].values
    
    return X, y, df

# extract_featuresets('XOM', df) # Just to check if it works

### Part 12
Here we use our created features to train a classifier with scikit-learn.

In [21]:
from sklearn import svm, neighbors, cross_validation
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [24]:
def do_ml(ticker, df):
    X, y, df = extract_featuresets(ticker, df)
    
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)
    
#     clf = neighbors.KNeighborsClassifier()
#     print(X_train.shape, y_train.shape); exit()
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                           ('rfor', RandomForestClassifier())], n_jobs=-1)
    
    clf.fit(X_train, y_train)
    
    confidence = clf.score(X_test, y_test)
    print("Accuracy:", confidence)
    predictions = clf.predict(X_test)
    print("Predicted spread: {}".format(Counter(predictions)))
    
    return confidence

do_ml('BAC', df)

Data spread: Counter({'0': 3578, '1': 365, '-1': 334})
Accuracy: 0.831775700935
Predicted spread: Counter({0: 1063, -1: 5, 1: 2})


0.83177570093457942