In [1]:
#Import Required Packages
import json
import os
import logging as log
import requests
#import yfinance as yf
import time as time

import numpy as np
import pandas as pd
#from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns

from collections import deque

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.svm import LinearSVR

import keras
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense

import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

Using TensorFlow backend.


In [2]:
#Define File Paths as Constants
#TWITS_PATH = r"../stocktwits_prediction/Data/twits"
#FINANCE_PATH = r"../stocktwits_prediction/Data/csv"
TWITS_PATH = "../Data/twits"
FINANCE_PATH = "../Data/csv"

#Define Company Names and List of Tags as Constants
#COMPANY_NAMES is added to the TF-IDF stop words in feature_selector, so every
#entry must be spelled the way it appears in the (lower-cased) tweets
COMPANY_NAMES = ["apple", "boeing", "caterpillar", "cisco", "chevron", "dupont", "de", "nemours", "walt",
                 "disney", "facebook", "google", "vaneck", "goldman", "sachs", "ibm", "intel", "johnson",
                 "jpmorgan", "coca", "cola", "coca-cola", "mcdonalds", "3m", "merck", "microsoft", "nike",
                 "pfizer", "unitedhealth", "raytheon", "visa", "verizon", "walmart",
                 "exxon", "mobil"]

#TAGS are joined into a regular expression by stock_twits_text_parser; raw strings
#keep the escaped dollar sign without relying on an invalid '\$' escape sequence
TAGS = [r'\$FB', r'\$GOOG', r'\$AAPL', r'\$WB', r'\$AXP', r'\$BA', r'\$CAT', r'\$CSCO', r'\$CVX', r'\$DD', r'\$DIS',
        r'\$GE', r'\$GS', r'\$HD', r'\$IBM', r'\$INTC', r'\$JNJ', r'\$JPM', r'\$KO', r'\$MCD', r'\$MMM', r'\$MRK', r'\$MSFT',
        r'\$NKE', r'\$PFE', r'\$PG', r'\$QQQ', r'\$SPY', r'\$TRV', r'\$UNH', r'\$UTX', r'\$V', r'\$VZ', r'\$WMT', r'\$XOM']

LABELS = ["Positive Return", "Negative Return"]

#Return All Files in Directory
all_twitter = os.listdir(TWITS_PATH)
all_finance = os.listdir(FINANCE_PATH)

#Generate List of Tags ('FB.txt' -> '$FB'); a comprehension avoids leaking loop
#variables into the notebook namespace
all_tags = ['$' + os.path.splitext(twits_file)[0] for twits_file in all_twitter]

# stock_twits_metrics
###  - Inputs:
        - raw_data: DataFrame (can use the DataFrame returned from the text parser)
        - n (int): number of days (used for s_t calculation)
        - file_name (str): name of file - ex. 'FB.txt'
###  - Returns Dataframe with Following Value Types:
        - Date: DateTime object representing a specific date in the sample period
        - bearish: total count of tweets tagged with bearish sentiment on that day
        - bullish: total count of tweets tagged with bullish sentiment on that day
        - none: total count of tweets tagged with neither bearish nor bullish sentiment on that day
        - message_volume: total number of messages on that day 
        - mv1_t:  percentage change in message volume with one day period
        - mv10_t: message volume on that day divided by its ten-day rolling mean
        - polarity: (bullish - bearish)/total messages on that day
        - s_t: moving average of polarity over n days
        - company: string containing the abbreviation of the company name
###  - Additional Notes:
        - function aggregates the data into values by date - the text of all tweets from the same date is concatenated into a single `body` column

In [3]:
def stock_twits_metrics(raw_data, n, file_name):
    """Aggregate parsed StockTwits messages into one row of metrics per date.

    Parameters
    ----------
    raw_data : DataFrame
        Messages with at least the columns ``body`` (message text),
        ``sentiment`` (a dict with a ``class`` entry, or a missing value for
        untagged messages) and ``created_at`` (timestamp). Not modified.
    n : int
        Window length, in rows of the per-date table, of the polarity moving
        average ``s_t``.
    file_name : str
        Source file name, e.g. ``'FB.txt'``; its stem fills the ``company`` column.

    Returns
    -------
    DataFrame
        Columns ``Date``, one count column per sentiment class (always at least
        ``bearish``, ``bullish`` and ``none``), ``message_volume``, ``mv1_t``,
        ``mv10_t``, ``polarity``, ``s_t``, ``company`` and ``body``. Rows that
        contain any missing value (the warm-up rows of the rolling windows)
        are dropped.
    """
    #Unwrap Sentiment Class; untagged messages are counted under 'none'
    def _sentiment_class(sentiment):
        label = sentiment.get('class') if isinstance(sentiment, dict) else None
        return 'none' if pd.isna(label) else label

    sentiment_class = raw_data['sentiment'].apply(_sentiment_class)

    #Convert Date Strings to Date Objects (computed on the side so the caller's frame is untouched)
    created_on = pd.to_datetime(raw_data['created_at']).apply(lambda ts: ts.date())

    #Create New Dataframe - all three pieces share raw_data's index, so rows stay aligned
    daily = pd.DataFrame({'body': raw_data['body'],
                          'class': sentiment_class,
                          'created_at': created_on})
    by_date = daily.groupby('created_at')

    #Aggregate Text From Tweets By Date; a separator keeps the last word of one
    #tweet from fusing with the first word of the next
    agg_text = by_date['body'].apply(lambda bodies: ' '.join(map(str, bodies)))

    #Count Messages per Date and per (Date, Sentiment Class)
    message_volume = by_date.size()
    counts = daily.groupby(['created_at', 'class']).size().unstack(fill_value=0)

    #Guarantee the three standard classes exist even if a file has no tweets of one kind
    wanted = sorted(set(counts.columns) | {'bearish', 'bullish', 'none'})
    counts = counts.reindex(columns=wanted, fill_value=0)

    #Merge Table with Message Volume
    table = counts.reset_index()
    table['message_volume'] = message_volume.reindex(counts.index).to_numpy()

    #Calculate Metrics and Append to the DataFrame
    volume = table['message_volume']
    table['mv1_t'] = volume.diff(periods=1).div(volume.shift(1))
    table['mv10_t'] = volume.div(volume.rolling(10).mean())

    table['polarity'] = (table['bullish'] - table['bearish']).div(volume)
    table['s_t'] = table['polarity'].rolling(n).mean()

    #Append Company Name to DataFrame
    table['company'] = os.path.splitext(file_name)[0]

    #Append Text to DataFrame
    result = pd.merge(table, agg_text.reset_index(), on='created_at')

    #Change Column Name to Date
    result = result.rename(columns={"created_at": "Date"})

    return result.dropna(how='any')

# stock_twits_text_parser
###  - Inputs:
        - file (str): name of file - ex. 'FB.txt'
###  - Returns Dataframe with Following Value Types:
        - df: DataFrame with all rows containing multiple tagged companies dropped, all columns included - index reset to reflect new size
###  - Additional Notes:
        - None

In [4]:
def stock_twits_text_parser(file):
    """Load one StockTwits JSON file and keep messages tagging at most one tracked ticker.

    Parameters
    ----------
    file : str
        File name inside ``TWITS_PATH``, e.g. ``'FB.txt'``.

    Returns
    -------
    DataFrame
        The loaded messages plus an ``instances`` column (the distinct tracked
        tags found in ``body``, upper-cased), restricted to rows with fewer than
        two distinct tags. The index is reset; the previous index is kept as an
        ``index`` column.
    """
    #Read File
    file_path = TWITS_PATH + '/' + file
    raw_data = pd.read_json(file_path)

    #Build Tag Pattern - the trailing \b stops a short tag such as $V from matching
    #the start of a longer ticker ($VZ, $VIX), which would hide multi-tag messages
    tag_pattern = '(?i)(?:' + '|'.join(TAGS) + r')\b'

    #Create New Column with List of Unique Tags (messages without text get an empty list)
    matches = raw_data['body'].str.findall(tag_pattern)
    raw_data["instances"] = matches.apply(
        lambda found: sorted({tag.upper() for tag in found}) if isinstance(found, list) else []
    )

    #Drop All Rows With Multiple Tags and Reset Index
    df = raw_data[raw_data['instances'].map(len) < 2]
    df = df.reset_index()

    return df

# feature_selector
###  - Inputs:
        - metrics (DataFrame): DataFrame containing the body text of the tweet eg. data['body'] along with other metrics
        - labels (DataFrame) DataFrame containing the target labels
        - target_label (str): name of the column containing the target label we want to use
        - min_count (int): minimum number of documents (aggregated daily texts) a word must occur in to be kept as a feature
        - n (int): number of text features selected
        - encoding (str): default or optimized (see convert_to_binary_classification)
###  - Returns Dataframe with Following Value Types:
        - df: DataFrame with top n text features and all numerical features, raw data and labels (20 + n columns total)
###  - Additional Notes:
        - text features will always be appended after the numerical metrics, raw data and labels - currently these are the first 20 columns: use df.iloc[:,20:] to extract only text features for analysis
        - text features selected using chi squared statistics
        - ***TODO: Currently the method selects features based on categorical outcomes (classification problem) this will need to be changed to allow for regression (explore a way to bin continuous target data) ***

In [5]:
def feature_selector(metrics, labels, target_label, min_count, n, encoding):
    """Join metrics with labels and append the n best TF-IDF text features.

    Parameters
    ----------
    metrics : DataFrame
        Per-date metrics including a ``Date`` column and the ``body`` text.
    labels : DataFrame
        Per-date targets including a ``Date`` column and ``target_label``.
    target_label : str
        Column of ``labels`` used as the target for chi-squared selection.
    min_count : int
        ``min_df`` passed to the vectorizer.
    n : int
        Number of text features to keep.
    encoding : str
        Passed to ``convert_to_binary_classification``.

    Returns
    -------
    DataFrame
        The merged frame followed by the selected text feature columns (labelled
        0..k-1), with rows containing missing values dropped. If fewer than
        ``n`` text features exist, all of them are kept and a warning is printed.
    """
    #Merge DataFrames
    result = pd.merge(metrics, labels, on='Date')

    #Remove Company Names By Adding Them To Stop Words
    #(newer scikit-learn releases only accept a list here, not a frozenset)
    new_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(COMPANY_NAMES))

    #Vectorizer Parameters: Convert To Lowercase, Remove Stop Words With Company Names, Min Document Count, Smoothed IDF
    v = TfidfVectorizer(analyzer='word', lowercase=True, stop_words=new_stop_words, min_df=min_count, smooth_idf=True)
    x = v.fit_transform(result['body'])

    #Convert back to DataFrame - get_feature_names() was removed in scikit-learn 1.2,
    #so prefer its replacement and fall back only on old installations
    if hasattr(v, 'get_feature_names_out'):
        feature_names = v.get_feature_names_out()
    else:
        feature_names = v.get_feature_names()
    text_data = pd.DataFrame(x.toarray(), columns=feature_names)

    #Select n Best Features
    X = text_data
    if(X.shape[1] >= n):
        Y = convert_to_binary_classification(result[target_label], encoding)
        chi2_selector = SelectKBest(chi2, k=n)
        X_best = chi2_selector.fit_transform(X, Y)
    else:
        X_best = X
        print("Warning: found max of " + str(X.shape[1]) + " text features - all features used")

    data = pd.DataFrame(X_best)
    df = pd.concat([result, data], axis=1)

    df = df.dropna()

    return df

# return_over_period_T
###  - Inputs:
        - file (str): name of file - ex. 'FB.csv'
        - T (int): period based on which return is calculated (in days)
###  - Returns Dataframe with Following Value Types:
        - Date: DateTime object representing a specific date in the sample period
        - OPEN, HIGH, LOW, VOLUME, CLOSE: prices each day with respect to the specific metric
        - xxx_return: return for referenced metric over period T, where T is specified when the function is called
###  - Additional Notes:
        - function includes original data taken from the CSV file
        - formula for return over period T is defined as: r(T) = (p(t+T) - p(t))/p(t), where t is the current day and p(t) is the price on the current day

In [6]:
def return_over_period_T(file, T):
    """Load a price CSV and add the forward return over T rows for each price column.

    Parameters
    ----------
    file : str
        File name inside ``FINANCE_PATH``, e.g. ``'FB.csv'``.
    T : int
        Number of rows ahead used for the return.

    Returns
    -------
    DataFrame
        The CSV contents with ``Date`` converted to date objects and the added
        columns ``open_return``, ``high_return``, ``low_return`` and
        ``close_return``, each equal to (p[t+T] - p[t]) / p[t]. Rows holding any
        missing value (including the last T rows) are dropped.
    """
    #Read File
    prices = pd.read_csv(FINANCE_PATH + '/' + file)

    #Convert Date Strings to Date Objects
    prices['Date'] = pd.to_datetime(prices['Date']).apply(lambda stamp: stamp.date())

    #Calculate Returns for OPEN, HIGH, LOW and CLOSE
    for column in ('OPEN', 'HIGH', 'LOW', 'CLOSE'):
        current = prices[column]
        #diff(-T) is p[t] - p[t+T]; negating it yields the forward change
        prices[column.lower() + '_return'] = -current.diff(periods=-T).div(current)

    return prices.dropna(how='any')

# return_over_period_T_API
###  - Inputs:
        - stock_name (str): name of stock (abbreviation) ex. Microsoft = MSFT
        - T (int): period based on which return is calculated (in days)
###  - Returns Dataframe with Following Value Types:
        - Date: DateTime object representing a specific date in the sample period
        - OPEN, HIGH, LOW, VOLUME, CLOSE: prices each day with respect to the specific metric
        - xxx_return: return for referenced metric over period T, where T is specified when the function is called
###  - Additional Notes:
        - function includes the original price data returned by the API
        - formula for return over period T is defined as: r(T) = (p(t+T) - p(t))/p(t), where t is the current day and p(t) is the price on the current day
        - public API is rate limited 

In [7]:
def return_over_period_T_API(stock_name, T):
    """Download price history through yfinance and add forward returns over T rows.

    Parameters
    ----------
    stock_name : str
        Ticker symbol, e.g. ``'MSFT'``.
    T : int
        Number of rows ahead used for the return.

    Returns
    -------
    DataFrame
        The downloaded history without the ``Dividends`` and ``Stock Splits``
        columns, with ``Date`` converted to date objects and the added columns
        ``open_return``, ``high_return``, ``low_return`` and ``close_return``,
        each equal to (p[t+T] - p[t]) / p[t]. Rows holding any missing value
        are dropped.

    Raises
    ------
    ImportError
        If the yfinance package is not installed.
    """
    #The notebook-level yfinance import is commented out, so import it here;
    #otherwise the first line below fails with a NameError on `yf`
    import yfinance as yf

    #Public api (w/o auth token) - limited to 2000 requests/IP
    stock = yf.Ticker(stock_name)

    #Stops rate limiting from blocking IP
    time.sleep(1)

    #Get historical market data
    hist = stock.history(period="max")

    df = hist.reset_index()
    df = df.drop(['Dividends', 'Stock Splits'], axis=1)

    #Convert Date Strings to Date Objects
    df['Date'] = pd.to_datetime(df['Date']).apply(lambda x: x.date())

    #Calculate Returns for Open, High, Low and Close
    for column in ('Open', 'High', 'Low', 'Close'):
        #diff(-T) is p[t] - p[t+T]; negating it yields the forward change
        df[column.lower() + '_return'] = -df[column].diff(periods=-T).div(df[column])

    df = df.dropna(how='any')

    return df

# convert_to_binary_classification
###  - Inputs:
        - df (DataFrame): single column to be classified
        - encoding (str): tells function which encoding to use
###  - Returns Dataframe with Following Value Types:
        - Binary Labels: single column of binary labels representing positive and negative returns on the day
###  - Additional Notes:
        - Note that a negative return is defined as class 0 and a positive return or no return is defined as class 1
        - default (0,1)
        - optimized (-1,1)

In [8]:
def convert_to_binary_classification(df, encoding):
    """Map a column of returns to binary class labels.

    Parameters
    ----------
    df : Series
        Values to classify.
    encoding : str
        ``"default"`` labels negative values 0; ``"optimized"`` labels them -1.
        Every other value (zero, positive, NaN) is labelled 1 in both encodings.

    Returns
    -------
    Series
        Labels with the same index as ``df``.

    Raises
    ------
    ValueError
        If ``encoding`` is not one of the two recognised names. Previously this
        case printed a message and then failed with an UnboundLocalError.
    """
    negative_label_by_encoding = {"default": 0, "optimized": -1}
    if encoding not in negative_label_by_encoding:
        raise ValueError('Invalid Parameter Input - encoding')

    negative_label = negative_label_by_encoding[encoding]
    return df.apply(lambda value: negative_label if value < 0 else 1)

# split_data
###  - Inputs:
        - df (DataFrame): Processed DataFrame with features and labels to be split
        - train_percentage (float): Percentage of data to be used as training data - assumes all other data is used as test
        - features (str): toggles whether to use text only or text and numerical features (valid inputs: 'text', 'all')
        - label (str): selects target label based on column name
        - target_type (str): binary (bin) or continuous (cont)
        - encoding (str): default or optimized (see convert_to_binary_classification)
###  - Returns Dataframe with Following Value Types:
        - data_dict (Dictionary): dictionary with 4 values corresponding to X_train, X_test, Y_train, Y_test (keys)
###  - Additional Notes:
        - None

In [9]:
def split_data(df, train_percentage, features, label, target_type, encoding):
    """Split a processed frame into train/test features and targets by row position.

    Parameters
    ----------
    df : DataFrame
        Processed data; text features are read from column position 20 onward
        and, for ``features='all'``, numeric features from positions 5, 6 and 8.
    train_percentage : float
        Fraction of rows used for training; the first
        ``int(train_percentage * len(df)) + 1`` rows form the training set.
    features : str
        ``'text'`` or ``'all'``.
    label : str
        Name of the target column.
    target_type : str
        ``'bin'`` (target passed through ``convert_to_binary_classification``)
        or ``'cont'`` (target used as is).
    encoding : str
        Encoding forwarded to ``convert_to_binary_classification``.

    Returns
    -------
    dict
        Keys ``X_train``, ``X_test``, ``Y_train`` and ``Y_test``.

    Raises
    ------
    ValueError
        For an unrecognised ``features`` or ``target_type`` value.
    """
    #Determine Split Position
    cut = int(train_percentage * len(df.index)) + 1
    train_rows, test_rows = slice(0, cut), slice(cut, None)

    def _feature_block(rows):
        #Text columns alone, or the numeric columns followed by the text columns
        text_part = df.iloc[rows, 20:]
        if features == 'text':
            return text_part
        numeric_part = df.iloc[rows, np.r_[5:7, 8]]
        return pd.concat([numeric_part, text_part], axis=1)

    #Split Features
    if features not in ('text', 'all'):
        raise ValueError('Invalid Parameter Input - features')
    X_train = _feature_block(train_rows)
    X_test = _feature_block(test_rows)

    #Split Targets
    if target_type == 'bin':
        Y = convert_to_binary_classification(df[label], encoding)
    elif target_type == 'cont':
        Y = df[label]
    else:
        raise ValueError('Invalid Parameter Input - target_type')

    #Return as a Dictionary
    return {
        'X_train': X_train,
        'X_test': X_test,
        'Y_train': Y.iloc[train_rows],
        'Y_test': Y.iloc[test_rows],
    }

# create_metrics_table
###  - Inputs:
        - Y_train: training labels
        - pred_train: predicted labels from training data
        - Y_test: test labels
        - pred_test: predicted labels from test data
        - valid input types (series, numpy array, DF columns) - must be column vectors
###  - Returns Dataframe with Following Value Types:
        - table (DataFrame): table with REC, PREC, F1 and ACC for training and test predictions
###  - Additional Notes:
        - None

In [10]:
def create_metrics_table(Y_train, pred_train, Y_test, pred_test):
    """Tabulate recall, precision, F1 and accuracy for training and test predictions.

    Parameters
    ----------
    Y_train, Y_test
        True labels of the training and test sets.
    pred_train, pred_test
        Predicted labels for the training and test sets.

    Returns
    -------
    DataFrame
        Indexed by ``Dataset`` (``Training``, ``Test``) with the columns
        ``Recall``, ``Precision``, ``F1 Score`` and ``Accuracy``.
    """
    #Scorers listed in the same order as the table columns
    scorers = (recall_score, precision_score, f1_score, accuracy_score)

    #Calculate Metrics
    training_row = ["Training"] + [score(Y_train, pred_train) for score in scorers]
    test_row = ["Test"] + [score(Y_test, pred_test) for score in scorers]

    #Create Table
    table = pd.DataFrame([training_row, test_row],
                         columns=["Dataset", "Recall", "Precision", "F1 Score", "Accuracy"])
    return table.set_index("Dataset")

# plot_confusion_matrix
###  - Inputs:
        - matrix (confusion matrix): confusion matrix generated from sklearn confusion_matrix
        - data_set_name (str): name of dataset ex. 'training' or 'test' 
###  - Returns Dataframe with Following Value Types:
        - None - void type function
        - Displays plot
###  - Additional Notes:
        - Wrapper for seaborn plot code

In [11]:
def plot_confusion_matrix(matrix, data_set_name):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    matrix
        Confusion matrix to display.
    data_set_name : str
        Name inserted into the plot title, e.g. ``'training'`` or ``'test'``.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): ticks are labelled in LABELS order (positive first); confirm
    this matches the class order of the matrix that callers pass in.
    """
    heatmap_options = dict(annot=True, fmt= ".3f", xticklabels = LABELS, yticklabels = LABELS)
    sns.heatmap(matrix, **heatmap_options)

    title = "Confusion Matrix for " + data_set_name + " Set"
    plt.xlabel("Predicted Label")
    plt.ylabel("Actual Label")
    plt.title(title)

# plot_ROC_curve
###  - Inputs:
        - FPR: False Positive Rate - Generated from sklearn roc_curve
        - TPR: True Positive Rate - Generated from sklearn roc_curve
        - data_set_name (str): name of dataset ex. 'training' or 'test'
###  - Returns Dataframe with Following Value Types:
        - None - void type function
        - Displays plot
###  - Additional Notes:
        - Wrapper for matplotlib plot code

In [12]:
def plot_ROC_curve(FPR, TPR, data_set_name):
    """Plot an ROC curve on top of a dashed diagonal reference line and show it.

    Parameters
    ----------
    FPR
        False positive rates (x values).
    TPR
        True positive rates (y values).
    data_set_name : str
        Name inserted into the plot title, e.g. ``'training'`` or ``'test'``.

    Returns
    -------
    None
    """
    diagonal = [0, 1]
    title = "ROC Curve for " + data_set_name + " Data"

    #Reference Line First, Then the Curve
    plt.plot(diagonal, diagonal, 'k--')
    plt.plot(FPR, TPR)

    plt.xlabel("False Positive")
    plt.ylabel("True Positive")
    plt.title(title)
    plt.show()

# action_signal_mapping

## - Inputs:
    - Y_train: training data
    - Y_test: test data
    - pred_train: prediction on training data
    - pred_test: prediction on test data
    - threshold: threshold to map action signals to

## - Outputs
    - action_signal_train: training data mapped to action signals
    - action_signal_test: test data mapped to action signals
    - action_signal_predict_train: prediction on training data mapped to action signals
    - action_signal_predict_test: prediction on test data mapped to action signals

## - Additional Notes
    - For the action signals, a "positive" action signal is any value greater than the threshold, and is mapped to 1. A mapping of zero is if the value is less than the threshold, but greater than the negative of the threshold, and is a "no action" signal. A mapping of -1 is if the value is less than the negative of the threshold, and is a "negative" action signal.

In [13]:
def action_signal_mapping(Y_train, Y_test, pred_train, pred_test, threshold):
    """Map true and predicted values to action signals -1, 0 or 1.

    A value above ``threshold`` becomes 1, a value below ``-threshold`` becomes
    -1 and a value whose magnitude is at most ``threshold`` becomes 0.

    Parameters
    ----------
    Y_train, Y_test
        True values of the training and test sets.
    pred_train, pred_test
        Predicted values for the training and test sets.
    threshold : float
        Band around zero treated as "no action". The argument is now honoured;
        it used to be overwritten by a hard-coded 0.001.

    Returns
    -------
    tuple of ndarray
        ``(action_signal_train, action_signal_test,
        action_signal_predict_train, action_signal_predict_test)``. The inputs
        are copied, not modified.
    """
    def _to_action_signal(values):
        signal = np.copy(values)
        #Build every mask from the untouched values so an earlier assignment
        #(e.g. writing 1) can never be re-classified by a later comparison
        sell = signal < -1*threshold
        buy = signal > threshold
        hold = np.abs(signal) <= threshold
        signal[sell] = -1
        signal[buy] = 1
        signal[hold] = 0
        return signal

    action_signal_train = _to_action_signal(Y_train)
    action_signal_test = _to_action_signal(Y_test)
    action_signal_predict_train = _to_action_signal(pred_train)
    action_signal_predict_test = _to_action_signal(pred_test)

    return action_signal_train, action_signal_test, action_signal_predict_train, action_signal_predict_test

# metrics_mapping_action_signals

## - Inputs
     - action_signal_train: training data mapped to action signals
     - action_signal_test: test data mapped to action signals
     - action_signal_predict_train: prediction of training data mapped to action signals
     - action_signal_predict_test: prediction of test data mapped to action signals

## - Outputs
     - action_signal_train_metrics: training data mapped to action signals with "no action" signals removed
     - action_signal_test_metrics: test data mapped to action signals with "no action" signals removed
     - action_signal_predict_train_metrics: prediction of training data mapped to action signals with "no action" signals removed
     - action_signal_predict_test_metrics: prediction of test data mapped to action signals with "no action" signals removed

## Additional notes:
    - for the measurement of different metrics such as accuracy, precision, recall etc., we drop all cases where a "no action" signal is predicted and the truth was "positive" or "negative" action. We also drop the reverse scenario, where the truth was "no action" and the prediction was "positive" or "negative" action. Cases where both the truth and the prediction are "no action" are dropped as well.

In [14]:
def metrics_mapping_action_signals(action_signal_train, action_signal_test, action_signal_predict_train, action_signal_predict_test):
    """Remove every position where the truth or the prediction is "no action".

    A position is discarded when truth and prediction sum to +1 or -1 in
    magnitude (exactly one of them is 0 for signals in {-1, 0, 1}) or when both
    are 0. The remaining positions keep their original values.

    Parameters
    ----------
    action_signal_train, action_signal_test
        True action signals of the training and test sets.
    action_signal_predict_train, action_signal_predict_test
        Predicted action signals of the training and test sets.

    Returns
    -------
    tuple of ndarray
        ``(train truth, test truth, train prediction, test prediction)`` as
        float arrays holding only the non-zero entries.
    """
    def _drop_no_action(truth, predicted):
        #Same rule as an element-by-element check, evaluated for the whole array at once
        discard = (np.abs(truth + predicted) == 1) | ((truth == 0) & (predicted == 0))
        truth_marked = np.where(discard, 0.0, truth).astype(float)
        predicted_marked = np.where(discard, 0.0, predicted).astype(float)
        #Discarded positions were zeroed above; filtering on non-zero removes them
        return truth_marked[truth_marked != 0], predicted_marked[predicted_marked != 0]

    train_truth, train_predicted = _drop_no_action(action_signal_train, action_signal_predict_train)
    test_truth, test_predicted = _drop_no_action(action_signal_test, action_signal_predict_test)

    return train_truth, test_truth, train_predicted, test_predicted

# get_best_SVC
###  - Inputs:
        - model_type (str): toggles between selecting the best linear model or choosing from all types - valid inputs ('all', 'linear')
        - alphas (list): list containing the distinct values of the regularization parameter to be chosen between
        - folds (int): number of folds used for cross validation
        - X_train (DataFrame): training feature data
        - Y_train (DataFrame): training label data
        - verbose (int): 1 prints the selected parameters, any other value hides the message
###  - Returns Dataframe with Following Value Types:
        - best_model (estimator object): returns estimator selected based on the given parameters
###  - Additional Notes:
        - prints the parameters selected to generate the best model

In [15]:
def get_best_SVC(model_type, alphas, folds, X_train, Y_train, verbose):
    """Grid-search a support vector classifier and return the best estimator.

    Parameters
    ----------
    model_type : str
        ``'linear'`` searches ``LinearSVC`` over the penalty (l1, l2) and C;
        ``'all'`` searches ``SVC`` over C and the kernel (linear, poly,
        sigmoid, rbf).
    alphas : list
        Candidate values for C.
    folds : int
        Passed to ``GridSearchCV`` as ``cv``.
    X_train, Y_train
        Training features and labels.
    verbose : int
        When equal to 1 the selected parameters are printed.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``'linear'`` nor ``'all'``.
    """
    #Choose Estimator and Search Space
    if model_type == 'linear':
        estimator = LinearSVC(dual=False ,max_iter=500000)
        search_space = {'penalty':['l1','l2'],'C':alphas}
    elif model_type == 'all':
        estimator = SVC(max_iter=500000)
        search_space = {'C':alphas, 'kernel':['linear','poly','sigmoid','rbf']}
    else:
        raise ValueError('Invalid Parameter Input - model_type')

    #Cross-Validate
    search = GridSearchCV(estimator=estimator, param_grid=search_space, cv=folds)
    search.fit(X_train, Y_train)

    if verbose == 1:
        print("Best Parameters: ")
        print(search.best_params_)

    return search.best_estimator_

# get_best_knn_regressor
###  - Inputs:
        - lower_bound_k (int): lower bound of k values being selected from (inclusive)
        - upper_bound_k (int): upper bound of k values being selected from (inclusive)
        - step_size (int): step size between values of k in specified range [lower_bound_k, upper_bound_k]
        - folds (int): number of folds used for cross validation
        - X_train (DataFrame): training feature data
        - Y_train (DataFrame): training label data
###  - Returns Dataframe with Following Value Types:
        - best_model (estimator object): returns estimator selected based on the given parameters
###  - Additional Notes:
        - prints the parameters selected to generate the best model

In [16]:
def get_best_knn_regressor(lower_bound_k, upper_bound_k, step_size, folds, X_train, Y_train):
    """Grid-search a k-nearest-neighbours regressor and return the best estimator.

    The search covers ``n_neighbors`` in ``lower_bound_k, lower_bound_k +
    step_size, ...`` up to and including ``upper_bound_k``, the ``uniform`` and
    ``distance`` weightings, and ``p`` in 1, 2, 3. The selected parameters are
    printed.

    Parameters
    ----------
    lower_bound_k, upper_bound_k : int
        Inclusive bounds of the candidate k values.
    step_size : int
        Positive increment between candidate k values.
    folds : int
        Passed to ``GridSearchCV`` as ``cv``.
    X_train, Y_train
        Training features and targets.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``step_size`` is not positive (which previously looped forever) or
        if the bounds produce no candidate k at all.
    """
    if step_size <= 0:
        raise ValueError('Invalid Parameter Input - step_size')

    #Create List of Possible K Values
    k_vals = list()
    k = lower_bound_k
    while(k <= upper_bound_k):
        k_vals.append(k)
        k += step_size

    if not k_vals:
        raise ValueError('Invalid Parameter Input - lower_bound_k exceeds upper_bound_k')

    #Define Parameters to Cross-Validate
    params = {'n_neighbors':k_vals,'weights':['uniform','distance'],'p':[1,2,3]}

    #Select Best Parameters
    knn = KNeighborsRegressor()

    knn_reg = GridSearchCV(estimator=knn, param_grid=params, cv=folds)
    knn_reg.fit(X_train,Y_train)

    print("Best Parameters: ")
    print(knn_reg.best_params_)

    best_model = knn_reg.best_estimator_

    return best_model

# get_best_SVR
###  - Inputs:
        - model_type (str): toggles between selecting the best linear model or choosing from all types - valid inputs ('all', 'linear')
        - alphas (list): list containing the distinct values of the regularization parameter to be chosen between
        - degrees (list) - different degrees of the polynomial kernel function to be chosen from if 'poly' is selected. Ignored by all other kernels 
        - folds (int): number of folds used for cross validation
        - X_train (DataFrame): training feature data
        - Y_train (DataFrame): training label data
###  - Returns Dataframe with Following Value Types:
        - best_model (estimator object): returns estimator selected based on the given parameters
###  - Additional Notes:
        - prints the parameters selected to generate the best model

In [17]:
def get_best_SVR(model_type, alphas, degrees, folds, X_train, Y_train):
    """Grid-search a support vector regressor and return the best estimator.

    The selected parameters are always printed.

    Parameters
    ----------
    model_type : str
        ``'linear'`` searches ``LinearSVR`` (squared epsilon-insensitive loss)
        over C and ``fit_intercept``; ``'all'`` searches ``SVR`` over C, the
        kernel (linear, poly, sigmoid, rbf) and the polynomial degree.
    alphas : list
        Candidate values for C.
    degrees : list
        Candidate polynomial degrees; only used when ``model_type`` is ``'all'``.
    folds : int
        Passed to ``GridSearchCV`` as ``cv``.
    X_train, Y_train
        Training features and targets.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``'linear'`` nor ``'all'``.
    """
    #Choose Estimator, Search Space and Any Follow-Up Message
    if model_type == 'linear':
        estimator = LinearSVR(dual = False, max_iter=500000)
        search_space = {'loss':['squared_epsilon_insensitive'],'C':alphas, 'fit_intercept': [True, False]}
        follow_up = []
    elif model_type == 'all':
        estimator = SVR(max_iter=500000)
        search_space = {'C':alphas, 'kernel':['linear','poly','sigmoid','rbf'], 'degree': degrees }   #'gamma': ['scale','auto']
        follow_up = ["Ignore degree value if kernel is not 'poly' "]
    else:
        raise ValueError('Invalid Parameter Input - model_type')

    #Cross-Validate
    search = GridSearchCV(estimator=estimator, param_grid=search_space, cv=folds)
    search.fit(X_train, Y_train)

    print("Best Parameters: ")
    print(search.best_params_)
    for message in follow_up:
        print(message)

    return search.best_estimator_

# build_neural_network_cf
###  - Inputs:
        - features (int): number of features - default none (will throw error if int not provided)
###  - Returns Dataframe with Following Value Types:
        - model (keras estimator): compiled estimator specified by the function
###  - Additional Notes:
        - Neural Network structure needs to be tuned by hand - function does not handle this

In [18]:
def build_neural_network_cf(features=None):
    """Build and compile the fully connected network used for binary classification.

    Layers: Dense(64, relu) taking ``features`` inputs, Dense(64, relu),
    Dense(32, relu) and a single sigmoid output unit. Compiled with binary
    cross-entropy loss, the Adam optimizer and the accuracy metric.

    Parameters
    ----------
    features : int, optional
        Number of input features, passed as ``input_dim`` of the first layer.

    Returns
    -------
    keras model
        The compiled ``Sequential`` model.
    """
    layers = [
        #Input Layer
        Dense(64, input_dim=features, activation='relu'),
        #Hidden Layers
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        #Output Layer
        Dense(1, activation='sigmoid'),
    ]

    #Build Model
    model = Sequential()
    for layer in layers:
        model.add(layer)

    #Compile Model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# build_neural_network_reg
###  - Inputs:
        - features (int): number of features - default none (will throw error if int not provided)
###  - Returns Dataframe with Following Value Types:
        - model (keras estimator): compiled estimator specified by the function
###  - Additional Notes:
        - Neural Network structure needs to be tuned by hand - function does not handle this

In [19]:
def build_neural_network_reg(features=None):
    """Build and compile the fully connected network used for regression.

    Layers: Dense(64, relu) taking ``features`` inputs, two further
    Dense(64, relu) layers and a single linear output unit. Compiled with mean
    absolute error as both loss and metric, using the Adam optimizer.

    Parameters
    ----------
    features : int, optional
        Number of input features, passed as ``input_dim`` of the first layer.

    Returns
    -------
    keras model
        The compiled ``Sequential`` model.
    """
    layers = [
        #Input Layer
        Dense(64, input_dim=features, activation='relu'),
        #Hidden Layers
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        #Output Layer
        Dense(1, activation='linear'),
    ]

    #Build Model
    model = Sequential()
    for layer in layers:
        model.add(layer)

    #Compile Model
    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

    return model

 # all_models_cf
 ###  - Inputs:
        - file_name (str): name of file - ex. 'FB.txt'
        - csv_name (str): name of csv - ex. 'FB.csv'
        - features (int): number of text features to include
        - verbose (int): 0 to hide messages, 1 to show messages
###  - Returns Dataframe with Following Value Types:
        - result (list): list containing the name of the dataset, best model type, corresponding accuracy, and the fitted model
###  - Additional Notes:
        - none

In [20]:
def all_models_cf(file_name, csv_name, features, verbose):
    """Fit several classifiers on one stock's data and return the best by test accuracy.

    The candidates are the linear SVC returned by get_best_SVC, an L1 logistic
    regression (CV), a ridge classifier (CV), a random forest, extra trees and the
    Keras network from build_neural_network_cf. Data preparation uses a hard-coded
    period of 3 and the 'default' encoding.

    Inputs:
        - file_name (str): name of the twits file - ex. 'FB.txt'
        - csv_name (str): name of the price csv - ex. 'FB.csv'
        - features (int): number of text features to include
        - verbose (int): 0 to hide messages, 1 to show messages
    Returns:
        - result (list): [dataset name, best model label, best test accuracy, fitted best model]
    Notes:
        - ties are resolved in favour of the earlier candidate (SVC first)
        - the tree ensembles are not seeded here, so repeated runs can pick different winners
    """
    # Drop the extension whatever its length: 'FB.txt' and 'FB.json' both give 'FB'
    # (slicing off four characters left a trailing dot for '.json').
    name = os.path.splitext(file_name)[0]

    company = stock_twits_text_parser(file_name)
    company_metrics = stock_twits_metrics(company, 3, file_name)
    company_return = return_over_period_T(csv_name, 3)
    agg_data = feature_selector(company_metrics, company_return, 'close_return', 25, features, 'default')

    # Network input width: requested features + 3, capped by the number of columns
    # that actually exist from position 20 onward in the aggregated frame.
    available = agg_data.iloc[:, 20:].shape[1]
    all_features = (available if features > available else features) + 3

    company_data = split_data(agg_data, 0.7, 'all', 'close_return', 'bin', "default")

    #Split
    X_train = company_data['X_train']
    Y_train = company_data['Y_train']
    X_test = company_data['X_test']
    Y_test = company_data['Y_test']

    reg_vals = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]

    clf_lin_svc = get_best_SVC('linear', reg_vals, 5, X_train, Y_train, 0)

    keras_clf = KerasClassifier(build_fn=build_neural_network_cf, features=all_features,
                                epochs=100, batch_size=10, verbose=0)

    # Index 0 is the SVC; indices 1..5 follow the order of `pipelines`.
    pipe_dict = {0: 'SVC', 1: 'L1', 2: "L2", 3: "Random Forest", 4: "Extra Trees", 5: "Neural Network"}
    pipelines = [
        Pipeline([('clf_l1', LogisticRegressionCV(Cs=list(reg_vals), cv=5, penalty="l1", solver='liblinear'))]),
        Pipeline([('clf_l2', RidgeClassifierCV(alphas=list(reg_vals), cv=5))]),
        Pipeline([('clf_rf', RandomForestClassifier())]),
        Pipeline([('clf_et', ExtraTreesClassifier())]),
        Pipeline([('clf_nn', keras_clf)]),
    ]

    for pipe in pipelines:
        pipe.fit(X_train, Y_train)

    # Score every candidate exactly once and reuse the numbers for both reporting
    # and selection (scoring the network repeatedly is needlessly slow).
    candidates = [clf_lin_svc] + pipelines
    scores = [candidate.score(X_test, Y_test) for candidate in candidates]

    if verbose == 1:
        for position, score in enumerate(scores):
            print("{} Test Accuracy: {}".format(pipe_dict[position], score))

    # Strictly-greater comparison keeps the earliest candidate on ties.
    best_classifier = 0
    for position in range(1, len(scores)):
        if scores[position] > scores[best_classifier]:
            best_classifier = position

    best_accuracy = scores[best_classifier]
    best_model = candidates[best_classifier]

    # Honour verbose=0: callers looping over many datasets ask for silence.
    if verbose == 1:
        print("Best Model: {}".format(pipe_dict[best_classifier]))

    result = [name, pipe_dict[best_classifier], best_accuracy, best_model]

    return result

# all_datasets_cf
###  - Inputs:
        - companies (list) - list of company abbreviations eg. FB, AXP, CAT
        - n_features (int) - number of text features to include
###  - Returns Dataframe with Following Value Types:
        - table (DataFrame) - table of results containing the name of each stock, best estimator and the corresponding accuracy
###  - Additional Notes:
        - none

In [21]:
def all_datasets_cf(companies, n_features):
    """Run all_models_cf for every company and tabulate the winning model per dataset.

    Inputs:
        - companies (list): company abbreviations, eg. FB, AXP, CAT; each is expanded
          to '<name>.txt' and '<name>.csv'
        - n_features (int): number of text features to include
    Returns:
        - table (DataFrame): indexed by "Dataset", with columns "Best Model" and "Test Score"
    """
    rows = []
    for company in companies:
        result = all_models_cf(company + '.txt', company + '.csv', n_features, 0)
        # all_models_cf returns [name, model label, accuracy, fitted model]; the table
        # has only three columns, so the fitted model must be dropped or the
        # DataFrame constructor rejects the 4-wide rows.
        rows.append(list(result[:3]))

    #Create Table
    table = pd.DataFrame(rows, columns=["Dataset", "Best Model", "Test Score"])
    return table.set_index("Dataset")

# trader_single_stock
###  - Inputs:
        - predictions (numpy array): array of label predictions
        - period (int): time in days of when we are predicting the return for
        - metric (str): the label of the price being compared ex. 'close'
        - numerical_data (DataFrame): use output of return_over_period_T function
        - capital (int): initial amount of money for simulation
        - index (int): start index of test set
###  - Returns Dataframe with Following Value Types:
        - capital (int): amount of money we have at the end of the testing period
###  - Additional Notes:
        - to make the function more modular we can use len(X_test) for index inputs until there is a better solution for slicing (would be nice to eliminate this parameter)

In [22]:
def trader_single_stock(predictions, period, metric, numerical_data, capital, index):
    """Simulate trading one stock on the test window and return the final capital.

    Each step stakes one third of the currently available capital (if that covers the
    step's price) and, `period` steps later, settles the stake against the realised
    return: long for a prediction of 1, short for -1, flat otherwise. A fee of
    0.0075 per share is subtracted at settlement. Stakes that have not matured when
    the predictions run out are returned unchanged.

    Inputs:
        - predictions (sequence): label predictions, one per test step
        - period (int): number of steps between opening and settling a stake
        - metric (str): price label, ex. 'close'; reads columns '<metric>_return'
          (lower-cased) and '<METRIC>' (upper-cased) from numerical_data
        - numerical_data (DataFrame): frame holding the return and price columns
        - capital (number): initial amount of money
        - index (int): positional start of the test window within numerical_data
    Returns:
        - capital (number): money held at the end of the simulation
    """
    target = metric.lower() + '_return'

    # Restrict both series to the test window so step i lines up with predictions[i].
    labels = numerical_data[target].iloc[index:].values
    price_labels = numerical_data[metric.upper()].iloc[index:].values

    # Each open stake remembers the step it was opened at, so settlement always uses
    # the matching return/price even when some steps open nothing.
    open_positions = deque()  # (opened_at, prediction, stake)

    for i in range(len(predictions)):
        #assume we always invest 1/3 of our available cash
        cash = capital / 3

        if cash < price_labels[i]:
            #too poor to buy stock :( - open nothing, but still settle below
            pass
        else:
            open_positions.append((i, predictions[i], cash))
            capital -= cash

        # Settle whatever was opened `period` steps ago.
        while open_positions and open_positions[0][0] <= i - period:
            opened_at, prediction, past_investment = open_positions.popleft()

            if prediction == 1:
                #calculate long return
                delta = labels[opened_at]
            elif prediction == -1:
                #calculate short return
                delta = -labels[opened_at]
            else:
                #no directional bet for this stake
                delta = 0

            #calculate profit and update capital value
            shares = past_investment / price_labels[opened_at]
            profit = (delta + 1) * past_investment - 0.0075 * shares
            capital += profit

    # Hand back stakes that never matured (there may be fewer than `period` of them).
    while open_positions:
        capital += open_positions.popleft()[2]

    return capital

# trade_portfolio

###  - Inputs:
        - stock_portfolio: list of stock ticker names that will be traded
        - algo_of_choice: set to default as Linear SVM for now, will be integrated for any algo in the future. if set to 'optimal', best algorithm for each stock is used using all_models_cf function
        - capital: amount of capital to invest, will be distributed evenly amongst the stocks
        - metric (str): the label of the price being compared ex. 'close'
        - num_features: number of chi squared features to use
        - period: period of T day return
###  - Returns Dataframe with Following Value Types:
        - all_company_capital (int): amount of money we have at the end of the testing period for each stock in our portfolio
###  - Additional Notes:
        - will be integrated to run any algo in future updates

In [None]:
def trade_portfolio(stock_portfolio, algo_of_choice , capital, metric, num_features, period ):
    """Trade every stock in a portfolio with an equal share of the capital.

    Inputs:
        - stock_portfolio (list): stock ticker names that will be traded
        - algo_of_choice (str): 'Linear SVM' to use the model from get_best_SVC, or
          'optimal' to use the best model reported by all_models_cf for each stock
        - capital (number): total capital, split evenly between the stocks
        - metric (str): the label of the price being compared ex. 'close'
        - num_features (int): forwarded to all_models_cf as `features`
        - period (int): forwarded to trader_single_stock
    Returns:
        - all_company_capital (dict): ticker -> capital at the end of the testing period
    Raises:
        - ValueError: if algo_of_choice is not one of the supported names
    Notes:
        - the train/test data comes from return_cleaned_train_test_labels_split, which
          is called here with only the ticker and encoding; this function's
          num_features and period are not forwarded to it
    """
    supported_algos = ('Linear SVM', 'optimal')
    if algo_of_choice not in supported_algos:
        # Fail before any data is loaded instead of hitting an unbound `pred_test` later.
        raise ValueError("Unsupported algo_of_choice {!r}; expected one of {}".format(
            algo_of_choice, supported_algos))

    num_stocks = len(stock_portfolio)
    all_company_capital = {}

    for stock in stock_portfolio:

        file_name = stock + '.json'
        csv_name = stock + '.csv'

        # data layout: [X_train, Y_train, X_test, Y_test, company_return]
        data = return_cleaned_train_test_labels_split(stock, encoding = 'default')
        X_train = data[0]
        Y_train = data[1]
        X_test = data[2]
        company_return = data[4]

        if algo_of_choice == 'Linear SVM':
            #Define Values for Regularization Parameter
            reg_vals = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]

            #Find Best Model
            clf_lin_svc = get_best_SVC('linear', reg_vals, 5, X_train, Y_train)

            #Predict
            pred_test = clf_lin_svc.predict(X_test)
        else:
            # 'optimal': element 3 of the all_models_cf result is the fitted best model
            optimal_model = all_models_cf(file_name, csv_name, features = num_features, verbose = 1)
            pred_test = optimal_model[3].predict(X_test)

        # The test window starts right after the training rows, hence len(X_train).
        all_company_capital[stock] = trader_single_stock(
            pred_test, period, metric, company_return, capital/num_stocks, len(X_train))

    return all_company_capital
        

# return_cleaned_train_test_labels_split

### - Inputs:
        - stock: ticker name of the stock
        - encoding: default or optimized (see convert_to_binary_classification)
### - Returns:
        - data: array with format [X_train, Y_train, X_test, Y_test, company_return]
    

In [None]:
def return_cleaned_train_test_labels_split(stock, encoding, period=None, num_features=None):
    """Load one stock's data, select features and return the train/test split.

    Inputs:
        - stock (str): ticker name; expanded to '<stock>.json' and '<stock>.csv'
        - encoding (str): forwarded to feature_selector and split_data
        - period (int, optional): forwarded to stock_twits_metrics and
          return_over_period_T; when omitted, the notebook-level `period` is used
        - num_features (int, optional): forwarded to feature_selector; when omitted,
          the notebook-level `num_features` is used
    Returns:
        - data (list): [X_train, Y_train, X_test, Y_test, company_return]
    Raises:
        - NameError: if a setting is neither passed nor defined at notebook level
    """
    def _resolve(setting, value):
        # Earlier versions read these two settings straight from notebook globals;
        # keep that as a fallback so existing two-argument calls behave as before,
        # but report a missing setting up front and by name.
        if value is not None:
            return value
        notebook_scope = globals()
        if setting not in notebook_scope:
            raise NameError(
                "`{0}` was not passed and no notebook-level `{0}` is defined".format(setting))
        return notebook_scope[setting]

    period = _resolve('period', period)
    num_features = _resolve('num_features', num_features)

    file_name = stock + '.json'
    csv_name = stock + '.csv'

    #Features
    company = stock_twits_text_parser(file_name)
    company_metrics = stock_twits_metrics(company, period, file_name)

    #Labels
    company_return = return_over_period_T(csv_name, period)

    #Cleaned Data
    agg_data = feature_selector(company_metrics, company_return, 'close_return', 25, num_features, encoding)
    company_data = split_data(agg_data, 0.7, 'all', 'close_return', 'bin', encoding)

    #Split
    data = [
        company_data['X_train'],
        company_data['Y_train'],
        company_data['X_test'],
        company_data['Y_test'],
        company_return,
    ]

    return data




# Testing

In [23]:
# Disabled smoke test of the FB data pipeline, kept as an inert string literal
# (nothing below is executed; the cell only echoes the string).
# NOTE(review): feature_selector is called with five arguments here, whereas the live
# calls in all_models_cf and return_cleaned_train_test_labels_split pass a sixth
# (encoding) - confirm against feature_selector's signature before re-enabling.
'''
file_name = 'FB.json'
csv_name = 'FB.csv'

company = stock_twits_text_parser(file_name)
company_metrics = stock_twits_metrics(company, 3, file_name)
company_return = return_over_period_T(csv_name,3)
agg_data = feature_selector(company_metrics, company_return, 'close_return', 25, 500)


'''

"\nfile_name = 'FB.json'\ncsv_name = 'FB.csv'\n\ncompany = stock_twits_text_parser(file_name)\ncompany_metrics = stock_twits_metrics(company, 3, file_name)\ncompany_return = return_over_period_T(csv_name,3)\nagg_data = feature_selector(company_metrics, company_return, 'close_return', 25, 500)\n\n\n"

In [24]:
#company_data = split_data(agg_data, 0.7, 'all', 'close_return', 'bin')


In [25]:
#company_data