In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import length, col
from pyspark import SparkContext
sc = SparkContext()
sql = SQLContext(sc)

In [2]:
import numpy as np
import pandas as pd
import yfinance as yf
import pyspark.ml as ml
from functools import reduce
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import  BinaryClassificationMetrics,MulticlassMetrics
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Import data

In [3]:
# Financial Data
AAPL_finance = sql.read.csv("AAPL_stock", inferSchema = True,header = True)

# News sentement data
AAPL_news = sql.read.csv("AAPL_Sentiment.csv", inferSchema = True,header = True)

# Join two dataframes
AAPL_df = AAPL_finance.join(AAPL_news, "Date")

## Date pre-processing

### Feature correlation

In [4]:
# Creating features vector,lable column and handling categorical variables(uptrend =0, Downtrend =1)
formula = RFormula(formula="tomorrow_trend ~. - Date - Symbol + RSI:SMA:K +Close:Open:Volume:Low:High",
                   featuresCol="features",
                   labelCol="label",
                  stringIndexerOrderType = 'frequencyAsc')

# Merge lable and features vector in one vector
assembler = VectorAssembler(inputCols=['label','features'], outputCol='vector_col')

#fit_transform the data:
corr_pipe = ml.Pipeline(stages = [formula,assembler])
corr_vector = corr_pipe.fit(AAPL_df).transform(AAPL_df)

# Pairwise correlation matrix between attributes
corr_matrix = Correlation.corr(corr_vector,'vector_col').collect()[0][0].toArray()

# correlation matrix plot 
attr = ['tomorrow_trend','High','Low','Open','Close','Volume','Adj_close','today_trend','RSI','SMA','K', 'sentement_score',"1","2"] 
corr = pd.DataFrame(corr_matrix,columns =attr, index= attr )
corr.style.background_gradient(cmap= 'PuBu')

Unnamed: 0,tomorrow_trend,High,Low,Open,Close,Volume,Adj_close,today_trend,RSI,SMA,K,sentement_score,1,2
tomorrow_trend,1.0,0.0224568,0.0203068,0.020329,0.023035,0.0567244,0.0224468,-0.049113,-0.0231841,0.0287897,-0.0279375,-0.00135558,-0.00817548,0.0388909
High,0.0224568,1.0,0.999019,0.999451,0.999376,-0.0296736,0.999286,0.00793026,0.13924,0.991033,0.121224,0.0973353,0.54345,0.774303
Low,0.0203068,0.999019,1.0,0.999424,0.999411,-0.0604965,0.999118,0.0055263,0.155363,0.987041,0.139464,0.102632,0.561876,0.757285
Open,0.020329,0.999451,0.999424,1.0,0.998991,-0.0445214,0.998799,0.0211638,0.1478,0.989198,0.124634,0.101452,0.549744,0.764924
Close,0.023035,0.999376,0.999411,0.998991,1.0,-0.0467617,0.999781,-0.00738663,0.146913,0.988661,0.137329,0.100455,0.556932,0.765591
Volume,0.0567244,-0.0296736,-0.0604965,-0.0445214,-0.0467617,1.0,-0.0438824,0.0717553,-0.346347,0.0198874,-0.378453,0.00653634,-0.335165,0.33805
Adj_close,0.0224468,0.999286,0.999118,0.998799,0.999781,-0.0438824,1.0,-0.00692998,0.142594,0.989183,0.133746,0.099931,0.554007,0.766618
today_trend,-0.049113,0.00793026,0.0055263,0.0211638,-0.00738663,0.0717553,-0.00692998,1.0,-0.00231647,0.0203286,-0.281183,0.036468,-0.144625,0.0109771
RSI,-0.0231841,0.13924,0.155363,0.1478,0.146913,-0.346347,0.142594,-0.00231647,1.0,0.0459379,0.749765,0.0572584,0.767562,-0.00283556
SMA,0.0287897,0.991033,0.987041,0.989198,0.988661,0.0198874,0.989183,0.0203286,0.0459379,1.0,0.0175044,0.0741822,0.446766,0.792776


### Building ML pipeline

In [5]:
# split the data, %80 train %20 test
train , test = AAPL_df.randomSplit([0.8, 0.2], seed = 1996)

print("Number or rows in train set: {} \nNumber of rows is test set: {}".format(train.count(),test.count()))

Number or rows in train set: 870 
Number of rows is test set: 194


In [6]:
""" rformula does the following:
    1) Seperate the data columns into label column and features vectore
    2) Drop highly correlated features
    3) One-hot-encoding (uptrend =0, Downtrend =1)
    """
rformula = RFormula(formula="tomorrow_trend ~. -Date -Symbol -Low -Open -Adj_close +RSI:SMA",
                   featuresCol="features",
                   labelCol="label",
                  stringIndexerOrderType = 'frequencyAsc')


# standardization
# scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures", min= -1, max=1 )



#Building pipeline
ml_pipe = ml.Pipeline(stages =[rformula, scaler])


# Preparing train and test data to be fed to ML model
ml_model = ml_pipe.fit(train)
ml_train = ml_model.transform(train).select("label","scaledFeatures")
ml_test = ml_model.transform(test).select("label","scaledFeatures")

In [7]:
ml_train.show(5)
ml_test.show(5)

+-----+--------------------+
|label|      scaledFeatures|
+-----+--------------------+
|  1.0|[-0.9234132527650...|
|  0.0|[-0.9191678696742...|
|  0.0|[-0.9407344235491...|
|  1.0|[-0.9446401682191...|
|  0.0|[-0.9472723497854...|
+-----+--------------------+
only showing top 5 rows

+-----+--------------------+
|label|      scaledFeatures|
+-----+--------------------+
|  0.0|[-0.9372532430999...|
|  1.0|[-0.9251963058896...|
|  1.0|[-0.9487157644892...|
|  1.0|[-0.9657822226526...|
|  0.0|[-0.9759711161588...|
+-----+--------------------+
only showing top 5 rows



### Define evaluation metrics

In [8]:
# Accuracy Evaluater
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [9]:
# Confusion matrix
def conf_matrix(df):
    matrix_cof = MulticlassMetrics(df.select("prediction", "label") \
                                           .rdd.map(lambda x: (float(x[0]),float(x[1]))))
    print("model's Accuracy: {} \nmodel's precision(Uptrend): {} \nmodel's precision(Downtrend): {}  \nmodel's Recall(Uptrend): {} \nmodel's Recall(Downtrend): {} \nmodel's f1_score(Uptrend): {} \nmodel's f1_score(Downtrend): {}" \
          .format(matrix_cof.accuracy, matrix_cof.precision(0.0), matrix_cof.precision(1.0), 
                 matrix_cof.recall(0.0), matrix_cof.recall(1.0), matrix_cof.weightedFMeasure(0.0), matrix_cof.weightedFMeasure(1.0)))
    print(matrix_cof.confusionMatrix().toArray())

# Run ML models
### All models will be trained by grid search with 10-fold-cross_validation

### Logistic Regression

In [10]:
#Logistic regression model
lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_lr = ParamGridBuilder() \
            .addGrid(lr.elasticNetParam, [0, 0.25, 0.5, 0.75, 1]) \
            .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1, 10]) \
            .addGrid(lr.fitIntercept, [True, False]) \
            .build()

# 10-fold cross validation
lr_cv = CrossValidator(estimator=lr,
                          estimatorParamMaps=params_lr,
                          evaluator= accuracy_evaluator,
                          numFolds=10,
                          seed=1999,
                          parallelism=4) 


# Training the model
lr_model = lr_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(lr_model.avgMetrics))

# Best model params:
print(lr_model.bestModel.explainParam('elasticNetParam'),lr_model.bestModel.explainParam('regParam'),
      lr_model.bestModel.explainParam('fitIntercept'), sep ='\n')


elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.5)
regParam: regularization parameter (>= 0). (default: 0.0, current: 0.01)
fitIntercept: whether to fit an intercept term. (default: True, current: True)


In [11]:
# Train metices
lr_train = lr_model.transform(ml_train)
conf_matrix(lr_train)

model's Accuracy: 0.5517241379310345 
model's precision(Uptrend): 0.5483028720626631 
model's precision(Downtrend): 0.5769230769230769  
model's Recall(Uptrend): 0.9051724137931034 
model's Recall(Downtrend): 0.1477832512315271 
model's f1_score(Uptrend): 0.5616589676641897 
model's f1_score(Downtrend): 0.4740315638450502
[[420.  44.]
 [346.  60.]]


In [12]:
# Test the model
lr_test = lr_model.transform(ml_test)
conf_matrix(lr_test)

model's Accuracy: 0.5773195876288659 
model's precision(Uptrend): 0.5808383233532934 
model's precision(Downtrend): 0.5555555555555556  
model's Recall(Uptrend): 0.8899082568807339 
model's Recall(Downtrend): 0.17647058823529413 
model's f1_score(Uptrend): 0.5697608219986144 
model's f1_score(Downtrend): 0.5122876245971271
[[97. 12.]
 [70. 15.]]


### Random Forest

In [13]:
#Random Forest model
RF = RandomForestClassifier(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_RF = ParamGridBuilder() \
            .addGrid(RF.maxDepth, [2, 3, 4]) \
            .addGrid(RF.numTrees, [5, 8, 10, 12]) \
            .addGrid(RF.minInstancesPerNode, [1, 2, 3, 4]) \
            .addGrid(RF.minInfoGain, [0 ,1, 2]) \
            .addGrid(RF.featureSubsetStrategy,["auto", "all", "sqrt", "log2"]) \
            .addGrid(RF.impurity, ["entropy","gini"]) \
            .build()
    
# 10-fold cross validation
RF_cv = CrossValidator(estimator=RF,
                          estimatorParamMaps=params_RF,
                          evaluator= accuracy_evaluator,
                          numFolds=5,
                          seed=1999,
                          parallelism=10) 

# Training the model
RF_model = RF_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(RF_model.avgMetrics))

# # # Best model params:
print(RF_model.bestModel.explainParam('maxDepth'),RF_model.bestModel.explainParam('numTrees'),
      RF_model.bestModel.explainParam('minInstancesPerNode'), RF_model.bestModel.explainParam('minInfoGain'),
      RF_model.bestModel.explainParam('featureSubsetStrategy'), RF_model.bestModel.explainParam('impurity'), sep ='\n')

maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 2)
numTrees: Number of trees to train (>= 1). (default: 20, current: 5)
minInstancesPerNode: Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1. (default: 1, current: 1)
minInfoGain: Minimum information gain for a split to be considered at a tree node. (default: 0.0, current: 1.0)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n

In [14]:
# Train metices
RF_train = RF_model.transform(ml_train)
conf_matrix(RF_train)

model's Accuracy: 0.5333333333333333 
model's precision(Uptrend): 0.5333333333333333 
model's precision(Downtrend): 0.0  
model's Recall(Uptrend): 1.0 
model's Recall(Downtrend): 0.0 
model's f1_score(Uptrend): 0.28444444444444444 
model's f1_score(Downtrend): 0.37101449275362325
[[464.   0.]
 [406.   0.]]


In [15]:
# Test the model
RF_test = RF_model.transform(ml_test)
conf_matrix(RF_test)

model's Accuracy: 0.5618556701030928 
model's precision(Uptrend): 0.5618556701030928 
model's precision(Downtrend): 0.0  
model's Recall(Uptrend): 1.0 
model's Recall(Downtrend): 0.0 
model's f1_score(Uptrend): 0.3156817940269954 
model's f1_score(Downtrend): 0.4042393930114661
[[109.   0.]
 [ 85.   0.]]


### Gradient Boosting Machine

In [16]:
#Gradient Boosting model
GBM = GBTClassifier(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_GBM = ParamGridBuilder() \
            .addGrid(GBM.maxDepth, [1, 2, 4]) \
            .addGrid(GBM.minInstancesPerNode, [1, 2, 3]) \
            .addGrid(GBM.minInfoGain, [0,1,2]) \
            .addGrid(GBM.stepSize, [0.1, 0.5, 1]) \
            .build()

# 10-fold cross validation
GBM_cv = CrossValidator(estimator=GBM,
                          estimatorParamMaps=params_GBM,
                          evaluator= accuracy_evaluator,
                          numFolds=10,
                          seed=1996,
                          parallelism=10) 
# Training the model
GBM_model = GBM_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(RF_model.avgMetrics))

# Best model params:
print(GBM_model.bestModel.explainParam('maxDepth'),GBM_model.bestModel.explainParam('minInstancesPerNode'),
      GBM_model.bestModel.explainParam('minInfoGain'), GBM_model.bestModel.explainParam('featureSubsetStrategy'), 
      GBM_model.bestModel.explainParam('stepSize'), sep ='\n')

maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 1)
minInstancesPerNode: Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1. (default: 1, current: 1)
minInfoGain: Minimum information gain for a split to be considered at a tree node. (default: 0.0, current: 1.0)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in t

In [17]:
# Train metices
GBM_train = GBM_model.transform(ml_train)
conf_matrix(GBM_train)

model's Accuracy: 0.5333333333333333 
model's precision(Uptrend): 0.5333333333333333 
model's precision(Downtrend): 0.0  
model's Recall(Uptrend): 1.0 
model's Recall(Downtrend): 0.0 
model's f1_score(Uptrend): 0.28444444444444444 
model's f1_score(Downtrend): 0.37101449275362325
[[464.   0.]
 [406.   0.]]


In [18]:
# Test the model
GBM_test = GBM_model.transform(ml_test)
conf_matrix(GBM_test)

model's Accuracy: 0.5618556701030928 
model's precision(Uptrend): 0.5618556701030928 
model's precision(Downtrend): 0.0  
model's Recall(Uptrend): 1.0 
model's Recall(Downtrend): 0.0 
model's f1_score(Uptrend): 0.3156817940269954 
model's f1_score(Downtrend): 0.4042393930114661
[[109.   0.]
 [ 85.   0.]]
