In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import length, col
from pyspark import SparkContext
sc = SparkContext()
sql = SQLContext(sc)

In [2]:
import numpy as np
import pandas as pd
import yfinance as yf
import pyspark.ml as ml
from functools import reduce
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import  BinaryClassificationMetrics,MulticlassMetrics
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Import data

In [3]:
# Financial Data
AMZN_finance = sql.read.csv("AMZN_stock", inferSchema = True,header = True)

# News sentement data
AMZN_news = sql.read.csv("AMZN_Sentiment.csv", inferSchema = True,header = True)

# Join two dataframes
AMZN_df = AMZN_finance.join(AMZN_news, "Date")

In [38]:
AMZN_df.drop("Symbol","Date","Low","Open", "Adj_Close").toPandas()

Unnamed: 0,High,Close,Volume,today_trend,tomorrow_trend,RSI,SMA,K,sentement_score
0,625.989990,617.890015,4724100.0,Downtrend,Downtrend,26.310611,623.683997,32.662754,11.2974
1,620.880005,581.809998,7655200.0,Downtrend,Uptrend,26.581704,622.693329,3.373249,17.8131
2,602.250000,593.000000,7238000.0,Uptrend,Downtrend,14.103591,622.007141,26.320588,24.2581
3,584.619995,570.179993,7784500.0,Downtrend,Uptrend,25.019894,616.982498,5.280249,18.9452
4,584.000000,574.479980,4807200.0,Downtrend,Downtrend,19.870135,614.317776,9.932909,20.6469
...,...,...,...,...,...,...,...,...,...
1058,1956.489990,1955.489990,6221300.0,Uptrend,Downtrend,48.107473,1835.497131,99.543762,19.6524
1059,1939.790039,1900.099976,5387900.0,Downtrend,Uptrend,52.576539,1832.769270,82.808101,46.8092
1060,1973.630005,1963.949951,6126100.0,Uptrend,Downtrend,54.922371,1836.654986,97.215174,-3.1266
1061,1993.020020,1949.719971,5123600.0,Downtrend,Downtrend,53.668012,1843.761414,88.201300,18.2522


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 60436)
Traceback (most recent call last):
  File "C:\Users\taylankabbani2019\AppData\Local\Continuum\anaconda3\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "C:\Users\taylankabbani2019\AppData\Local\Continuum\anaconda3\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "C:\Users\taylankabbani2019\AppData\Local\Continuum\anaconda3\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "C:\Users\taylankabbani2019\AppData\Local\Continuum\anaconda3\lib\socketserver.py", line 720, in __init__
    self.handle()
  File "C:\Users\taylankabbani2019\Documents\Spark\python\pyspark\accumulators.py", line 268, in handle
    poll(accum_updates)
  File "C:\Users\taylankabbani2019\Documents\Spark\python\pyspark

## Date pre-processing

### Feature correlation

In [4]:
# Creating features vector,lable column and handling categorical variables(uptrend =0, Downtrend =1)
formula = RFormula(formula="tomorrow_trend ~. - Date - Symbol",
                   featuresCol="features",
                   labelCol="label",
                  stringIndexerOrderType = 'frequencyAsc')

# Merge lable and features vector in one vector
assembler = VectorAssembler(inputCols=['label','features'], outputCol='vector_col')

#fit_transform the data:
corr_pipe = ml.Pipeline(stages = [formula,assembler])
corr_vector = corr_pipe.fit(AMZN_df).transform(AMZN_df)

# Pairwise correlation matrix between attributes
corr_matrix = Correlation.corr(corr_vector,'vector_col').collect()[0][0].toArray()

# correlation matrix plot 
attr = ['tomorrow_trend','High','Low','Open','Close','Volume','Adj_close','today_trend','RSI','SMA','K', 'sentement_score'] 
corr = pd.DataFrame(corr_matrix,columns =attr, index= attr )
corr.style.background_gradient(cmap= 'PuBu')

Unnamed: 0,tomorrow_trend,High,Low,Open,Close,Volume,Adj_close,today_trend,RSI,SMA,K,sentement_score
tomorrow_trend,1.0,0.043866,0.043962,0.0448986,0.0436724,0.0232528,0.0436724,-0.0124085,0.03074,0.0409609,0.00852896,0.0819178
High,0.043866,1.0,0.999194,0.999582,0.999503,0.148424,0.999503,-0.0109983,-0.000203654,0.993375,-0.0316632,0.566725
Low,0.043962,0.999194,1.0,0.999422,0.999527,0.118131,0.999527,-0.00914188,0.010308,0.991417,-0.0188782,0.566621
Open,0.0448986,0.999582,0.999422,1.0,0.999055,0.1361,0.999055,-0.0246161,0.00533487,0.992706,-0.0324275,0.566785
Close,0.0436724,0.999503,0.999527,0.999055,1.0,0.132562,1.0,0.00373069,0.00319335,0.992337,-0.0194872,0.566883
Volume,0.0232528,0.148424,0.118131,0.1361,0.132562,1.0,0.132562,-0.0413596,-0.197467,0.161649,-0.238826,0.113567
Adj_close,0.0436724,0.999503,0.999527,0.999055,1.0,0.132562,1.0,0.00373069,0.00319335,0.992337,-0.0194872,0.566883
today_trend,-0.0124085,-0.0109983,-0.00914188,-0.0246161,0.00373069,-0.0413596,0.00373069,1.0,-0.0243801,-0.0170135,0.279769,-0.0228326
RSI,0.03074,-0.000203654,0.010308,0.00533487,0.00319335,-0.197467,0.00319335,-0.0243801,1.0,-0.0776002,0.660762,0.00646889
SMA,0.0409609,0.993375,0.991417,0.992706,0.992337,0.161649,0.992337,-0.0170135,-0.0776002,1.0,-0.117091,0.558697


### Building ML pipeline

In [5]:
# split the data, %80 train %20 test
train , test = AMZN_df.randomSplit([0.8, 0.2], seed = 1996)

print("Number or rows in train set: {} \nNumber of rows is test set: {}".format(train.count(),test.count()))

Number or rows in train set: 869 
Number of rows is test set: 194


In [6]:
""" rformula does the following:
    1) Seperate the data columns into label column and features vectore
    2) Drop highly correlated features
    3) One-hot-encoding (uptrend =0, Downtrend =1)
    """
rformula = RFormula(formula="tomorrow_trend ~. -Date -Symbol -Low -Open -Adj_close +RSI:SMA",
                   featuresCol="features",
                   labelCol="label",
                  stringIndexerOrderType = 'frequencyAsc')


# standardization
# scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures", min= -1, max=1 )



#Building pipeline
ml_pipe = ml.Pipeline(stages =[rformula, scaler])


# Preparing train and test data to be fed to ML model
ml_model = ml_pipe.fit(train)
ml_train = ml_model.transform(train).select("label","scaledFeatures")
ml_test = ml_model.transform(test).select("label","scaledFeatures")

In [35]:
ml_train.show(5)
ml_test.show(5)

+-----+--------------------+
|label|      scaledFeatures|
+-----+--------------------+
|  0.0|[-0.8494726478681...|
|  1.0|[-0.8714880757038...|
|  0.0|[-0.8996129697682...|
|  0.0|[-0.8873703798536...|
|  0.0|[-0.8740288000692...|
+-----+--------------------+
only showing top 5 rows

+-----+--------------------+
|label|      scaledFeatures|
+-----+--------------------+
|  1.0|[-0.8434340818850...|
|  0.0|[-0.8923217846955...|
|  1.0|[-0.8930544446087...|
|  0.0|[-0.9942332062706...|
|  0.0|[-0.9634257873497...|
+-----+--------------------+
only showing top 5 rows



### Define evaluation metrics

In [8]:
# Accuracy Evaluater
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [9]:
# Confusion matrix
def conf_matrix(df):
    matrix_cof = MulticlassMetrics(df.select("prediction", "label") \
                                           .rdd.map(lambda x: (float(x[0]),float(x[1]))))
    print("model's Accuracy: {} \nmodel's precision(Uptrend): {} \nmodel's precision(Downtrend): {}  \nmodel's Recall(Uptrend): {} \nmodel's Recall(Downtrend): {} \nmodel's f1_score(Uptrend): {} \nmodel's f1_score(Downtrend): {}" \
          .format(matrix_cof.accuracy, matrix_cof.precision(0.0), matrix_cof.precision(1.0), 
                 matrix_cof.recall(0.0), matrix_cof.recall(1.0), matrix_cof.weightedFMeasure(0.0), matrix_cof.weightedFMeasure(1.0)))
    print(matrix_cof.confusionMatrix().toArray())

# Run ML models
### All models will be trained by grid search with 10-fold-cross_validation

### Logistic Regression

In [10]:
#Logistic regression model
lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_lr = ParamGridBuilder() \
            .addGrid(lr.elasticNetParam, [0, 0.25, 0.5, 0.75, 1]) \
            .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1, 10]) \
            .addGrid(lr.fitIntercept, [True, False]) \
            .build()

# 10-fold cross validation
lr_cv = CrossValidator(estimator=lr,
                          estimatorParamMaps=params_lr,
                          evaluator= accuracy_evaluator,
                          numFolds=10,
                          seed=1999,
                          parallelism=4) 


# Training the model
lr_model = lr_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(lr_model.avgMetrics))

# Best model params:
print(lr_model.bestModel.explainParam('elasticNetParam'),lr_model.bestModel.explainParam('regParam'),
      lr_model.bestModel.explainParam('fitIntercept'), sep ='\n')


elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.0)
regParam: regularization parameter (>= 0). (default: 0.0, current: 0.001)
fitIntercept: whether to fit an intercept term. (default: True, current: True)


In [11]:
# Train metices
lr_train = lr_model.transform(ml_train)
conf_matrix(lr_train)

model's Accuracy: 0.57307249712313 
model's precision(Uptrend): 0.5727891156462585 
model's precision(Downtrend): 0.5746268656716418  
model's Recall(Uptrend): 0.8807531380753139 
model's Recall(Downtrend): 0.1969309462915601 
model's f1_score(Uptrend): 0.5736159974183239 
model's f1_score(Downtrend): 0.513803770747221
[[421.  57.]
 [314.  77.]]


In [12]:
# Test the model
lr_test = lr_model.transform(ml_test)
conf_matrix(lr_test)

model's Accuracy: 0.5824742268041238 
model's precision(Uptrend): 0.5974842767295597 
model's precision(Downtrend): 0.5142857142857142  
model's Recall(Uptrend): 0.8482142857142857 
model's Recall(Downtrend): 0.21951219512195122 
model's f1_score(Uptrend): 0.5623178740471095 
model's f1_score(Downtrend): 0.5348183220135135
[[95. 17.]
 [64. 18.]]


### Random Forest

In [13]:
#Random Forest model
RF = RandomForestClassifier(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_RF = ParamGridBuilder() \
            .addGrid(RF.maxDepth, [2, 3, 4]) \
            .addGrid(RF.numTrees, [5, 8, 10, 12]) \
            .addGrid(RF.minInstancesPerNode, [1, 2, 3, 4]) \
            .addGrid(RF.minInfoGain, [0 ,1, 2]) \
            .addGrid(RF.featureSubsetStrategy,["auto", "all", "sqrt", "log2"]) \
            .addGrid(RF.impurity, ["entropy","gini"]) \
            .build()
    
# 10-fold cross validation
RF_cv = CrossValidator(estimator=RF,
                          estimatorParamMaps=params_RF,
                          evaluator= accuracy_evaluator,
                          numFolds=5,
                          seed=1999,
                          parallelism=10) 

# Training the model
RF_model = RF_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(RF_model.avgMetrics))

# # # Best model params:
print(RF_model.bestModel.explainParam('maxDepth'),RF_model.bestModel.explainParam('numTrees'),
      RF_model.bestModel.explainParam('minInstancesPerNode'), RF_model.bestModel.explainParam('minInfoGain'),
      RF_model.bestModel.explainParam('featureSubsetStrategy'), RF_model.bestModel.explainParam('impurity'), sep ='\n')

maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 2)
numTrees: Number of trees to train (>= 1). (default: 20, current: 12)
minInstancesPerNode: Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1. (default: 1, current: 1)
minInfoGain: Minimum information gain for a split to be considered at a tree node. (default: 0.0, current: 0.0)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when 

In [14]:
# Train metices
RF_train = RF_model.transform(ml_train)
conf_matrix(RF_train)

model's Accuracy: 0.5776754890678941 
model's precision(Uptrend): 0.5705209656925032 
model's precision(Downtrend): 0.6463414634146342  
model's Recall(Uptrend): 0.9393305439330544 
model's Recall(Downtrend): 0.13554987212276215 
model's f1_score(Uptrend): 0.6046358271532088 
model's f1_score(Downtrend): 0.4913083990851982
[[449.  29.]
 [338.  53.]]


In [15]:
# Test the model
RF_test = RF_model.transform(ml_test)
conf_matrix(RF_test)

model's Accuracy: 0.5876288659793815 
model's precision(Uptrend): 0.5898876404494382 
model's precision(Downtrend): 0.5625  
model's Recall(Uptrend): 0.9375 
model's Recall(Downtrend): 0.10975609756097561 
model's f1_score(Uptrend): 0.5783114212903973 
model's f1_score(Downtrend): 0.4956941895137009
[[105.   7.]
 [ 73.   9.]]


### Gradient Boosting Machine

In [16]:
#Gradient Boosting model
GBM = GBTClassifier(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_GBM = ParamGridBuilder() \
            .addGrid(GBM.maxDepth, [1, 2, 4]) \
            .addGrid(GBM.minInstancesPerNode, [1, 2, 3]) \
            .addGrid(GBM.minInfoGain, [0,1,2]) \
            .addGrid(GBM.stepSize, [0.1, 0.5, 1]) \
            .build()

# 10-fold cross validation
GBM_cv = CrossValidator(estimator=GBM,
                          estimatorParamMaps=params_GBM,
                          evaluator= accuracy_evaluator,
                          numFolds=10,
                          seed=1996,
                          parallelism=10) 
# Training the model
GBM_model = GBM_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(RF_model.avgMetrics))

# Best model params:
print(GBM_model.bestModel.explainParam('maxDepth'),GBM_model.bestModel.explainParam('minInstancesPerNode'),
      GBM_model.bestModel.explainParam('minInfoGain'), GBM_model.bestModel.explainParam('featureSubsetStrategy'), 
      GBM_model.bestModel.explainParam('stepSize'), sep ='\n')

maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 1)
minInstancesPerNode: Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1. (default: 1, current: 1)
minInfoGain: Minimum information gain for a split to be considered at a tree node. (default: 0.0, current: 0.0)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in t

In [17]:
# Train metices
GBM_train = GBM_model.transform(ml_train)
conf_matrix(GBM_train)

model's Accuracy: 0.5960874568469505 
model's precision(Uptrend): 0.5845539280958721 
model's precision(Downtrend): 0.6694915254237288  
model's Recall(Uptrend): 0.9184100418410042 
model's Recall(Downtrend): 0.2020460358056266 
model's f1_score(Uptrend): 0.6227709598049538 
model's f1_score(Downtrend): 0.5326299766954038
[[439.  39.]
 [312.  79.]]


In [18]:
# Test the model
GBM_test = GBM_model.transform(ml_test)
conf_matrix(GBM_test)

model's Accuracy: 0.5412371134020618 
model's precision(Uptrend): 0.5664739884393064 
model's precision(Downtrend): 0.3333333333333333  
model's Recall(Uptrend): 0.875 
model's Recall(Downtrend): 0.08536585365853659 
model's f1_score(Uptrend): 0.46793000019863734 
model's f1_score(Downtrend): 0.4544855282034532
[[98. 14.]
 [75.  7.]]
