In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import length, col
from pyspark import SparkContext
sc = SparkContext()
sql = SQLContext(sc)

In [2]:
import numpy as np
import pandas as pd
import yfinance as yf
import pyspark.ml as ml
from functools import reduce
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import  BinaryClassificationMetrics,MulticlassMetrics
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Import data

In [3]:
# Financial Data
NFLX_finance = sql.read.csv("NFLX_stock", inferSchema = True,header = True)

# News sentement data
NFLX_news = sql.read.csv("NFLX_Sentiment.csv", inferSchema = True,header = True)

# Join two dataframes
NFLX_df = NFLX_finance.join(NFLX_news, "Date")

## Date pre-processing

### Feature correlation

In [4]:
# Creating features vector,lable column and handling categorical variables(uptrend =0, Downtrend =1)
formula = RFormula(formula="tomorrow_trend ~. - Date - Symbol",
                   featuresCol="features",
                   labelCol="label",
                  stringIndexerOrderType = 'frequencyAsc')

# Merge lable and features vector in one vector
assembler = VectorAssembler(inputCols=['label','features'], outputCol='vector_col')

#fit_transform the data:
corr_pipe = ml.Pipeline(stages = [formula,assembler])
corr_vector = corr_pipe.fit(NFLX_df).transform(NFLX_df)

# Pairwise correlation matrix between attributes
corr_matrix = Correlation.corr(corr_vector,'vector_col').collect()[0][0].toArray()

# correlation matrix plot 
attr = ['tomorrow_trend','High','Low','Open','Close','Volume','Adj_close','today_trend','RSI','SMA','K', 'sentement_score'] 
corr = pd.DataFrame(corr_matrix,columns =attr, index= attr )
corr.style.background_gradient(cmap= 'PuBu')

Unnamed: 0,tomorrow_trend,High,Low,Open,Close,Volume,Adj_close,today_trend,RSI,SMA,K,sentement_score
tomorrow_trend,1.0,0.0380111,0.0384647,0.0380753,0.0403519,0.0186191,0.0403519,0.0914758,0.0464612,0.030505,0.0862078,0.0351938
High,0.0380111,1.0,0.999103,0.999372,0.999333,-0.0984988,0.999333,0.0180353,0.0415476,0.990441,0.0486123,0.482398
Low,0.0384647,0.999103,1.0,0.999289,0.999353,-0.12081,0.999353,0.0187977,0.0491839,0.988515,0.059519,0.478653
Open,0.0380753,0.999372,0.999289,1.0,0.998617,-0.108688,0.998617,0.000416534,0.0459727,0.989724,0.0460945,0.480483
Close,0.0403519,0.999333,0.999353,0.998617,1.0,-0.10879,1.0,0.0354186,0.0445768,0.989053,0.0623002,0.48068
Volume,0.0186191,-0.0984988,-0.12081,-0.108688,-0.10879,1.0,-0.10879,0.0243052,-0.0415023,-0.102383,-0.106967,0.0905213
Adj_close,0.0403519,0.999333,0.999353,0.998617,1.0,-0.10879,1.0,0.0354186,0.0445768,0.989053,0.0623002,0.48068
today_trend,0.0914758,0.0180353,0.0187977,0.000416534,0.0354186,0.0243052,0.0354186,1.0,-0.0418142,0.00822579,0.271028,0.000483004
RSI,0.0464612,0.0415476,0.0491839,0.0459727,0.0445768,-0.0415023,0.0445768,-0.0418142,1.0,-0.0536793,0.709245,0.00993108
SMA,0.030505,0.990441,0.988515,0.989724,0.989053,-0.102383,0.989053,0.00822579,-0.0536793,1.0,-0.0548471,0.476056


### Building ML pipeline

In [5]:
# split the data, %80 train %20 test
train , test = NFLX_df.randomSplit([0.8, 0.2], seed = 1996)

print("Number or rows in train set: {} \nNumber of rows is test set: {}".format(train.count(),test.count()))

Number or rows in train set: 871 
Number of rows is test set: 195


In [6]:
""" rformula does the following:
    1) Seperate the data columns into label column and features vectore
    2) Drop highly correlated features
    3) One-hot-encoding (uptrend =0, Downtrend =1)
    """
rformula = RFormula(formula="tomorrow_trend ~. -Date -Symbol -Low -Open -Adj_close +RSI:SMA",
                   featuresCol="features",
                   labelCol="label",
                  stringIndexerOrderType = 'frequencyAsc')


# standardization
# scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures", min= -1, max=1 )



#Building pipeline
ml_pipe = ml.Pipeline(stages =[rformula, scaler])


# Preparing train and test data to be fed to ML model
ml_model = ml_pipe.fit(train)
ml_train = ml_model.transform(train).select("label","scaledFeatures")
ml_test = ml_model.transform(test).select("label","scaledFeatures")

In [7]:
ml_train.show(5)
ml_test.show(5)

+-----+--------------------+
|label|      scaledFeatures|
+-----+--------------------+
|  0.0|[-0.8049097231497...|
|  0.0|[-0.8104043928508...|
|  1.0|[-0.8549525659499...|
|  0.0|[-0.8771084848938...|
|  1.0|[-0.8496942314600...|
+-----+--------------------+
only showing top 5 rows

+-----+--------------------+
|label|      scaledFeatures|
+-----+--------------------+
|  1.0|[-0.7785589556282...|
|  1.0|[-0.8045552428710...|
|  0.0|[-0.8240524244959...|
|  1.0|[-0.9613009768807...|
|  1.0|[-0.9556290670402...|
+-----+--------------------+
only showing top 5 rows



### Define evaluation metrics

In [8]:
# Accuracy Evaluater
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [9]:
# Confusion matrix
def conf_matrix(df):
    matrix_cof = MulticlassMetrics(df.select("prediction", "label") \
                                           .rdd.map(lambda x: (float(x[0]),float(x[1]))))
    print("model's Accuracy: {} \nmodel's precision(Uptrend): {} \nmodel's precision(Downtrend): {}  \nmodel's Recall(Uptrend): {} \nmodel's Recall(Downtrend): {} \nmodel's f1_score(Uptrend): {} \nmodel's f1_score(Downtrend): {}" \
          .format(matrix_cof.accuracy, matrix_cof.precision(0.0), matrix_cof.precision(1.0), 
                 matrix_cof.recall(0.0), matrix_cof.recall(1.0), matrix_cof.weightedFMeasure(0.0), matrix_cof.weightedFMeasure(1.0)))
    print(matrix_cof.confusionMatrix().toArray())

# Run ML models
### All models will be trained by grid search with 10-fold-cross_validation

### Logistic Regression

In [10]:
#Logistic regression model
lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_lr = ParamGridBuilder() \
            .addGrid(lr.elasticNetParam, [0, 0.25, 0.5, 0.75, 1]) \
            .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1, 10]) \
            .addGrid(lr.fitIntercept, [True, False]) \
            .build()

# 10-fold cross validation
lr_cv = CrossValidator(estimator=lr,
                          estimatorParamMaps=params_lr,
                          evaluator= accuracy_evaluator,
                          numFolds=10,
                          seed=1999,
                          parallelism=4) 


# Training the model
lr_model = lr_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(lr_model.avgMetrics))

# Best model params:
print(lr_model.bestModel.explainParam('elasticNetParam'),lr_model.bestModel.explainParam('regParam'),
      lr_model.bestModel.explainParam('fitIntercept'), sep ='\n')


elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.0)
regParam: regularization parameter (>= 0). (default: 0.0, current: 0.1)
fitIntercept: whether to fit an intercept term. (default: True, current: False)


In [11]:
# Train metices
lr_train = lr_model.transform(ml_train)
conf_matrix(lr_train)

model's Accuracy: 0.5545350172215844 
model's precision(Uptrend): 0.5508771929824562 
model's precision(Downtrend): 0.5614617940199336  
model's Recall(Uptrend): 0.7040358744394619 
model's Recall(Downtrend): 0.3976470588235294 
model's f1_score(Uptrend): 0.5560418949812254 
model's f1_score(Downtrend): 0.543676439871912
[[314. 132.]
 [256. 169.]]


In [12]:
# Test the model
lr_test = lr_model.transform(ml_test)
conf_matrix(lr_test)

model's Accuracy: 0.6102564102564103 
model's precision(Uptrend): 0.5869565217391305 
model's precision(Downtrend): 0.6666666666666666  
model's Recall(Uptrend): 0.81 
model's Recall(Downtrend): 0.4 
model's f1_score(Uptrend): 0.6257896692679301 
model's f1_score(Downtrend): 0.5926524455936221
[[81. 19.]
 [57. 38.]]


### Random Forest

In [13]:
#Random Forest model
RF = RandomForestClassifier(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_RF = ParamGridBuilder() \
            .addGrid(RF.maxDepth, [2, 3, 4]) \
            .addGrid(RF.numTrees, [5, 8, 10, 12]) \
            .addGrid(RF.minInstancesPerNode, [1, 2, 3, 4]) \
            .addGrid(RF.minInfoGain, [0 ,1, 2]) \
            .addGrid(RF.featureSubsetStrategy,["auto", "all", "sqrt", "log2"]) \
            .addGrid(RF.impurity, ["entropy","gini"]) \
            .build()
    
# 10-fold cross validation
RF_cv = CrossValidator(estimator=RF,
                          estimatorParamMaps=params_RF,
                          evaluator= accuracy_evaluator,
                          numFolds=5,
                          seed=1999,
                          parallelism=10) 

# Training the model
RF_model = RF_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(RF_model.avgMetrics))

# # # Best model params:
print(RF_model.bestModel.explainParam('maxDepth'),RF_model.bestModel.explainParam('numTrees'),
      RF_model.bestModel.explainParam('minInstancesPerNode'), RF_model.bestModel.explainParam('minInfoGain'),
      RF_model.bestModel.explainParam('featureSubsetStrategy'), RF_model.bestModel.explainParam('impurity'), sep ='\n')

maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 4)
numTrees: Number of trees to train (>= 1). (default: 20, current: 12)
minInstancesPerNode: Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1. (default: 1, current: 4)
minInfoGain: Minimum information gain for a split to be considered at a tree node. (default: 0.0, current: 0.0)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when 

In [15]:
# Train metices
RF_train = RF_model.transform(ml_train)
conf_matrix(RF_train)

model's Accuracy: 0.6555683122847302 
model's precision(Uptrend): 0.6356877323420075 
model's precision(Downtrend): 0.6876876876876877  
model's Recall(Uptrend): 0.7668161434977578 
model's Recall(Downtrend): 0.5388235294117647 
model's f1_score(Uptrend): 0.6610608448815185 
model's f1_score(Downtrend): 0.6507676067675798
[[342. 104.]
 [196. 229.]]


In [16]:
# Test the model
RF_test = RF_model.transform(ml_test)
conf_matrix(RF_test)

model's Accuracy: 0.6358974358974359 
model's precision(Uptrend): 0.6106870229007634 
model's precision(Downtrend): 0.6875  
model's Recall(Uptrend): 0.8 
model's Recall(Downtrend): 0.4631578947368421 
model's f1_score(Uptrend): 0.6481087296926992 
model's f1_score(Downtrend): 0.6248342852116437
[[80. 20.]
 [51. 44.]]


### Gradient Boosting Machine

In [17]:
#Gradient Boosting model
GBM = GBTClassifier(featuresCol='scaledFeatures', labelCol='label')

# Hyperparameter grid
params_GBM = ParamGridBuilder() \
            .addGrid(GBM.maxDepth, [1, 2, 4]) \
            .addGrid(GBM.minInstancesPerNode, [1, 2, 3]) \
            .addGrid(GBM.minInfoGain, [0,1,2]) \
            .addGrid(GBM.stepSize, [0.1, 0.5, 1]) \
            .build()

# 10-fold cross validation
GBM_cv = CrossValidator(estimator=GBM,
                          estimatorParamMaps=params_GBM,
                          evaluator= accuracy_evaluator,
                          numFolds=10,
                          seed=1996,
                          parallelism=10) 
# Training the model
GBM_model = GBM_cv.fit(ml_train)

# # The cross validation accuracy for each model:
# print("The avg accuracy of each model: {}".format(RF_model.avgMetrics))

# Best model params:
print(GBM_model.bestModel.explainParam('maxDepth'),GBM_model.bestModel.explainParam('minInstancesPerNode'),
      GBM_model.bestModel.explainParam('minInfoGain'), GBM_model.bestModel.explainParam('featureSubsetStrategy'), 
      GBM_model.bestModel.explainParam('stepSize'), sep ='\n')

maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 1)
minInstancesPerNode: Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1. (default: 1, current: 1)
minInfoGain: Minimum information gain for a split to be considered at a tree node. (default: 0.0, current: 0.0)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in t

In [18]:
# Train metices
GBM_train = GBM_model.transform(ml_train)
conf_matrix(GBM_train)

model's Accuracy: 0.6268656716417911 
model's precision(Uptrend): 0.6198019801980198 
model's precision(Downtrend): 0.6366120218579235  
model's Recall(Uptrend): 0.7017937219730942 
model's Recall(Downtrend): 0.548235294117647 
model's f1_score(Uptrend): 0.6280043541422897 
model's f1_score(Downtrend): 0.624524408629205
[[313. 133.]
 [192. 233.]]


In [19]:
# Test the model
GBM_test = GBM_model.transform(ml_test)
conf_matrix(GBM_test)

model's Accuracy: 0.6102564102564103 
model's precision(Uptrend): 0.5967741935483871 
model's precision(Downtrend): 0.6338028169014085  
model's Recall(Uptrend): 0.74 
model's Recall(Downtrend): 0.47368421052631576 
model's f1_score(Uptrend): 0.6148137792844744 
model's f1_score(Downtrend): 0.60296129573238
[[74. 26.]
 [50. 45.]]
