In [1]:
import pyspark
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [2]:
# Toy corpus: every row is [date, sentence, sentence, ...].
sentences = [
  ["2012-01-01", 'I will kill you', "my dad is good", "Cov19 good", "oh my load, what are you doing!"],
  ["2012-01-02", 'I will kill you', "my dad is good", "Cov19 good", "oh my load, what are you doing!"],
]

# Stock return per date.
stockRetures = {"2012-01-01": 0.1, "2012-01-02": -0.1}

# Sentiment lexicons, broadcast so the counting functions can read them via `.value`.
positiveWords = sc.broadcast({'good'})
negativeWords = sc.broadcast({'kill'})

# Flatten each row into one (date, sentence) pair per sentence, spread over 4 partitions.
wordsRDD = sc.parallelize(sentences, 4).flatMap(
  lambda row: [(row[0], text) for text in row[1:]]
)

# Binary label per date: 1 when the return is strictly positive, otherwise 0.
labelledReturns = [(date, int(dailyReturn > 0)) for date, dailyReturn in stockRetures.items()]
stockReturesRDD = sc.parallelize(labelledReturns, 4)

In [3]:
# Inspect the flattened (date, sentence) pairs; collect() returns every record to the driver.
wordsRDD.collect()

In [4]:
def cleanSentence(string):
  """Remove punctuation from a sentence and normalise its whitespace.

  Each ASCII punctuation character is replaced by a space (not deleted), so
  a token such as "good!" or "kill," becomes a bare word that can match a
  lexicon entry, and "good,bad" still separates into two words. Runs of
  whitespace are then collapsed to single spaces and the ends are trimmed,
  so splitting the result on " " yields no empty tokens.

  Stop-word removal is not implemented: no stop-word list is defined here.
  TODO: add it once a list is chosen.

  Args:
    string: the raw sentence text.

  Returns:
    The sentence with punctuation removed and words separated by single
    spaces. An empty or punctuation-only sentence yields "".
  """
  # Imported under an alias because the parameter name shadows the module name.
  import string as _stringModule

  punctuationToSpace = str.maketrans(
    _stringModule.punctuation, " " * len(_stringModule.punctuation)
  )
  return " ".join(string.translate(punctuationToSpace).split())

def splitSentence(string):
  """Split a sentence into a list of word tokens.

  Splits on any run of whitespace (spaces, tabs, newlines) and ignores
  leading/trailing whitespace. Unlike splitting on a single " ", repeated
  spaces never produce empty-string tokens, and an empty or blank sentence
  yields an empty list instead of [''].

  Args:
    string: the sentence text.

  Returns:
    A list of non-empty tokens in their original order.
  """
  # str.split() with no separator already strips and collapses whitespace.
  return string.split()

def countPosWords(string):
  """Count the items of `string` that appear in the positive-word lexicon.

  Args:
    string: an iterable of tokens (in this notebook, the list produced by
      splitSentence). Each occurrence is counted, so duplicates add up.

  Returns:
    The number of items found in `positiveWords.value`.
  """
  return sum(1 for token in string if token in positiveWords.value)

def countNegWords(string):
  """Count the items of `string` that appear in the negative-word lexicon.

  Args:
    string: an iterable of tokens (in this notebook, the list produced by
      splitSentence). Each occurrence is counted, so duplicates add up.

  Returns:
    The number of items found in `negativeWords.value`.
  """
  return sum(1 for token in string if token in negativeWords.value)

# Clean and tokenise every (date, sentence) pair once; the tokenised RDD is
# persisted because both counts below are derived from it.
cleanedRDD = wordsRDD.mapValues(cleanSentence)
wordsSplitRDD = cleanedRDD.mapValues(splitSentence).persist()

# Per-sentence lexicon hit counts, still keyed by date.
posCountRDD = wordsSplitRDD.mapValues(countPosWords)
negCountRDD = wordsSplitRDD.mapValues(countNegWords)

In [5]:
def _sumCounts(left, right):
  """Add two per-sentence counts together."""
  return left + right

# Collapse the per-sentence counts into one total per date.
posCountEachDay = posCountRDD.reduceByKey(_sumCounts)
negCountEachDay = negCountRDD.reduceByKey(_sumCounts)

In [6]:
# Inner-join label, positive total and negative total on date.
# After the two joins each record is (date, ((label, pos), neg)).
joinedByDate = stockReturesRDD.join(posCountEachDay).join(negCountEachDay)

# Flatten the nested value into a plain 4-tuple: (date, label, pos, neg).
allData = joinedByDate.map(
  lambda record: (record[0], record[1][0][0], record[1][0][1], record[1][1])
)

In [7]:
# Inspect the joined rows: (date, label, positive count, negative count).
allData.collect()

In [8]:
# Name the four tuple positions so the RDD can become a DataFrame.
columnNames = ['date', 'PE', 'pos', 'neg']
addDF = allData.toDF(columnNames)

# Pack the two count columns into the single vector column the regressor reads.
featureColumns = ["pos", "neg"]
vectorize = VectorAssembler(outputCol="features", inputCols=featureColumns)

In [9]:
# Linear regression on the "PE" label, writing predictions to "Predicted_PE".
lr = LinearRegression(
  predictionCol="Predicted_PE",
  labelCol="PE",
  maxIter=100,
  regParam=0.1,
)

# Assemble the feature vector, then fit the regressor, as one pipeline.
lrPipeline = Pipeline(stages=[vectorize, lr])
lrModel = lrPipeline.fit(addDF)

In [10]:
# Apply the fitted pipeline to the same DataFrame it was trained on and render the result.
# NOTE(review): `display` is not imported in this notebook — presumably supplied by the notebook runtime; confirm.
display(lrModel.transform(addDF))

date,PE,pos,neg,features,Predicted_PE
2012-01-01,1,2,1,"List(1, 2, List(), List(2.0, 1.0))",0.5
2012-01-02,0,2,1,"List(1, 2, List(), List(2.0, 1.0))",0.5
