In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import length, col
from pyspark import SparkContext
# Reuse an already-running SparkContext when this cell is executed again;
# calling SparkContext() a second time in the same kernel raises ValueError.
sc = SparkContext.getOrCreate()
sql = SQLContext(sc)

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import lower
from pyspark.sql.functions import regexp_replace
from nltk.tokenize import word_tokenize
from pyspark.sql import Row
from pyspark.ml.feature import StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer
import pyspark.sql.functions as F
from pyspark.sql.functions import sum as _sum
from pyspark.sql.functions import to_timestamp
from pyspark import StorageLevel

# Data importing

In [3]:
# Read through the SQLContext created in the first cell: no `spark` session
# object is defined anywhere in this notebook, so `spark.read` fails on a
# fresh kernel. The raw string keeps the backslash in the Windows path literal
# without relying on the invalid "\s" escape sequence.
news_data = sql.read.csv(
    r"file:///C:\spark/spark/bin/all-the-news-2-1.csv",
    inferSchema=True,
    header=True,
)

In [4]:
# Keep only the three columns used by the rest of the notebook.
news = news_data.select(["Date", "Title", "Article"])

# Data Cleaning

#### Regex & Lowering Data before selecting stock.

In [5]:
# Append a cleaned copy of each article: every character that is not an ASCII
# letter or whitespace is removed, then the text is lower-cased.
news_removed_punctuations = news.withColumn(
    'Regex Removed & Lower Cased Articles',
    lower(regexp_replace('Article', "[^a-zA-Z\\s]", "")),
)

## Getting stock news

In [6]:
def filterStock(stockName, df):
    """Keep the rows of ``df`` whose cleaned article text mentions ``stockName``.

    Parameters
    ----------
    stockName : str
        Lower-case keyword looked up in the
        'Regex Removed & Lower Cased Articles' column.
    df : DataFrame
        Frame that carries the 'Regex Removed & Lower Cased Articles' column.

    Returns
    -------
    The result of ``df.filter`` with the keyword condition. For "apple",
    articles that also contain "fruit" are dropped; for "amazon", articles
    that also contain "forest" are dropped.
    """
    # Word that disqualifies an article for the ambiguous company names.
    exclusions = {"apple": "fruit", "amazon": "forest"}

    mentions_stock = df['Regex Removed & Lower Cased Articles'].contains(stockName)
    unwanted = exclusions.get(stockName)
    if unwanted is None:
        return df.filter(mentions_stock)
    return df.filter(
        mentions_stock & ~df['Regex Removed & Lower Cased Articles'].contains(unwanted)
    )

In [7]:
# One filtered frame per stock, built with the same call for each keyword.
AAPL_news, AMZN_news, NFLX_news = (
    filterStock(keyword, news_removed_punctuations)
    for keyword in ("apple", "amazon", "netflix")
)

In [8]:
# Report how many articles matched each stock keyword (one count() per frame).
print(
    "Number of NFLX news: {}\nNumber of AMZN news: {}\nNumber of AAPL news: {}".format(
        *(articles.count() for articles in (NFLX_news, AMZN_news, AAPL_news))
    )
)

Number of NFLX news: 35213
Number of AMZN news: 66259
Number of AAPL news: 86935


In [9]:
AAPL_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|
+-------------------+--------------------+--------------------+------------------------------------+
|2017-09-26 09:00:02|Where the softwar...|Software companie...|                software companie...|
+-------------------+--------------------+--------------------+------------------------------------+
only showing top 1 row



In [10]:
AMZN_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|
+-------------------+--------------------+--------------------+------------------------------------+
|2018-11-24 00:00:00|On Black Friday, ...|NEW YORK (Reuters...|                new york reuters ...|
+-------------------+--------------------+--------------------+------------------------------------+
only showing top 1 row



In [11]:
NFLX_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|
+-------------------+--------------------+--------------------+------------------------------------+
|2017-11-30 20:12:02|Forget Facebook, ...|"Facebook, Amazon...|                facebook amazon n...|
+-------------------+--------------------+--------------------+------------------------------------+
only showing top 1 row



### Tokenizing Articles

In [12]:
def tokenize(df):
    """Return ``df`` transformed by a ``Tokenizer``.

    The tokenizer reads the 'Regex Removed & Lower Cased Articles' column and
    writes its output to a 'Tokenized Articles' column.
    """
    return Tokenizer(
        inputCol="Regex Removed & Lower Cased Articles",
        outputCol="Tokenized Articles",
    ).transform(df)

In [13]:
# NOTE(review): the input names are rebound to the tokenized frames, so running
# this cell twice feeds already-tokenized frames back into tokenize();
# presumably that fails because the output column already exists - TODO confirm.
AAPL_news, AMZN_news, NFLX_news = (
    tokenize(articles) for articles in (AAPL_news, AMZN_news, NFLX_news)
)

In [14]:
AAPL_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+--------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|  Tokenized Articles|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+
|2017-09-26 09:00:02|Where the softwar...|Software companie...|                software companie...|[software, compan...|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+
only showing top 1 row



In [15]:
AMZN_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+--------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|  Tokenized Articles|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+
|2018-11-24 00:00:00|On Black Friday, ...|NEW YORK (Reuters...|                new york reuters ...|[new, york, reute...|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+
only showing top 1 row



In [16]:
NFLX_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+--------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|  Tokenized Articles|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+
|2017-11-30 20:12:02|Forget Facebook, ...|"Facebook, Amazon...|                facebook amazon n...|[facebook, amazon...|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+
only showing top 1 row



### Removing stop words

In [17]:
def stop_word_remove(df):
    """Return ``df`` transformed by a ``StopWordsRemover``.

    The remover reads the 'Tokenized Articles' column and writes its output to
    an 'Articles without stop words' column.
    """
    return StopWordsRemover(
        inputCol="Tokenized Articles",
        outputCol="Articles without stop words",
    ).transform(df)

In [18]:
# NOTE(review): the input names are rebound to the filtered frames, so running
# this cell twice passes frames that already carry the output column back into
# stop_word_remove(); presumably that fails - TODO confirm.
AAPL_news, AMZN_news, NFLX_news = (
    stop_word_remove(articles) for articles in (AAPL_news, AMZN_news, NFLX_news)
)

In [19]:
AAPL_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|  Tokenized Articles|Articles without stop words|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
|2017-09-26 09:00:02|Where the softwar...|Software companie...|                software companie...|[software, compan...|       [software, compan...|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
only showing top 1 row



In [20]:
AMZN_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|  Tokenized Articles|Articles without stop words|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
|2018-11-24 00:00:00|On Black Friday, ...|NEW YORK (Reuters...|                new york reuters ...|[new, york, reute...|       [new, york, reute...|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
only showing top 1 row



In [21]:
NFLX_news.show(1)

+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
|               Date|               Title|             Article|Regex Removed & Lower Cased Articles|  Tokenized Articles|Articles without stop words|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
|2017-11-30 20:12:02|Forget Facebook, ...|"Facebook, Amazon...|                facebook amazon n...|[facebook, amazon...|       [facebook, amazon...|
+-------------------+--------------------+--------------------+------------------------------------+--------------------+---------------------------+
only showing top 1 row



# Count Vector

## Join tokenized articles back together to feed them to Sentiment Analysis

In [22]:
# Reduce every frame to an RDD of (Date, tokens) pairs. From here on these
# names refer to RDDs rather than DataFrames.
AAPL_news, AMZN_news, NFLX_news = (
    articles.select("Date", "Articles without stop words")
    .rdd.map(lambda row: (row[0], row[1]))
    for articles in (AAPL_news, AMZN_news, NFLX_news)
)


# Sentiment Analysis Scores

In [23]:
#Function returns a sentiment score for each article
def sentScores(x):
    """Return the VADER compound polarity score for one tokenized article.

    Parameters
    ----------
    x : iterable of str
        Tokens of a single article; they are joined with single spaces
        before being scored.

    Returns
    -------
    The ``'compound'`` entry of the mapping returned by
    ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    # Kept as a function-local import, as in the original cell, so the name is
    # resolved wherever this function body actually runs.
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    # NOTE: a new analyzer is constructed on every call.
    analyzer = SentimentIntensityAnalyzer()
    article_text = " ".join(x)
    return analyzer.polarity_scores(article_text)['compound']


In [24]:
# Score every article: each (Date, tokens) pair becomes (Date, sentiment score).
AAPL_sentimentRDD, AMZN_sentimentRDD, NFLX_sentimentRDD = (
    pairs.mapValues(sentScores) for pairs in (AAPL_news, AMZN_news, NFLX_news)
)

In [25]:
#defining a function to convert back to dataframe
def createDf(x):
    """Map each element of the sequence ``x`` to a dict entry keyed by its index.

    The keys are the positions rendered as strings ("0", "1", ...), so the
    result can be expanded into keyword arguments.
    """
    return {str(position): x[position] for position in range(len(x))}

In [28]:
# Turn each (Date, score) pair into a Row with positional field names "0" and
# "1", then into a DataFrame.
AAPL_df, AMZN_df, NFLX_df = (
    scored.map(lambda pair: Row(**createDf(pair))).toDF()
    for scored in (AAPL_sentimentRDD, AMZN_sentimentRDD, NFLX_sentimentRDD)
)

In [27]:
AAPL_df.show(5)

+-------------------+-------+
|                  0|      1|
+-------------------+-------+
|2017-09-26 09:00:02| 0.9923|
|2018-12-27 00:00:00|-0.8905|
|2019-03-24 00:00:00|-0.9679|
|2016-03-09 00:00:00| -0.121|
|2018-08-31 00:00:00| 0.9914|
+-------------------+-------+
only showing top 5 rows



In [29]:
AMZN_df.show(5)

+-------------------+------+
|                  0|     1|
+-------------------+------+
|2018-11-24 00:00:00|0.9859|
|2017-11-30 20:12:02|0.9973|
|2018-08-31 00:00:00|0.9914|
|2016-08-23 00:00:00|0.9911|
|2019-01-09 19:31:30|   1.0|
+-------------------+------+
only showing top 5 rows



In [30]:
NFLX_df.show(5)

+-------------------+------+
|                  0|     1|
+-------------------+------+
|2017-11-30 20:12:02|0.9973|
|2019-05-22 17:20:00|0.9979|
|2019-01-09 19:31:30|   1.0|
|2019-01-16 12:48:00|   0.0|
|2018-11-02 22:50:07|0.9897|
+-------------------+------+
only showing top 5 rows



In [33]:
# Build DataFrames from the (date, score) pairs, naming the two columns
# "Date" and "Sentiment_Score".
NFLX_sentiment_df, AMZN_sentiment_df, AAPL_sentiment_df = (
    sql.createDataFrame(scored, ["Date", "Sentiment_Score"])
    for scored in (NFLX_sentimentRDD, AMZN_sentimentRDD, AAPL_sentimentRDD)
)


In [34]:
NFLX_sentiment_df.show(5)

+-------------------+---------------+
|               Date|Sentiment_Score|
+-------------------+---------------+
|2017-11-30 20:12:02|         0.9973|
|2019-05-22 17:20:00|         0.9979|
|2019-01-09 19:31:30|            1.0|
|2019-01-16 12:48:00|            0.0|
|2018-11-02 22:50:07|         0.9897|
+-------------------+---------------+
only showing top 5 rows



In [35]:
#### Removing Time from Date Column
def fixdate(df):
    """Replace the 'Date' column of ``df`` with a date-only 'Date_1' column.

    'Date' is split on a single space and the first piece is stored as
    'Date_1'; the original 'Date' column is then dropped. The caller is
    expected to rename 'Date_1' if it wants the old column name back.
    """
    date_only = pyspark.sql.functions.split(df['Date'], ' ').getItem(0)
    return df.withColumn('Date_1', date_only).drop("Date")

In [40]:
# Strip the time of day from every frame and restore the "Date" column name.
NFLX_fixdate, AMZN_fixdate, AAPL_fixdate = (
    fixdate(scores).withColumnRenamed("Date_1", "Date")
    for scores in (NFLX_sentiment_df, AMZN_sentiment_df, AAPL_sentiment_df)
)


In [41]:
AMZN_fixdate.show(5)

+---------------+----------+
|Sentiment_Score|      Date|
+---------------+----------+
|         0.9859|2018-11-24|
|         0.9973|2017-11-30|
|         0.9914|2018-08-31|
|         0.9911|2016-08-23|
|            1.0|2019-01-09|
+---------------+----------+
only showing top 5 rows



In [42]:
# Group the articles by publication day and sum their sentiment scores per day.
NFLX_groupby, AMZN_groupby, AAPL_groupby = (
    daily.groupBy('Date').agg(_sum('Sentiment_Score').alias('SentimentScores_Summed'))
    for daily in (NFLX_fixdate, AMZN_fixdate, AAPL_fixdate)
)

In [43]:
NFLX_groupby.show(5)

+----------+----------------------+
|      Date|SentimentScores_Summed|
+----------+----------------------+
|2016-08-17|    13.309399999999998|
|2017-12-05|     4.066799999999999|
|2017-05-14|                3.9365|
|2019-08-08|    19.849600000000002|
|2019-08-22|    24.607599999999998|
+----------+----------------------+
only showing top 5 rows



In [44]:
# Convert the grouped DataFrames to RDDs of plain tuples for the next step.
NFLX_scores_rdd, AMZN_scores_rdd, AAPL_scores_rdd = (
    grouped.rdd.map(tuple) for grouped in (NFLX_groupby, AMZN_groupby, AAPL_groupby)
)

In [45]:
# Append " 00:00:00" to every date string so it can later be converted to a
# timestamp.
NFLX_scores_time, AMZN_scores_time, AAPL_scores_time = (
    daily.map(lambda pair: (pair[0] + " 00:00:00", pair[1]))
    for daily in (NFLX_scores_rdd, AMZN_scores_rdd, AAPL_scores_rdd)
)

In [46]:
NFLX_scores_time.take(5)

[('2016-08-17 00:00:00', 13.309399999999998),
 ('2017-12-05 00:00:00', 4.066799999999999),
 ('2017-05-14 00:00:00', 3.9365),
 ('2019-08-08 00:00:00', 19.849600000000002),
 ('2019-08-22 00:00:00', 24.607599999999998)]

In [47]:
# Convert every stock's (date string, summed score) pairs back to a DataFrame;
# the columns come out named "0" and "1".
NFLX_scores_time_df, AMZN_scores_time_df, AAPL_scores_time_df = (
    timed.map(lambda pair: Row(**createDf(pair))).toDF()
    for timed in (NFLX_scores_time, AMZN_scores_time, AAPL_scores_time)
)

In [48]:
NFLX_scores_time_df.show(5)

+-------------------+------------------+
|                  0|                 1|
+-------------------+------------------+
|2016-08-17 00:00:00|13.309399999999998|
|2017-12-05 00:00:00| 4.066799999999999|
|2017-05-14 00:00:00|            3.9365|
|2019-08-08 00:00:00|19.849600000000002|
|2019-08-22 00:00:00|24.607599999999998|
+-------------------+------------------+
only showing top 5 rows



In [49]:
# Give the positional columns "0" and "1" their descriptive names again.
NFLX_scores_time_df, AMZN_scores_time_df, AAPL_scores_time_df = (
    unnamed.withColumnRenamed("0", "Date").withColumnRenamed("1", "Sentiment_Score")
    for unnamed in (NFLX_scores_time_df, AMZN_scores_time_df, AAPL_scores_time_df)
)


In [50]:
NFLX_scores_time_df.show(5)

+-------------------+------------------+
|               Date|   Sentiment_Score|
+-------------------+------------------+
|2016-08-17 00:00:00|13.309399999999998|
|2017-12-05 00:00:00| 4.066799999999999|
|2017-05-14 00:00:00|            3.9365|
|2019-08-08 00:00:00|19.849600000000002|
|2019-08-22 00:00:00|24.607599999999998|
+-------------------+------------------+
only showing top 5 rows



In [51]:
# Parse the "Date" strings into timestamps using the "yyyy-MM-dd HH:mm:ss" pattern.
# NOTE(review): the AAPL output below shows a null Date; presumably a value that
# did not match the pattern - TODO confirm.
NFLX_scores_time_df, AMZN_scores_time_df, AAPL_scores_time_df = (
    scores.withColumn("Date", to_timestamp("Date", "yyyy-MM-dd HH:mm:ss"))
    for scores in (NFLX_scores_time_df, AMZN_scores_time_df, AAPL_scores_time_df)
)

In [53]:
NFLX_scores_time_df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Sentiment_Score: double (nullable = true)



In [52]:
NFLX_scores_time_df.show(5)

+-------------------+------------------+
|               Date|   Sentiment_Score|
+-------------------+------------------+
|2016-08-17 00:00:00|13.309399999999998|
|2017-12-05 00:00:00| 4.066799999999999|
|2017-05-14 00:00:00|            3.9365|
|2019-08-08 00:00:00|19.849600000000002|
|2019-08-22 00:00:00|24.607599999999998|
+-------------------+------------------+
only showing top 5 rows



In [54]:
AMZN_scores_time_df.show(5)

+-------------------+------------------+
|               Date|   Sentiment_Score|
+-------------------+------------------+
|2017-05-14 00:00:00|10.664800000000001|
|2016-08-17 00:00:00|           19.5519|
|2017-12-05 00:00:00|38.957800000000006|
|2019-08-08 00:00:00|48.419000000000004|
|2019-08-22 00:00:00|60.159099999999995|
+-------------------+------------------+
only showing top 5 rows



In [55]:
AAPL_scores_time_df.show(5)

+-------------------+------------------+
|               Date|   Sentiment_Score|
+-------------------+------------------+
|2016-08-17 00:00:00|31.851999999999997|
|2017-05-14 00:00:00|           12.6721|
|2017-12-05 00:00:00|           48.6196|
|               null|            0.5859|
|2019-08-08 00:00:00|40.425200000000004|
+-------------------+------------------+
only showing top 5 rows

