In [13]:
from pandas import DataFrame as PandasDataFrame
from pyspark.sql.types import TimestampType, DecimalType, LongType, StructType, StructField, MapType, StringType, DoubleType
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import SparkSession
from nltk.sentiment import SentimentIntensityAnalyzer
import json


PRICE_FILE_PATH = '/Users/salmanmukhi/Downloads/BTC-USD_2017-2022.csv'
TWEETS_FILE_PATH = '/Users/salmanmukhi/Downloads/BTC_tweets.csv'


def create_spark_session():
    """Create Spark Session"""

    return (SparkSession
            .builder
            .appName('test')
            .master("local[*]")
            .enableHiveSupport()
            .getOrCreate()
           )


def read_stock_price_data(spark: SparkSession, file_path: str) -> SparkDataFrame:
    """Read stock price CSV file from Yahoo Finance"""

    schema = StructType([
        StructField("Date", TimestampType(), True),
        StructField("Open", DecimalType(10,2), True),
        StructField("High", DecimalType(10,2), True),
        StructField("Low", DecimalType(10,2), True),
        StructField("Close", DecimalType(10,2), True),
        StructField("Adj Close", DecimalType(10,2), True),
        StructField("Volume", LongType(), True),
    ])
    df = (spark.read.csv(PRICE_FILE_PATH, header=True, schema=schema)
        .withColumnRenamed("Adj Close", "Adj_Close")
        .withColumnRenamed("Date", "TradeDate")
        .withColumn("Ticker", F.lit("BTC"))
        )

    return df


def read_tweets_data(spark: SparkSession, file_path: str) -> SparkDataFrame:
    """Read raw tweets CSV file"""

    schema = StructType([
        StructField("id",StringType(),True),
        StructField("user",StringType(),True),
        StructField("fullname",StringType(),True),
        StructField("url",StringType(),True),
        StructField("timestamp",TimestampType(),True),
        StructField("replies",StringType(),True),
        StructField("likes",LongType(),True),
        StructField("retweets",LongType(),True),
        StructField("text",StringType(),True)]
    )
    tweets_df = (spark
        .read
        .option('delimiter', ';')
        .csv(file_path, header=True, schema=schema))
    
    # TODO: Remove Limit on Dataframe when running on cluster
    tweets_df_subset = tweets_df.limit(50000)

    return (tweets_df_subset
        .withColumnRenamed("timestamp", "tweet_timestamp")
        .where('text IS NOT NULL'))


def add_price_to_SMA_ratio(df: SparkDataFrame,
                           window: int = 7) -> SparkDataFrame:
    """Add price to simple moving average ratio"""

    df.createOrReplaceTempView('add_price_to_SMA_ratio')
    df_transformed = spark.sql(f"""
        WITH cte1 AS
            (SELECT
            *,
            AVG(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {window} DAYS PRECEDING AND CURRENT ROW) AS rolling_avg
            FROM add_price_to_SMA_ratio)

        SELECT
        *,
        ((Close / rolling_avg) - 1) AS price_to_SMA_ratio
        FROM cte1
        """)
    
    return df_transformed


def add_bollinger_bands(df: SparkDataFrame,
                        bollinger_window: int = 20,
                        bollinger_stdvs: int = 2) -> SparkDataFrame:
    """Add Bollinger Bands to dataframe"""

    df.createOrReplaceTempView('add_bollinger_bands')
    df_transformed = spark.sql(f"""
        WITH cte1 AS
            (SELECT
            *,
            AVG(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {bollinger_window} DAYS PRECEDING AND CURRENT ROW) AS bollinger_rolling_avg,
            STDDEV(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {bollinger_window} DAYS PRECEDING AND CURRENT ROW) AS bollinger_rolling_std
            FROM add_bollinger_bands)

        SELECT
        *,
        (bollinger_rolling_avg - (bollinger_rolling_std * {bollinger_stdvs})) AS bollinger_band_lower,
        (bollinger_rolling_avg + (bollinger_rolling_std * {bollinger_stdvs})) AS bollinger_band_upper
        FROM cte1
        """)
    
    return df_transformed.drop('bollinger_rolling_avg').drop('bollinger_rolling_std')


def add_stochastic_oscillator(df: SparkDataFrame, window: int = 14) -> SparkDataFrame:
    df.createOrReplaceTempView('add_stochastic_oscillator')
    df_transformed = spark.sql(f"""
        WITH cte1 AS
            (SELECT
            *,
            MAX(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {window} DAYS PRECEDING AND CURRENT ROW) AS max_window_price,
            MIN(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {window} DAYS PRECEDING AND CURRENT ROW) AS min_window_price
            FROM add_stochastic_oscillator)

        SELECT
        *,
        (Adj_Close - min_window_price)/(max_window_price - min_window_price) * 100.0 AS stochastic_oscillator
        FROM cte1
        """)

    return df_transformed.drop('max_window_price').drop('min_window_price')


def analyze_tweet_sentiment(tweets_df: SparkDataFrame) -> SparkDataFrame:
    """
        Use NLTK's pretrained SentimentIntensityAnalyzer to measure the sentiment
        of each tweet. Filter out tweets such as those in different languages that will default to a score
        of neutral=1.0.
    """
    
    sia = SentimentIntensityAnalyzer()

    # Wrap function in UDF
    def get_tweet_sentiment(tweet: str) -> str:
        sentiment_score = sia.polarity_scores(tweet)
        return json.dumps(sentiment_score)

    sentiment_udf = F.udf(get_tweet_sentiment, StringType())
    # sentiment_udf = F.udf(get_tweet_sentiment, MapType(StringType(), DoubleType()))
    return (tweets_df
        .withColumn('tweets_analyzed', sentiment_udf('text'))
        .select("*",
            F.get_json_object('tweets_analyzed', '$.neg').cast(DecimalType(4,3)).alias('negative'),
            F.get_json_object('tweets_analyzed', '$.neu').cast(DecimalType(4,3)).alias('neutral'),
            F.get_json_object('tweets_analyzed', '$.pos').cast(DecimalType(4,3)).alias('positive'),
            F.get_json_object('tweets_analyzed', '$.compound').cast(DecimalType(4,3)).alias('compound')
        )
        .where('neutral != 1.0')
    )


def aggregate_tweet_sentiment(df: SparkDataFrame) -> SparkDataFrame:
    """Aggregate sentiment data from individual tweets to a daily level of granularity"""

    df.createOrReplaceTempView('tweet_sentiment_unagg')
    df_agg = spark.sql("""
    WITH tweet_sentiment_labaled AS
        (SELECT
            CAST(tweet_timestamp AS Date) AS tweet_date,
            compound,
            CASE
                WHEN negative > positive
                    AND negative > neutral
                    THEN 'negative'
                WHEN positive > negative
                    AND positive > neutral
                    THEN 'positive'
                WHEN neutral > positive
                    AND neutral > negative
                    THEN 'neutral'
                END AS overall_sentiment
        FROM
            tweet_sentiment_unagg)
    
    SELECT
        tweet_date,
        AVG(CASE WHEN overall_sentiment = 'positive' THEN 1 ELSE 0 END) AS pct_positive,
        AVG(CASE WHEN overall_sentiment = 'negative' THEN 1 ELSE 0 END) AS pct_negative,
        AVG(CASE WHEN overall_sentiment = 'neutral' THEN 1 ELSE 0 END) AS pct_neutral,
        AVG(compound) AS avg_compound_sentiment,
        COUNT(*) AS tweet_volume
    FROM
        tweet_sentiment_labaled
    GROUP BY tweet_date
    """)
    
    return df_agg



if __name__ == "__main__":
    spark = create_spark_session()
    df_price_raw = read_stock_price_data(spark, PRICE_FILE_PATH)
    
    # transform price data to add indicators
    df_price = add_price_to_SMA_ratio(df_price_raw)
    df_price = add_bollinger_bands(df_price)
    df_price = add_stochastic_oscillator(df_price)
    
    df_tweets = read_tweets_data(spark, TWEETS_FILE_PATH)
    df_sentiment = analyze_tweet_sentiment(df_tweets)
    df_sentiment.printSchema()
    tweet_cols = ['tweet_timestamp',
                  'likes',
                  'retweets',
                  'negative',
                  'positive',
                  'neutral',
                  'compound']
    df_sentiment.select(tweet_cols).show()
    df_agg_sentiment = aggregate_tweet_sentiment(df_sentiment)
    df_agg_sentiment.show()
    
    df_joined = df_price.join(df_agg_sentiment, df_price.TradeDate == df_agg_sentiment.tweet_date, 'inner')
    df_joined.show()
    pandas_df = df_joined.toPandas()


root
 |-- id: string (nullable = true)
 |-- user: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- url: string (nullable = true)
 |-- tweet_timestamp: timestamp (nullable = true)
 |-- replies: string (nullable = true)
 |-- likes: long (nullable = true)
 |-- retweets: long (nullable = true)
 |-- text: string (nullable = true)
 |-- tweets_analyzed: string (nullable = true)
 |-- negative: decimal(4,3) (nullable = true)
 |-- neutral: decimal(4,3) (nullable = true)
 |-- positive: decimal(4,3) (nullable = true)
 |-- compound: decimal(4,3) (nullable = true)



                                                                                

+-------------------+-----+--------+--------+--------+-------+--------+
|    tweet_timestamp|likes|retweets|negative|positive|neutral|compound|
+-------------------+-----+--------+--------+--------+-------+--------+
|2019-05-27 06:49:18|    0|       0|   0.069|   0.000|  0.931|  -0.103|
|2019-05-27 06:49:23|    0|       0|   0.000|   0.217|  0.783|   0.361|
|2019-05-27 06:49:27|    0|       0|   0.085|   0.000|  0.915|  -0.103|
|2019-05-27 06:49:32|    0|       0|   0.000|   0.195|  0.805|   0.742|
|2019-05-27 06:49:19|   14|       2|   0.000|   0.139|  0.861|   0.440|
|2019-05-21 11:49:45|   81|      84|   0.000|   0.493|  0.507|   0.597|
|2019-05-10 09:06:01|    0|       0|   0.000|   1.000|  0.000|   0.296|
|2019-05-27 06:49:17|    1|       1|   0.108|   0.108|  0.785|   0.000|
|2019-05-27 06:49:38|    0|       0|   0.000|   0.138|  0.862|   0.340|
|2019-05-27 06:49:46|    0|       0|   0.059|   0.121|  0.819|   0.296|
|2019-05-27 06:49:47|    0|       0|   0.000|   0.194|  0.806|  

                                                                                

+----------+-------------------+--------------------+------------------+----------------------+------------+
|tweet_date|       pct_positive|        pct_negative|       pct_neutral|avg_compound_sentiment|tweet_volume|
+----------+-------------------+--------------------+------------------+----------------------+------------+
|2019-05-08|0.02564102564102564|                 0.0|0.9743589743589743|             0.1641795|          39|
|2018-09-01|                0.0|                 0.0|               1.0|             0.4150000|           1|
|2019-05-27|0.06290322580645161|0.012903225806451613|0.9193548387096774|             0.2633581|         620|
|2017-01-27|                0.0|                 0.0|               1.0|            -0.1280000|           1|
|2019-04-28|                0.0|                 0.0|               1.0|             0.5860000|           1|
|2019-03-17|                0.0|                 0.0|               1.0|             0.8360000|           1|
|2019-05-14|       

                                                                                

+-------------------+-------+-------+-------+-------+---------+-----------+------+-----------+--------------------+--------------------+--------------------+----------+-------------------+--------------------+------------------+----------------------+------------+
|          TradeDate|   Open|   High|    Low|  Close|Adj_Close|     Volume|Ticker|rolling_avg|  price_to_SMA_ratio|bollinger_band_lower|bollinger_band_upper|tweet_date|       pct_positive|        pct_negative|       pct_neutral|avg_compound_sentiment|tweet_volume|
+-------------------+-------+-------+-------+-------+---------+-----------+------+-----------+--------------------+--------------------+--------------------+----------+-------------------+--------------------+------------------+----------------------+------------+
|2019-05-08 00:00:00|5849.48|5989.98|5794.72|5982.46|  5982.46|15320605300|   BTC|5732.740000| 0.04356032194029382|   5005.605985593211|    5965.30068040679|2019-05-08|0.02564102564102564|                 

                                                                                

In [15]:
from IPython.display import display

display(pandas_df)

Unnamed: 0,TradeDate,Open,High,Low,Close,Adj_Close,Volume,Ticker,rolling_avg,price_to_SMA_ratio,bollinger_band_lower,bollinger_band_upper,tweet_date,pct_positive,pct_negative,pct_neutral,avg_compound_sentiment,tweet_volume
0,2019-05-08,5849.48,5989.98,5794.72,5982.46,5982.46,15320605300,BTC,5732.740000,0.04356032194029382,5005.605986,5965.300680,2019-05-08,0.025641,0.000000,0.974359,0.1641795,39
1,2018-09-01,7044.81,7242.29,7038.05,7193.25,7193.25,4116050000,BTC,6963.448750,0.03300106861560516,6003.083592,7245.498312,2018-09-01,0.000000,0.000000,1.000000,0.4150000,1
2,2019-05-27,8674.07,8907.17,8668.71,8805.78,8805.78,27949839564,BTC,8127.808750,0.08341377988255445,5849.246593,9224.769597,2019-05-27,0.062903,0.012903,0.919355,0.2633581,620
3,2019-04-28,5271.75,5326.23,5255.68,5285.14,5285.14,12819992056,BTC,5349.303750,-0.01199478530266673,5008.950478,5517.406664,2019-04-28,0.000000,0.000000,1.000000,0.5860000,1
4,2019-03-17,4047.72,4054.12,4006.41,4025.23,4025.23,8221625400,BTC,3953.993750,0.01801627784565921,3777.154274,4030.829536,2019-03-17,0.000000,0.000000,1.000000,0.8360000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,2019-03-31,4105.46,4113.02,4094.10,4105.40,4105.40,9045122443,BTC,4054.616250,0.01252492143985266,3883.463489,4154.402225,2019-03-31,0.000000,0.000000,1.000000,0.9440000,1
62,2019-04-12,5061.20,5103.27,4955.85,5089.54,5089.54,13675206312,BTC,5158.588750,-0.01338520152435102,3524.688860,5682.487330,2019-04-12,0.500000,0.000000,0.500000,0.6145000,2
63,2019-05-16,8194.50,8320.82,7729.61,7884.91,7884.91,33167197581,BTC,7328.742500,0.07588853067221287,4210.704395,8258.126081,2019-05-16,0.000000,0.000000,1.000000,0.4356667,3
64,2019-04-02,4156.92,4905.95,4155.32,4879.88,4879.88,21315047816,BTC,4186.218750,0.16570114736598511,3688.333258,4466.072456,2019-04-02,0.000000,0.000000,1.000000,0.2010000,1


In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

sia.polarity_scores("this is awesome!")
# print(sia.polarity_scores("【毎日プレゼント企画】"))
# print(sia.polarity_scores('È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT BITCOIN EN 2019 https://t.co/yCsQMvRnyS'))

{'neg': 0.0, 'neu': 0.313, 'pos': 0.687, 'compound': 0.6588}

In [5]:
# Price to SMA Ratio
# MACD (moving average converging divergence)
# Bollinger Bands
# RSI (Relative Strength Index)
# Stochastic Oscillator