In [51]:
from pandas import DataFrame as PandasDataFrame
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import SparkSession
from nltk.sentiment import SentimentIntensityAnalyzer
import json


# BTC-USD price history CSV; run_pipeline() passes this to read_stock_price_data().
PRICE_FILE_PATH = 'BTC-USD_2017-2022.csv'
# run_pipeline() passes this to read_tweets_data(), which parses the file as a
# ';'-delimited raw-tweets CSV (id, user, ..., text).
# NOTE(review): the file name suggests already-aggregated sentiment rather than
# raw tweets — confirm this is the intended input.
TWEETS_FILE_PATH = 'aggregated_tweet_sentiment.csv'


def create_spark_session() -> SparkSession:
    """Return a local, Hive-enabled Spark session with app name 'test'.

    The session is obtained through ``getOrCreate``, so calling this more than
    once hands back the session that is already running.
    """

    builder = SparkSession.builder
    builder = builder.appName('test')
    builder = builder.master("local[*]")
    builder = builder.enableHiveSupport()
    return builder.getOrCreate()


def read_stock_price_data(spark: SparkSession, file_path: str) -> SparkDataFrame:
    """Read a stock price CSV file exported from Yahoo Finance.

    Args:
        spark: Session used to read the file.
        file_path: Location of the CSV (header row expected).

    Returns:
        A Spark DataFrame with the columns TradeDate, Open, High, Low, Close,
        Adj_Close, Volume and a constant ``Ticker`` column set to "BTC".
    """

    schema = StructType([
        StructField("Date", TimestampType(), True),
        StructField("Open", DecimalType(10,2), True),
        StructField("High", DecimalType(10,2), True),
        StructField("Low", DecimalType(10,2), True),
        StructField("Close", DecimalType(10,2), True),
        StructField("Adj Close", DecimalType(10,2), True),
        StructField("Volume", LongType(), True),
    ])
    # Read the path the caller asked for; the previous version ignored
    # `file_path` and always loaded the module-level PRICE_FILE_PATH.
    df = (spark.read.csv(file_path, header=True, schema=schema)
        # Column names with spaces are awkward in Spark SQL, so normalise them.
        .withColumnRenamed("Adj Close", "Adj_Close")
        .withColumnRenamed("Date", "TradeDate")
        # The indicator functions partition their windows by Ticker.
        .withColumn("Ticker", F.lit("BTC"))
        )

    return df


def read_tweets_data(spark: SparkSession, file_path: str) -> SparkDataFrame:
    """Load the semicolon-delimited tweets CSV and keep only rows with text.

    The file is read with an explicit schema and a header row, the
    ``timestamp`` column is renamed to ``tweet_timestamp``, and rows whose
    ``text`` is null are filtered out.
    """

    column_types = [
        ("id", StringType()),
        ("user", StringType()),
        ("fullname", StringType()),
        ("url", StringType()),
        ("timestamp", TimestampType()),
        ("replies", StringType()),
        ("likes", LongType()),
        ("retweets", LongType()),
        ("text", StringType()),
    ]
    tweet_schema = StructType(
        [StructField(col_name, col_type, True) for col_name, col_type in column_types]
    )

    reader = spark.read.option('delimiter', ';')
    raw_tweets = reader.csv(file_path, header=True, schema=tweet_schema)

    # TODO: Remove Limit on Dataframe when running on cluster
    # (the `.limit(50000)` sampling is currently disabled, so every row is used)
    renamed_tweets = raw_tweets.withColumnRenamed("timestamp", "tweet_timestamp")
    return renamed_tweets.where('text IS NOT NULL')


def add_price_to_SMA_ratio(df: SparkDataFrame,
                           window: int = 7) -> SparkDataFrame:
    """Add the price-to-simple-moving-average ratio.

    Args:
        df: Price data with ``Ticker``, ``TradeDate``, ``Close`` and
            ``Adj_Close`` columns.
        window: Look-back, in days, of the moving-average frame. The frame is
            ``RANGE BETWEEN INTERVAL {window} DAYS PRECEDING AND CURRENT ROW``,
            which is inclusive at both ends (the current row plus everything
            up to ``window`` days earlier).

    Returns:
        ``df`` plus two columns: ``rolling_avg`` (average ``Adj_Close`` over
        the frame) and ``price_to_SMA_ratio`` (``Close / rolling_avg - 1``).
    """

    # Resolve the session locally instead of relying on a notebook-global
    # `spark`: no module-level `spark` is defined (it is a local variable of
    # run_pipeline), so the bare name raised NameError on a fresh kernel.
    # getOrCreate() hands back the session that is already running.
    spark = SparkSession.builder.getOrCreate()

    df.createOrReplaceTempView('add_price_to_SMA_ratio')
    df_transformed = spark.sql(f"""
        WITH cte1 AS
            (SELECT
            *,
            AVG(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {window} DAYS PRECEDING AND CURRENT ROW) AS rolling_avg
            FROM add_price_to_SMA_ratio)

        SELECT
        *,
        ((Close / rolling_avg) - 1) AS price_to_SMA_ratio
        FROM cte1
        """)
    
    return df_transformed


def add_bollinger_bands(df: SparkDataFrame,
                        bollinger_window: int = 20,
                        bollinger_stdvs: int = 2) -> SparkDataFrame:
    """Add Bollinger Bands to dataframe.

    Args:
        df: Price data with ``Ticker``, ``TradeDate`` and ``Adj_Close``.
        bollinger_window: Look-back, in days, of the rolling frame
            (``INTERVAL n DAYS PRECEDING`` through the current row, inclusive).
        bollinger_stdvs: Number of rolling standard deviations between the
            rolling average and each band.

    Returns:
        ``df`` plus ``bollinger_band_lower`` and ``bollinger_band_upper``.
        The intermediate rolling average and standard deviation are dropped.
    """

    # Resolve the session locally instead of relying on a notebook-global
    # `spark`: no module-level `spark` is defined (it is a local variable of
    # run_pipeline), so the bare name raised NameError on a fresh kernel.
    spark = SparkSession.builder.getOrCreate()

    df.createOrReplaceTempView('add_bollinger_bands')
    df_transformed = spark.sql(f"""
        WITH cte1 AS
            (SELECT
            *,
            AVG(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {bollinger_window} DAYS PRECEDING AND CURRENT ROW) AS bollinger_rolling_avg,
            STDDEV(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {bollinger_window} DAYS PRECEDING AND CURRENT ROW) AS bollinger_rolling_std
            FROM add_bollinger_bands)

        SELECT
        *,
        (bollinger_rolling_avg - (bollinger_rolling_std * {bollinger_stdvs})) AS bollinger_band_lower,
        (bollinger_rolling_avg + (bollinger_rolling_std * {bollinger_stdvs})) AS bollinger_band_upper
        FROM cte1
        """)
    
    # Only the bands are part of the result; the helpers are implementation detail.
    return df_transformed.drop('bollinger_rolling_avg', 'bollinger_rolling_std')


def add_stochastic_oscillator(df: SparkDataFrame, so_window: int = 14, sma_window: int = 3) -> SparkDataFrame:
    """Add a smoothed stochastic oscillator column.

    The raw oscillator is
    ``(Adj_Close - window_min) / (window_max - window_min) * 100`` where the
    min/max are taken over ``Adj_Close`` in the preceding ``so_window`` days
    (inclusive of the current row). It is then averaged over the preceding
    ``sma_window`` days.

    Args:
        df: Price data with ``Ticker``, ``TradeDate`` and ``Adj_Close``.
        so_window: Look-back, in days, for the window min/max.
        sma_window: Look-back, in days, for smoothing the raw oscillator.

    Returns:
        ``df`` plus ``stochastic_oscillator_sma``. The window min/max and the
        raw (unsmoothed) oscillator are dropped.
    """

    # Resolve the session locally instead of relying on a notebook-global
    # `spark`: no module-level `spark` is defined (it is a local variable of
    # run_pipeline), so the bare name raised NameError on a fresh kernel.
    spark = SparkSession.builder.getOrCreate()

    df.createOrReplaceTempView('add_stochastic_oscillator')
    # Note: when the window max equals the window min (e.g. the very first
    # row) the raw oscillator divides by zero; the notebook output shows this
    # row as null rather than an error.
    df_transformed = spark.sql(f"""
        WITH cte1 AS
            (SELECT
            *,
            MAX(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {so_window} DAYS PRECEDING AND CURRENT ROW) AS max_window_price,
            MIN(Adj_Close) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {so_window} DAYS PRECEDING AND CURRENT ROW) AS min_window_price
            FROM add_stochastic_oscillator),
        cte2 AS (SELECT
        *,
        (Adj_Close - min_window_price)/(max_window_price - min_window_price) * 100.0 AS stochastic_oscillator
        FROM cte1)
        
        SELECT
        *,
        AVG(stochastic_oscillator) OVER(
                PARTITION BY Ticker
                ORDER BY TradeDate ASC
                RANGE BETWEEN INTERVAL {sma_window} DAYS PRECEDING AND CURRENT ROW) AS stochastic_oscillator_sma
        FROM
        cte2
        """)

    return df_transformed.drop('max_window_price', 'min_window_price', 'stochastic_oscillator')


def add_on_balance_volume(df: SparkDataFrame) -> SparkDataFrame:
    """
        Add On Balance Volume
        https://www.investopedia.com/terms/o/onbalancevolume.asp

        Each row's volume is signed by a ``multiplier`` (+1 when Adj_Close is
        above Open, -1 when below, 0 otherwise) and accumulated as a running
        total per Ticker ordered by TradeDate.

        NOTE(review): the direction here is derived from Adj_Close vs. the
        same row's Open, not from the previous row's close — confirm this
        matches the intended definition.

        Returns ``df`` plus the ``multiplier`` and ``on_balance_volume``
        columns.
    """

    # Resolve the session locally instead of relying on a notebook-global
    # `spark`: no module-level `spark` is defined (it is a local variable of
    # run_pipeline), so the bare name raised NameError on a fresh kernel.
    spark = SparkSession.builder.getOrCreate()

    df.createOrReplaceTempView('add_on_balance_volume')
    df_transformed = spark.sql("""
        WITH cte1 AS
        (SELECT *,
                CASE
                    WHEN (Adj_Close - Open) > 0 THEN 1
                    WHEN (Adj_Close - Open) < 0 THEN -1
                    ELSE 0 END AS multiplier
        FROM
            add_on_balance_volume)

        SELECT
            *,
            SUM(Volume * multiplier) OVER(PARTITION BY Ticker ORDER BY TradeDate ASC) AS on_balance_volume
        FROM
            cte1
        """)
    
    return df_transformed


def add_momentum(df: SparkDataFrame, window: int = 14) -> SparkDataFrame:
    """
        Add Momentum

        Momentum is ``window_end_price / window_start_price - 1`` where the
        two prices are the FIRST and LAST ``Adj_Close`` values in the frame
        covering the preceding ``window`` days through the current row
        (per Ticker, ordered by TradeDate).

        Returns ``df`` plus ``window_start_price``, ``window_end_price`` and
        ``momentum``. The two helper columns are intentionally left in the
        result (dropping them is currently disabled).
    """

    # Resolve the session locally instead of relying on a notebook-global
    # `spark`: no module-level `spark` is defined (it is a local variable of
    # run_pipeline), so the bare name raised NameError on a fresh kernel.
    spark = SparkSession.builder.getOrCreate()

    df.createOrReplaceTempView('add_momentum')
    df_transformed = spark.sql(f"""
        WITH cte1 AS
        (SELECT *,
                FIRST(Adj_Close) OVER(
                    PARTITION BY Ticker
                    ORDER BY TradeDate ASC
                    RANGE BETWEEN INTERVAL {window} DAYS PRECEDING AND CURRENT ROW) AS window_start_price,
                LAST(Adj_Close) OVER(
                    PARTITION BY Ticker
                    ORDER BY TradeDate ASC
                    RANGE BETWEEN INTERVAL {window} DAYS PRECEDING AND CURRENT ROW) AS window_end_price
        FROM
            add_momentum)

        SELECT
            *,
            (window_end_price/window_start_price) - 1 AS momentum
        FROM
            cte1
        """)
    
    return df_transformed #.drop('window_start_price').drop('window_end_price')


def add_MACD(df: PandasDataFrame, price_col: str = 'Adj_Close') -> PandasDataFrame:
    """Add a Moving Average Convergence Divergence column.

    ``MACD`` is the difference between the MACD line (12-span EWM minus
    26-span EWM of the price) and its 9-span EWM signal line.

    The exponentially weighted means are computed in row order, so ``df``
    must already be sorted chronologically.

    Args:
        df: Frame containing ``price_col``. It is modified in place (a
            ``MACD`` column is added) and also returned.
        price_col: Name of the price column to use.

    Returns:
        The same ``df`` object with the ``MACD`` column added.
    """

    # Prices coming from Spark DecimalType columns arrive as Python Decimal
    # objects (object dtype); convert once so the EWM runs on floats.
    price = df[price_col].astype(float)

    # Keep the intermediates as local Series. The previous version stored them
    # as columns and then called df.drop(...) without using the result, so
    # 'MACD_raw' and 'MACD_signal' leaked into the returned frame.
    macd_line = price.ewm(span=12).mean() - price.ewm(span=26).mean()
    signal_line = macd_line.ewm(span=9).mean()
    df['MACD'] = macd_line - signal_line

    return df


def run_pipeline():
    """Build the price + tweet-sentiment feature table and export it to CSV.

    Steps:
        1. Read the price CSV and add the Spark-computed indicators.
        2. Read the tweets CSV, score and aggregate sentiment.
        3. Left-join aggregated sentiment onto prices by date.
        4. Convert to pandas, sort chronologically, add MACD.
        5. Print the frame and write it to 'pipeline_export.csv'.

    Returns:
        The exported pandas DataFrame (previously nothing was returned, which
        left the result unreachable from later notebook cells).
    """
    spark = create_spark_session()
    df_price_raw = read_stock_price_data(spark, PRICE_FILE_PATH)
    
    # transform price data to add indicators
    df_price = add_price_to_SMA_ratio(df_price_raw)
    df_price = add_bollinger_bands(df_price)
    df_price = add_stochastic_oscillator(df_price)
    df_price = add_on_balance_volume(df_price)
    df_price = add_momentum(df_price)
    
    df_tweets = read_tweets_data(spark, TWEETS_FILE_PATH)
    # NOTE(review): analyze_tweet_sentiment and aggregate_tweet_sentiment are
    # not defined in the cells visible here — they must already exist in the
    # kernel (or another cell) before this function is called.
    df_sentiment = analyze_tweet_sentiment(df_tweets)
    df_sentiment.printSchema()
    df_agg_sentiment = aggregate_tweet_sentiment(df_sentiment)
    
    df_joined = df_price.join(df_agg_sentiment, df_price.TradeDate == df_agg_sentiment.tweet_date, 'left')

    # Sort BEFORE computing MACD: the exponentially weighted means in add_MACD
    # run in row order, and the row order of a collected Spark join is not
    # chronological. Previously MACD was computed first and sorted afterwards.
    pandas_df = df_joined.toPandas().sort_values(by=['TradeDate'])
    pandas_df = add_MACD(pandas_df)
    print(pandas_df)
    pandas_df.to_csv('pipeline_export.csv')

    return pandas_df


# Run the whole pipeline when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    run_pipeline()

                                                                                

16892868
root
 |-- id: string (nullable = true)
 |-- user: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- url: string (nullable = true)
 |-- tweet_timestamp: timestamp (nullable = true)
 |-- replies: string (nullable = true)
 |-- likes: long (nullable = true)
 |-- retweets: long (nullable = true)
 |-- text: string (nullable = true)
 |-- tweets_analyzed: string (nullable = true)
 |-- negative: decimal(4,3) (nullable = true)
 |-- neutral: decimal(4,3) (nullable = true)
 |-- positive: decimal(4,3) (nullable = true)
 |-- compound: decimal(4,3) (nullable = true)



                                                                                

In [41]:
from IPython.display import display

# NOTE(review): `pandas_df` is not defined at notebook level in the visible
# cells (inside run_pipeline it is a local variable), so this cell depends on
# kernel state created elsewhere — confirm where it comes from.
# display(pandas_df[['stochastic_oscillator_sma']])
# Per-row difference between the adjusted close and the open; the column is
# added to `pandas_df` in place.
pandas_df['delta_price'] = pandas_df['Adj_Close'] - pandas_df['Open']
display(pandas_df)


Unnamed: 0,TradeDate,Open,High,Low,Close,Adj_Close,Volume,Ticker,rolling_avg,price_to_SMA_ratio,...,stochastic_oscillator,stochastic_oscillator_sma,tweet_date,pct_positive,pct_negative,pct_neutral,avg_compound_sentiment,tweet_volume,OBV,delta_price
0,2019-05-08,5849.48,5989.98,5794.72,5982.46,5982.46,15320605300,BTC,5732.740000,0.04356032194029382,...,100.000000000000000,6.368445,2019-05-08,0.025641,0.000000,0.974359,0.1641795,39,132.98,132.98
1,2018-09-01,7044.81,7242.29,7038.05,7193.25,7193.25,4116050000,BTC,6963.448750,0.03300106861560516,...,100.000000000000000,6.149448,2018-09-01,0.000000,0.000000,1.000000,0.4150000,1,148.44,148.44
2,2019-05-27,8674.07,8907.17,8668.71,8805.78,8805.78,27949839564,BTC,8127.808750,0.08341377988255445,...,100.000000000000000,7.019107,2019-05-27,0.062903,0.012903,0.919355,0.2633581,620,131.71,131.71
3,2019-04-28,5271.75,5326.23,5255.68,5285.14,5285.14,12819992056,BTC,5349.303750,-0.01199478530266673,...,43.152894606630000,6.599560,2019-04-28,0.000000,0.000000,1.000000,0.5860000,1,13.39,13.39
4,2019-03-17,4047.72,4054.12,4006.41,4025.23,4025.23,8221625400,BTC,3953.993750,0.01801627784565921,...,91.816693944354000,8.857481,2019-03-17,0.000000,0.000000,1.000000,0.8360000,1,-22.49,-22.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,2019-03-31,4105.46,4113.02,4094.10,4105.40,4105.40,9045122443,BTC,4054.616250,0.01252492143985266,...,99.122501566961000,5.500496,2019-03-31,0.000000,0.000000,1.000000,0.9440000,1,-0.06,-0.06
62,2019-04-12,5061.20,5103.27,4955.85,5089.54,5089.54,13675206312,BTC,5158.588750,-0.01338520152435102,...,80.833972173743000,10.014781,2019-04-12,0.500000,0.000000,0.500000,0.6145000,2,28.34,28.34
63,2019-05-16,8194.50,8320.82,7729.61,7884.91,7884.91,33167197581,BTC,7328.742500,0.07588853067221287,...,88.138035253288000,5.930982,2019-05-16,0.000000,0.000000,1.000000,0.4356667,3,-309.59,-309.59
64,2019-04-02,4156.92,4905.95,4155.32,4879.88,4879.88,21315047816,BTC,4186.218750,0.16570114736598511,...,100.000000000000000,0.438749,2019-04-02,0.000000,0.000000,1.000000,0.2010000,1,722.96,722.96


In [52]:
# Inspect the schema Spark assigns to the aggregated sentiment export.
# No schema / inferSchema is given, so every column is read as a string.
#
# Use the notebook's session factory and the configured relative path instead
# of a kernel-global `spark` and a machine-specific absolute path.
# Assumes the notebook's working directory is the project folder that holds
# the data files (the same assumption the pipeline's relative paths make).
spark = create_spark_session()
df = spark.read.csv(TWEETS_FILE_PATH, header=True)
df.schema

StructType(List(StructField(tweet_date,StringType,true),StructField(pct_positive,StringType,true),StructField(pct_negative,StringType,true),StructField(pct_neutral,StringType,true),StructField(avg_compound_sentiment,StringType,true),StructField(tweet_volume,StringType,true)))

In [9]:
# Quick sanity check of the sentiment analyzer on a single English sentence.
# NOTE(review): this import duplicates the one in the first cell.
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' keys
# (see the cell output).
sia.polarity_scores("this is awesome!")
# Disabled probes with non-English tweet text (kept verbatim as sample inputs):
# print(sia.polarity_scores("【毎日プレゼント企画】"))
# print(sia.polarity_scores('È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT BITCOIN EN 2019 https://t.co/yCsQMvRnyS'))

{'neg': 0.0, 'neu': 0.313, 'pos': 0.687, 'compound': 0.6588}

In [5]:
# Price to SMA Ratio
# Bollinger Bands
# Stochastic Oscillator
# MACD (Moving Average Convergence Divergence)
# On balance volume
# Momentum


# RSI (Relative Strength Index)

In [25]:
# Debug view of the stochastic-oscillator intermediates (up to 1000 rows).
# NOTE(review): `df_price` is not defined at notebook level in the visible
# cells, and add_stochastic_oscillator as written drops max_window_price,
# min_window_price and stochastic_oscillator — this cell appears to rely on an
# earlier version of that function / leftover kernel state. Confirm.
df_price.select('TradeDate', 'Adj_Close', 'max_window_price', 'min_window_price', 'stochastic_oscillator').show(1000)

+-------------------+---------+----------------+----------------+---------------------+
|          TradeDate|Adj_Close|max_window_price|min_window_price|stochastic_oscillator|
+-------------------+---------+----------------+----------------+---------------------+
|2017-03-21 00:00:00|  1120.54|         1120.54|         1120.54|                 null|
|2017-03-22 00:00:00|  1049.14|         1120.54|         1049.14|                0E-15|
|2017-03-23 00:00:00|  1038.59|         1120.54|         1038.59|                0E-15|
|2017-03-24 00:00:00|   937.52|         1120.54|          937.52|                0E-15|
|2017-03-25 00:00:00|   972.78|         1120.54|          937.52|   19.265654026882000|
|2017-03-26 00:00:00|   966.72|         1120.54|          937.52|   15.954540487378000|
|2017-03-27 00:00:00|  1045.77|         1120.54|          937.52|   59.146541361600000|
|2017-03-28 00:00:00|  1047.15|         1120.54|          937.52|   59.900557316140000|
|2017-03-29 00:00:00|  1039.97| 

In [28]:
# Manual spot-check of the oscillator ratio (price - window min) / (window max - window min);
# 937.52 matches min_window_price in the table printed by the previous cell.
# NOTE(review): the rows holding 1133.25 and 1143.81 are cut off in the saved output — confirm the source row.
(1133.25 - 937.52)/(1143.81 - 937.52)

0.9488099277715839

In [54]:
# Scratch check: a dict containing None survives a json.dumps / json.loads
# round trip unchanged (see the cell output).
import json

d = {'a': None, 'b': 1}
json_str = json.dumps(d)
json.loads(json_str)

{'a': None, 'b': 1}