In [42]:
import findspark
findspark.init()
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import datetime
import time
import numpy as np
import pandas as pd
import talib
kafka_topic_name = "OHLC_DATA"
kafka_bootstrap_servers = 'localhost:9092'

In [2]:
spark = SparkSession \
    .builder \
    .appName("Structured Streaming ") \
    .master("local[*]") \
    .getOrCreate()


In [3]:
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
    .option("subscribe", kafka_topic_name) \
    .option("startingOffsets", "latest") \
    .load().selectExpr("CAST(value AS STRING)", "timestamp")


In [4]:
stock_data_schema = "timestamp STRING, open float, high Float, low Float, close Float"
df2 = df \
    .select(from_csv(col("value"), stock_data_schema)
            .alias("stock"))
df2 = df2.select("stock.*")
df2 = (df2.withColumn('timestamp',regexp_replace('timestamp',"'",""))
          .withColumn('timestamp',to_timestamp(col('timestamp'))))


df2.createOrReplaceTempView('ohlc_data')
all_data = spark.sql("SELECT * FROM ohlc_data")
stock_data_write_stream = all_data.writeStream \
    .trigger(processingTime='1 seconds') \
    .outputMode("append") \
    .option("truncate", "false") \
    .format("memory") \
    .queryName("ohlcData") \
    .start()


In [56]:
spark.sql('select * from ohlcData').show(200)

+-------------------+--------+--------+--------+--------+
|          timestamp|    open|    high|     low|   close|
+-------------------+--------+--------+--------+--------+
|2022-01-06 12:48:00|37280.85| 37301.7|37278.95|37300.15|
|2022-01-06 12:49:00|37300.25| 37313.0|37298.25|37301.25|
|2022-01-06 12:50:00|37301.55|37315.25|37300.75|37304.15|
|2022-01-06 12:51:00|37302.05| 37311.2|37302.05|37308.45|
|2022-01-06 12:52:00| 37309.7| 37317.2| 37306.1| 37306.5|
|2022-01-06 12:53:00|37305.85| 37309.0|37294.75|37298.45|
|2022-01-06 12:54:00|37295.95|37318.45|37295.95| 37313.3|
|2022-01-06 12:55:00| 37316.6| 37322.4|37307.25| 37311.5|
|2022-01-06 12:56:00|37311.05| 37330.8|37311.05| 37330.8|
|2022-01-06 12:57:00|37330.85| 37338.6|37318.15|37318.15|
|2022-01-06 12:58:00| 37317.6|37326.35|37314.95| 37323.4|
|2022-01-06 12:59:00|37329.55| 37330.6| 37319.6| 37325.6|
|2022-01-06 13:00:00|37327.55| 37349.5|37327.55|37340.85|
|2022-01-06 13:01:00|37342.65| 37347.3| 37329.3|37329.65|
|2022-01-06 13

In [None]:
def rsi(df):
    n=14
    "function to calculate RSI"
    delta = df["close"].diff().dropna()
    u = delta * 0
    d = u.copy()
    u[delta > 0] = delta[delta > 0]
    d[delta < 0] = -delta[delta < 0]
    u[u.index[n-1]] = np.mean( u[:n]) # first value is average of gains
    u = u.drop(u.index[:(n-1)])
    d[d.index[n-1]] = np.mean( d[:n]) # first value is average of losses
    d = d.drop(d.index[:(n-1)])
    rs = u.ewm(com=n,min_periods=n).mean()/d.ewm(com=n,min_periods=n).mean()
    return 100 - 100 / (1+rs)

In [47]:
def rsi_tradingview(ohlc, period: int = 14, round_rsi: bool = True):
    

    delta = ohlc.diff()

    up = delta.copy()
    up[up < 0] = 0
    up = pd.Series.ewm(up, alpha=1/period).mean()

    down = delta.copy()
    down[down > 0] = 0
    down *= -1
    down = pd.Series.ewm(down, alpha=1/period).mean()

    rsi = np.where(up == 0, 0, np.where(down == 0, 100, 100 - (100 / (1 + up / down))))

    return pd.DataFrame(np.round(rsi, 2)) if round_rsi else rsi

In [58]:
initial_time = spark.sql('select * from ohlcData').take(1)[0]['timestamp']

In [63]:
 initial_time=initial_time+datetime.timedelta(hours=2)

In [None]:
initial_time

In [None]:
spark.sql(f"select * from ohlcData where timestamp >='{initial_time}' limit 15")

## RSi

In [None]:
while True:
    ohlc_data = spark.sql(f"select * from ohlcData where timestamp >='{initial_time}' limit 15")
    ohlc_data_pd = ohlc_data.toPandas().set_index('timestamp')
    if(len(ohlc_data_pd) < 15):
        #print('if continue')
        continue
    else:
        print('else run')
        print(rsi(ohlc_data_pd.iloc[:15]).tail(1).to_string())
        initial_time+= datetime.timedelta(minutes=1)


else run
timestamp
2022-01-06 15:06:00   NaN
else run
timestamp
2022-01-06 15:07:00   NaN
else run
timestamp
2022-01-06 15:08:00   NaN
else run
timestamp
2022-01-06 15:09:00   NaN
else run
timestamp
2022-01-06 15:10:00   NaN
else run
timestamp
2022-01-06 15:11:00   NaN
else run
timestamp
2022-01-06 15:12:00   NaN
else run
timestamp
2022-01-06 15:13:00   NaN
else run
timestamp
2022-01-06 15:14:00   NaN
else run
timestamp
2022-01-06 15:15:00   NaN
else run
timestamp
2022-01-06 15:16:00   NaN
else run
timestamp
2022-01-06 15:17:00   NaN
else run
Series([], )
else run
timestamp
2022-01-06 15:18:00   NaN
else run
Series([], )
else run
timestamp
2022-01-06 15:19:00   NaN
else run
Series([], )
else run
timestamp
2022-01-06 15:20:00   NaN
else run
Series([], )
else run
timestamp
2022-01-06 15:21:00   NaN
else run
Series([], )
else run
timestamp
2022-01-06 15:22:00   NaN
else run
Series([], )
else run
timestamp
2022-01-06 15:23:00   NaN
else run
Series([], )
else run
timestamp
2022-01-06 15:24:

## rsi tradingview

In [66]:
while True:
    ohlc_data = spark.sql(f"select * from ohlcData where timestamp >='{initial_time}' limit 15")
    ohlc_data_pd = ohlc_data.toPandas().set_index('timestamp')
    if(len(ohlc_data_pd) < 15):
        continue
    else:
        print(rsi_tradingview(ohlc_data_pd["close"].iloc[:15]).tail(1).to_string())
        initial_time+= datetime.timedelta(minutes=1)


        0
14  31.73
        0
14  26.34
       0
14  35.4
        0
14  31.94


KeyboardInterrupt: 

## Talib

In [35]:
while True:
    ohlc_data = spark.sql(f"select * from ohlcData where timestamp >='{initial_time}' limit 15")
    ohlc_data_pd = ohlc_data.toPandas().set_index('timestamp')
    if(len(ohlc_data_pd) < 15):
        continue
    else:
        print(talib.RSI(ohlc_data_pd["close"].iloc[:15]).tail(1).to_string())
        initial_time+= datetime.timedelta(minutes=1)


timestamp
2022-01-06 14:06:00    42.252775
timestamp
2022-01-06 14:07:00    32.679363
timestamp
2022-01-06 14:08:00    33.474786
timestamp
2022-01-06 14:09:00    48.571034
timestamp
2022-01-06 14:10:00    51.402688
timestamp
2022-01-06 14:11:00    61.316488
timestamp
2022-01-06 14:12:00    71.697003
timestamp
2022-01-06 14:13:00    73.230757
timestamp
2022-01-06 14:14:00    67.236921
timestamp
2022-01-06 14:15:00    68.295073
timestamp
2022-01-06 14:16:00    76.560113
timestamp
2022-01-06 14:17:00    71.881185
timestamp
2022-01-06 14:18:00    72.781651
timestamp
2022-01-06 14:19:00    75.924539
timestamp
2022-01-06 14:20:00    78.724057
timestamp
2022-01-06 14:21:00    85.088286
timestamp
2022-01-06 14:22:00    85.123766
timestamp
2022-01-06 14:23:00    83.869012
timestamp
2022-01-06 14:24:00    91.742479
timestamp
2022-01-06 14:25:00    92.63694
timestamp
2022-01-06 14:26:00    91.754821
timestamp
2022-01-06 14:27:00    86.208231
timestamp
2022-01-06 14:28:00    87.710876
timestamp
20

KeyboardInterrupt: 