In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Hadoop_Spark_Hive_Integration") \
    .config("spark.sql.catalogImplementation", "hive") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse") \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [2]:
df = spark.sql("SELECT Date, Open, High, Low, Close, Volume FROM crypto_db.crypto_prices WHERE coin='Bitcoin'")

In [3]:
df = df.orderBy("Date")

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, avg
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [6]:
window = Window.orderBy("Date")
df = df.withColumn("prev_close", lag("Close",1).over(window))
df = df.withColumn("label", (col("Close") > col("prev_close")).cast("int"))
df = df.dropna()

In [7]:
w7 = Window.orderBy("Date").rowsBetween(-6,0)
w30 = Window.orderBy("Date").rowsBetween(-29,0)
df = df.withColumn("pct_change", ((col("Close")-col("prev_close"))/col("prev_close"))*100)
df = df.withColumn("ma7", avg("Close").over(w7))
df = df.withColumn("ma30", avg("Close").over(w30))
df = df.dropna()

In [8]:
assembler = VectorAssembler(inputCols=["Open","High","Low","Volume","pct_change","ma7","ma30"], outputCol="features")
df_vec = assembler.transform(df).select("Date","features","label")

In [9]:
cnt = df_vec.count()
train_cnt = int(cnt*0.8)
train_df = df_vec.limit(train_cnt)
test_df = df_vec.subtract(train_df)

In [10]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, maxDepth=10)
model = rf.fit(train_df)
pred = model.transform(test_df)

In [11]:
eval_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_score = eval_f1.evaluate(pred)
print("F1 Score:", f1_score)

F1 Score: 0.9960886969979328


In [12]:
spark.stop()