In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled Spark session that targets the HDFS namenode
# at namenode:9000 and the Hive metastore at hive-metastore:9083.
spark = (
    SparkSession.builder
    .appName("Hadoop_Spark_Hive_Integration")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

# Sanity check of the catalog connection: list the databases, then the tables
# visible in the current database.
spark.sql("SHOW DATABASES").show()
spark.sql("SHOW TABLES").show()


+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  default|test_table|      false|
+---------+----------+-----------+



In [2]:
# Load every crawled CSV under data_crawl into a single DataFrame, using the
# first line of each file as the column names.
df = spark.read.csv(
    "hdfs://namenode:9000/user/root/data_crawl/*.csv",
    header=True,
)


In [3]:
from pyspark.sql.functions import to_date, input_file_name, regexp_extract

# Give the raw columns proper types: "Date" is parsed as yyyy-MM-dd, the five
# price/volume columns are cast to double, and every row is tagged with the
# path of the file it was read from (used afterwards to derive the coin name).
df_cleaned = df.select(
    to_date(df["Date"], "yyyy-MM-dd").alias("Date"),
    *[
        df[column_name].cast("double")
        for column_name in ("Open", "High", "Low", "Close", "Volume")
    ],
    input_file_name().alias("source_file"),
)


In [4]:
# Derive the coin name from the source path: capture group 1 is the run of
# non-"/" characters that sits directly in front of ".csv".
df_cleaned = df_cleaned.withColumn(
    "coin",
    regexp_extract(df_cleaned["source_file"], r"([^/]+)\.csv", 1),
)


In [5]:
# Discard rows where the date or any price/volume field is null (for example,
# values that could not be parsed or cast in the typing step).
df_cleaned = df_cleaned.dropna(
    subset=["Date", "Open", "High", "Low", "Close", "Volume"],
)


In [6]:
# Print the column names and types of the cleaned frame to confirm the casts
# and the derived "coin" column before anything is written to Hive.
df_cleaned.printSchema()


root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- source_file: string (nullable = false)
 |-- coin: string (nullable = false)



In [7]:
# Make sure the target database exists; IF NOT EXISTS keeps the cell safe to
# re-run.
spark.sql(
    "CREATE DATABASE IF NOT EXISTS crypto_db"
)


DataFrame[]

In [8]:
# DDL for the Parquet-backed price table. The column order declared here is
# the order the DataFrame must have when it is inserted into the table.
create_crypto_prices_sql = """
CREATE TABLE IF NOT EXISTS crypto_db.crypto_prices (
    Date DATE,
    Open DOUBLE,
    High DOUBLE,
    Low DOUBLE,
    Close DOUBLE,
    Volume DOUBLE,
    coin STRING
)
STORED AS PARQUET
"""
spark.sql(create_crypto_prices_sql)


DataFrame[]

In [9]:
# Keep only the table's columns, in the table's declared order. This drops
# "source_file" and matters because insertInto matches columns by position,
# not by name.
df_cleaned = df_cleaned.select(
    ["Date", "Open", "High", "Low", "Close", "Volume", "coin"]
)


In [10]:
# Replace the table's contents with the cleaned rows.
# The overwrite flag is passed to insertInto itself rather than via
# .mode("overwrite"): on PySpark 2.x, insertInto's default overwrite=False
# resets the writer mode to append, so re-running this cell would silently
# duplicate rows. overwrite=True gives the same result on 2.x and 3.x.
# insertInto resolves columns by position, so df_cleaned must already be in
# the table's column order.
df_cleaned.write.insertInto("crypto_db.crypto_prices", overwrite=True)


In [11]:
# Read back a small sample from the Hive table to confirm the write succeeded.
spark.table("crypto_db.crypto_prices").limit(10).show()


+----------+-----------------+------------------+-----------------+-----------------+---------+--------+
|      Date|             Open|              High|              Low|            Close|   Volume|    coin|
+----------+-----------------+------------------+-----------------+-----------------+---------+--------+
|2014-09-17| 5.08588981628418| 5.170770168304443|4.965950012207031|5.058549880981445|3071840.0|Litecoin|
|2014-09-18|5.065430164337158| 5.065430164337158|4.579959869384766|4.685229778289795|4569260.0|Litecoin|
|2014-09-19|4.687290191650391| 4.755819797515869|4.254350185394287|4.327770233154297|3917450.0|Litecoin|
|2014-09-20| 4.32919979095459| 4.616079807281494|4.202189922332764|4.286439895629883|5490660.0|Litecoin|
|2014-09-21|4.263070106506348|4.3001298904418945|4.154990196228027|4.245920181274414|2931220.0|Litecoin|
|2014-09-22|4.245930194854736| 4.416880130767822|4.210130214691162|4.242350101470947|1855960.0|Litecoin|
|2014-09-23|   4.239990234375| 4.881350040435791|4.1888