In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled Spark session.
# - fs.defaultFS points Hadoop file access at the HDFS namenode
# - spark.sql.warehouse.dir is where managed tables are stored
spark = (
    SparkSession.builder
    .appName("Hadoop_Spark_Hive_Integration")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

# Display the Spark version as the cell output to confirm the session is live
spark.version


'3.2.1'

In [2]:
# Show every database known to the Hive metastore
hive_databases = spark.sql("SHOW DATABASES")
hive_databases.show()

# Make `default` the current database; this must run before SHOW TABLES,
# which lists tables of whichever database is current
spark.sql("USE default")

# Show the tables of the current database
current_tables = spark.sql("SHOW TABLES")
current_tables.show()


+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [3]:
# Sample "Hello World" rows as a plain Python list of 1-tuples
# (one tuple per row, one element per column) -- this is not an RDD
hello_data = [
    (line,)
    for line in ("Hello World", "This is a test.", "Spark is awesome!")
]

# Build a single-column Spark DataFrame from the local list; the column is named "value"
df = spark.createDataFrame(hello_data, schema=["value"])

# Print the DataFrame contents
df.show()

+-----------------+
|            value|
+-----------------+
|      Hello World|
|  This is a test.|
|Spark is awesome!|
+-----------------+



In [4]:
# Write the DataFrame to HDFS as plain text (one line per row).
# mode("overwrite") replaces any output left by a previous run, so this cell can be
# re-executed; with the default save mode the write raises an error when the target
# path already exists.
# NOTE: despite the ".txt" suffix, Spark creates a directory of part files at this path.
df.write.mode("overwrite").text("hdfs://namenode:9000/user/hadoop/hello_world.txt")


In [5]:
# Load the text output back from HDFS; each line becomes one row in a
# single string column named "value"
df_from_hdfs = (
    spark.read
    .format("text")
    .load("hdfs://namenode:9000/user/hadoop/hello_world.txt")
)

# Print what was read. Row order is not guaranteed to match the order in which
# the rows were originally created.
df_from_hdfs.show()


+-----------------+
|            value|
+-----------------+
|Spark is awesome!|
|  This is a test.|
|      Hello World|
+-----------------+



In [6]:
from pyspark.sql.functions import length

# Derive a new DataFrame with an extra "line_length" column holding the
# character count of each row's "value"
df_transformed = df_from_hdfs.withColumn(
    "line_length",
    length(df_from_hdfs.value),
)

# Print the result with the added column
df_transformed.show()


+-----------------+-----------+
|            value|line_length|
+-----------------+-----------+
|Spark is awesome!|         17|
|  This is a test.|         15|
|      Hello World|         11|
+-----------------+-----------+



In [7]:
# Persist the transformed rows as a Hive table in the current database,
# replacing the table if it already exists
df_transformed.write.saveAsTable("hello_world_table", mode="overwrite")

# List the tables again to confirm that hello_world_table now exists
tables_after_save = spark.sql("SHOW TABLES")
tables_after_save.show()


+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|  default|hello_world_table|      false|
+---------+-----------------+-----------+



In [10]:
from pyspark.sql.functions import concat_ws

# Join 'value' and 'line_length' into one space-separated string column named "output".
# The text writer below is given only this column.
df_single_column = df_transformed.withColumn(
    "output",
    concat_ws(" ", df_transformed["value"], df_transformed["line_length"]),
)

# Write the concatenated column to HDFS as plain text.
# mode("overwrite") replaces output from an earlier run so the cell can be re-executed;
# with the default save mode the write raises an error when the path already exists.
# NOTE: despite the ".txt" suffix, Spark creates a directory of part files at this path.
df_single_column.select("output").write.mode("overwrite").text(
    "hdfs://namenode:9000/user/hadoop/output_hello_world.txt"
)

# Read the data back from HDFS to verify the write; the column comes back named "value"
output_df = spark.read.text("hdfs://namenode:9000/user/hadoop/output_hello_world.txt")
output_df.show()


+--------------------+
|               value|
+--------------------+
|Spark is awesome! 17|
|  This is a test. 15|
|      Hello World 11|
+--------------------+

