In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, regexp_replace, sum, when, sum as spark_sum, avg, rank, desc
from pymongo import MongoClient
from pyspark.sql.window import Window
import logging

In [2]:
# Module-level logger for this notebook; the root logger is configured to emit
# INFO and above so progress messages from the streaming job are visible.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Most popular product categories among different age groups")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [4]:
# HDFS directory containing the customer CSV files. The same path is passed to
# both the one-off batch read and the streaming read of the customer data.
customer_data_dir: str = "hdfs://namenode:9000/csv_files/customers/"



In [5]:

# Batch-read the customer CSVs once so that the resulting schema can be handed
# to the streaming reader. Only the header option is set, so the schema is
# whatever Spark derives from the header row of the files.
try:
    csv_reader = spark.read.format("csv").option("header", "true")
    customer_static_df = csv_reader.load(customer_data_dir)
    customer_static_df.printSchema()
except Exception as e:
    logger.error(f"Error reading static data: {e}")
    # Release the session before propagating so the failed run leaves nothing behind.
    spark.stop()
    raise

root
 |-- customer_id: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Item_Purchased: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Purchase_Amount_USD: string (nullable = true)
 |-- location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Review Rating: string (nullable = true)
 |-- Subscription Status: string (nullable = true)
 |-- Shipping Type: string (nullable = true)
 |-- Discount Applied: string (nullable = true)
 |-- Promo Code Used: string (nullable = true)
 |-- Previous Purchases: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Frequency of Purchases: string (nullable = true)



In [6]:
# Streaming view over the same directory, reusing the schema of the static
# DataFrame (a streaming file source needs its schema supplied up front).
try:
    stream_reader = spark.readStream.schema(customer_static_df.schema)
    customer_streaming_df = stream_reader.option("header", "true").csv(customer_data_dir)
except Exception as e:
    logger.error(f"Error creating streaming DataFrame: {e}")
    # Release the session before propagating the failure.
    spark.stop()
    raise

In [8]:
# MongoDB connection setup: open the client and resolve the target collection
# (database "Products", collection "popular_categories").
try:
    client = MongoClient("mongodb://mongodb:27017")
    # MongoClient connects lazily, so constructing it does not fail when the
    # server is unreachable. Issue a cheap "ping" command to force a round trip
    # so that connection problems surface here instead of inside the first
    # streaming batch.
    client.admin.command("ping")
    db = client['Products']
    coll_popular_categories = db['popular_categories']
except Exception as e:
    logger.error(f"Error connecting to MongoDB: {e}")
    # Without a working sink the job cannot do anything useful; shut Spark down.
    spark.stop()
    raise

In [9]:


def find_popular_categories(batch_df, batch_id):
    """Store the top-spending product category of each age group for one micro-batch.

    Written as a ``foreachBatch`` callback. For the rows of ``batch_df`` it:

    1. derives an ``Age_Group`` label from ``age``,
    2. sums ``Purchase_Amount_USD`` and counts ``Item_Purchased`` per
       (``Age_Group``, ``Category``),
    3. keeps the category ranked first by total purchase amount within each
       age group (``rank()`` is used, so categories tied for first are all kept),
    4. inserts the resulting rows into ``coll_popular_categories`` and shows them.

    Results are computed per micro-batch only; every non-empty batch appends its
    own rows to the collection.

    Args:
        batch_df: Micro-batch DataFrame. Must contain the columns ``age``,
            ``Category``, ``Purchase_Amount_USD`` and ``Item_Purchased``.
        batch_id: Batch identifier supplied by Spark; used only in log messages.

    Raises:
        Exception: Any error raised while processing or inserting is logged with
            its traceback and re-raised, so the streaming query fails instead of
            silently treating the batch as processed.
    """
    top_categories = None
    try:
        # The source columns are read as strings, so cast explicitly rather than
        # relying on implicit string-to-number coercion in comparisons and sums.
        age = col("age").cast("int")
        purchase_amount = col("Purchase_Amount_USD").cast("double")

        # `when` branches are evaluated in order, so each branch only needs its
        # upper bound. A missing/unparsable age is labelled "Unknown" instead of
        # falling through to the "60+" default.
        with_age_group = batch_df.withColumn(
            "Age_Group",
            when(age.isNull(), "Unknown")
            .when(age < 18, "Under 18")
            .when(age < 30, "18-29")
            .when(age < 40, "30-39")
            .when(age < 50, "40-49")
            .when(age < 60, "50-59")
            .otherwise("60+"),
        )

        # Total purchase amount and purchase count per age group and category.
        category_popularity = with_age_group.groupBy("Age_Group", "Category").agg(
            spark_sum(purchase_amount).alias("TotalPurchaseAmount"),
            count("Item_Purchased").alias("PurchaseCount"),
        )

        # Rank categories inside each age group by total purchase amount and keep
        # the first-ranked one(s), ordered by amount for display and storage.
        window_spec = Window.partitionBy("Age_Group").orderBy(col("TotalPurchaseAmount").desc())
        top_categories = (
            category_popularity
            .withColumn("rank", rank().over(window_spec))
            .filter(col("rank") == 1)
            .orderBy(col("TotalPurchaseAmount").desc())
            # Cached because the result is both collected and shown; without
            # this the whole aggregation would run twice.
            .cache()
        )

        # Convert the DataFrame to a list of dictionaries for MongoDB.
        popular_categories_list = [row.asDict() for row in top_categories.collect()]

        # Insert into MongoDB if the list is not empty (insert_many rejects an empty list).
        if popular_categories_list:
            coll_popular_categories.insert_many(popular_categories_list)
            top_categories.show()
            logger.info(f"Batch {batch_id} processed and inserted into MongoDB")
        else:
            logger.info(f"Batch {batch_id} processed but no popular categories found")
    except Exception as e:
        # logger.exception keeps the traceback; re-raise so Spark does not
        # record a failed batch as successfully handled.
        logger.exception(f"Error processing batch {batch_id}: {e}")
        raise
    finally:
        if top_categories is not None:
            top_categories.unpersist()
              

In [None]:
# Start the streaming query: every micro-batch of customer data is handed to
# find_popular_categories, which performs the aggregation and the MongoDB write.
query_popular_categories = None
try:
    query_popular_categories = (
        customer_streaming_df.writeStream
        .outputMode("append")
        .foreachBatch(find_popular_categories)
        .start()
    )
    # Block until the query stops or fails.
    query_popular_categories.awaitTermination()
except Exception as e:
    # logger.exception records the traceback along with the message.
    logger.exception(f"Error in streaming query: {e}")
finally:
    # Runs on normal termination, on failure and on a kernel interrupt:
    # stop the query if it is still running, then release the MongoDB
    # connection and the Spark session.
    if query_popular_categories is not None and query_popular_categories.isActive:
        query_popular_categories.stop()
    client.close()
    spark.stop()


INFO:py4j.java_gateway:Callback Server Starting
INFO:py4j.java_gateway:Socket listening on ('127.0.0.1', 37639)
INFO:py4j.clientserver:Python Server ready to receive messages
INFO:py4j.clientserver:Received command c on object id p0
ERROR:__main__:Error processing batch 0: name 'insights_list' is not defined
