In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, regexp_replace, sum, when, sum as spark_sum, avg, rank, desc
from pymongo import MongoClient
#from pyspark.sql.window import Window
import logging

In [2]:
# Notebook-wide logger, named after the running module
logger = logging.getLogger(__name__)
# Root logging threshold: INFO and above
logging.basicConfig(level=logging.INFO)

In [3]:
# Build (or reuse) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("Real-time purchasing habits of high-value customers")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [4]:
# HDFS input directories: customer CSV files first, product CSV files second
customer_data_dir, product_data_dir = (
    "hdfs://namenode:9000/csv_files/customers/",
    "hdfs://namenode:9000/csv_files/products/",
)


In [5]:

# Batch-read the customer CSV files (header row supplies the column names)
# and print the resulting schema; on failure, log, stop Spark and re-raise.
try:
    customer_static_df = (
        spark.read
        .option("header", "true")
        .csv(customer_data_dir)
    )
    customer_static_df.printSchema()
except Exception as e:
    logger.error(f"Error reading static data: {e}")
    spark.stop()
    raise

root
 |-- customer_id: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Item_Purchased: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Purchase_Amount_USD: string (nullable = true)
 |-- location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Review Rating: string (nullable = true)
 |-- Subscription Status: string (nullable = true)
 |-- Shipping Type: string (nullable = true)
 |-- Discount Applied: string (nullable = true)
 |-- Promo Code Used: string (nullable = true)
 |-- Previous Purchases: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Frequency of Purchases: string (nullable = true)



In [6]:
# Streaming reader over the same customer directory, reusing the schema of
# the static DataFrame; on failure, log, stop Spark and re-raise.
try:
    customer_streaming_df = (
        spark.readStream
        .format("csv")
        .schema(customer_static_df.schema)
        .option("header", "true")
        .load(customer_data_dir)
    )
except Exception as e:
    logger.error(f"Error creating streaming DataFrame: {e}")
    spark.stop()
    raise

In [7]:

# Create MongoClient instance


In [8]:
# MongoDB connection setup: open the client, verify the server is reachable,
# and bind the target database and collection.
try:
    client = MongoClient("mongodb://mongodb:27017")
    # MongoClient connects lazily, so constructing it does not fail when the
    # server is unreachable. A ping forces a round trip so that connection
    # problems surface here instead of inside the first streaming batch.
    client.admin.command("ping")
    db = client['Customers']
    coll_high_value_customers = db['high_value_customers']
except Exception as e:
    logger.error(f"Error connecting to MongoDB: {e}")
    spark.stop()
    raise

In [9]:


def process_high_value_customers(batch_df, batch_id):
    """Find the top customers of one micro-batch and store their purchasing habits.

    Steps:
      1. Map "Frequency of Purchases" labels to a numeric ``Frequency_Num``.
      2. Keep rows whose frequency score, purchase amount and previous
         purchases are all greater than 1.
      3. Sum those three values per ``customer_id``, add them into a
         ``Combined_Score`` and keep the 10 highest-scoring customers.
      4. Aggregate every row of those customers by item attributes and insert
         the result into the ``coll_high_value_customers`` MongoDB collection.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; used only in log messages.

    Returns:
        None. Any exception is logged with its traceback and swallowed, so a
        failing batch does not stop the streaming query (its rows are then
        not written to MongoDB).
    """
    # Score for each known "Frequency of Purchases" label; any other label scores 0.
    frequency_scores = {
        "Weekly": 5,
        "Bi-Weekly": 4,
        "Fortnightly": 4,
        "Monthly": 3,
        "Quarterly": 2,
        "Every 3 Months": 2,
        "Annually": 1,
    }

    scored_df = None
    try:
        # Build the when(...).when(...).otherwise(0) chain from the mapping
        # so the labels and scores are declared in exactly one place.
        frequency_label = col("Frequency of Purchases")
        frequency_num = None
        for label, score in frequency_scores.items():
            if frequency_num is None:
                frequency_num = when(frequency_label == label, score)
            else:
                frequency_num = frequency_num.when(frequency_label == label, score)

        # The scored batch feeds several Spark jobs below (two collects and a
        # show); persist it so the batch input is not recomputed for each one.
        scored_df = batch_df.withColumn(
            "Frequency_Num", frequency_num.otherwise(0)
        ).persist()

        # NOTE(review): the streaming schema is taken from a CSV read without
        # type inference, so these columns appear to be strings and the
        # comparisons/sums below rely on Spark's implicit string-to-number
        # coercion — confirm this is intended.
        high_value_customers = scored_df.filter(
            (col("Frequency_Num") > 1)
            & (col("Purchase_Amount_USD") > 1)
            & (col("Previous Purchases") > 1)
        )

        # Per-customer totals of the three ranking inputs
        customer_agg = high_value_customers.groupBy("customer_id").agg(
            spark_sum("Purchase_Amount_USD").alias("TotalPurchaseAmount"),
            spark_sum("Frequency_Num").alias("TotalFrequency"),
            spark_sum("Previous Purchases").alias("TotalPreviousPurchases"),
        )

        # Combined score is the plain sum of the three totals
        customer_agg = customer_agg.withColumn(
            "Combined_Score",
            col("TotalPurchaseAmount") + col("TotalFrequency") + col("TotalPreviousPurchases"),
        )

        # Ten highest-scoring customers of this batch
        top_10_customers = customer_agg.orderBy(col("Combined_Score").desc()).limit(10)
        top_10_customer_ids = [row["customer_id"] for row in top_10_customers.collect()]

        # Nothing qualified: skip the second aggregation entirely
        if not top_10_customer_ids:
            logger.info(f"Batch {batch_id} processed but no high-value customers found")
            return

        # All rows (not only the filtered ones) belonging to the top customers
        top_customers_df = scored_df.filter(col("customer_id").isin(top_10_customer_ids))

        # Purchasing habits of those customers, grouped by item attributes.
        # NOTE(review): summing "Discount Applied" only yields a number if the
        # column holds numeric values; if it holds flags such as Yes/No the
        # sum will presumably be null — verify against the data.
        habits = top_customers_df.groupBy(
            "Item_Purchased", "Category", "Size", "Color", "Season", "Payment Method", "Shipping Type"
        ).agg(
            spark_sum("Purchase_Amount_USD").alias("TotalPurchaseAmount"),
            avg("Review Rating").alias("AverageReviewRating"),
            spark_sum("Discount Applied").alias("TotalDiscountApplied"),
            spark_sum("Previous Purchases").alias("TotalPreviousPurchases"),
        ).orderBy(col("TotalPurchaseAmount").desc())

        # One plain dict per aggregated row, ready for insert_many
        habits_list = [row.asDict() for row in habits.collect()]

        # insert_many rejects an empty list, so only insert when there is data
        if habits_list:
            coll_high_value_customers.insert_many(habits_list)
            habits.show()
            logger.info(f"Batch {batch_id} processed and inserted into MongoDB")
        else:
            logger.info(f"Batch {batch_id} processed but no high-value customers found")
    except Exception as e:
        # Best-effort per batch: log with traceback and let the stream continue
        logger.exception(f"Error processing batch {batch_id}: {e}")
    finally:
        # Release the cached batch whether or not processing succeeded
        if scored_df is not None:
            scored_df.unpersist()
              

In [None]:
# Start the streaming query: every micro-batch of customer rows is handed to
# process_high_value_customers. This cell blocks until the query stops or fails.
query_customer = None
try:
    query_customer = (
        customer_streaming_df.writeStream
        .outputMode("append")
        .foreachBatch(process_high_value_customers)
        .start()
    )
    # Await termination
    query_customer.awaitTermination()
except Exception as e:
    # Same pattern as the setup cells: log, clean up (in finally), re-raise
    # so a failed stream is visible as a failed cell instead of passing silently.
    logger.error(f"Error in streaming query: {e}")
    raise
finally:
    try:
        # Stop the query explicitly if it is still running (e.g. after an interrupt)
        if query_customer is not None and query_customer.isActive:
            query_customer.stop()
    finally:
        # Stop Spark session even if stopping the query itself fails
        spark.stop()


INFO:py4j.java_gateway:Callback Server Starting
INFO:py4j.java_gateway:Socket listening on ('127.0.0.1', 37639)
INFO:py4j.clientserver:Python Server ready to receive messages
INFO:py4j.clientserver:Received command c on object id p0
ERROR:__main__:Error processing batch 0: name 'insights_list' is not defined
