In [2]:
from pyspark.sql import SparkSession
from Data_Reader.reader import Reader


spark = SparkSession.builder\
            .appName('silver_layer')\
            .config("spark.driver.extraClassPath", "./mysql-connector-java-8.0.26.jar")\
            .getOrCreate()

In [3]:
import logging

# clean_path = '''C:\\Users\\yugant.shekhar\\OneDrive - Blue Altair\\Desktop\\Douments\\Spark\\Retail Project\\Data\\actual_data\\cleaned'''
logger = logging.getLogger(__name__)

read = Reader(spark, logger)
customer_mart = read.reader('parquet', 'C:\\Users\\yugant.shekhar\\OneDrive - Blue Altair\\Desktop\\Douments\\Spark\\Retail Project\\Data\\actual_data\\customer_data_mart')

customer_mart.show()

+-----------+----------+------------+--------------------+-------+------------+----------+----------+
|customer_id|first_name|   last_name|             address|pincode|phone_number|sales_date|total_cost|
+-----------+----------+------------+--------------------+-------+------------+----------+----------+
|       1637|     Anita|      Barman|H.No. 88, Upadhya...| 493235|    29562838|2020-10-24|      4543|
|       6317|      Riya|       Bassi|23/159, Parmar Zi...| 469379|           0|2022-01-01|      4301|
|       2420|    Raghav|   Ramaswamy|H.No. 24, Memon P...| 265985|           0|2021-10-07|      3584|
|       7350|    Gautam|       Palla|H.No. 31, Kashyap...| 172013|           0|2020-01-04|      3220|
|       7350|    Gautam|       Palla|H.No. 31, Kashyap...| 172013|           0|2020-01-04|      3220|
|       7350|    Gautam|       Palla|H.No. 31, Kashyap...| 172013|           0|2020-01-04|      3220|
|       2420|    Raghav|   Ramaswamy|H.No. 24, Memon P...| 265985|           0|202

In [5]:
# Full Name

from pyspark.sql import functions as F 

customer_mart = customer_mart.withColumn(
    'full_name', F.concat_ws(' ', F.col('first_name'), F.col('last_name'))
)

customer_mart.show()

+-----------+----------+------------+--------------------+-------+------------+----------+----------+------------------+
|customer_id|first_name|   last_name|             address|pincode|phone_number|sales_date|total_cost|         full_name|
+-----------+----------+------------+--------------------+-------+------------+----------+----------+------------------+
|       1637|     Anita|      Barman|H.No. 88, Upadhya...| 493235|    29562838|2020-10-24|      4543|      Anita Barman|
|       6317|      Riya|       Bassi|23/159, Parmar Zi...| 469379|           0|2022-01-01|      4301|        Riya Bassi|
|       2420|    Raghav|   Ramaswamy|H.No. 24, Memon P...| 265985|           0|2021-10-07|      3584|  Raghav Ramaswamy|
|       7350|    Gautam|       Palla|H.No. 31, Kashyap...| 172013|           0|2020-01-04|      3220|      Gautam Palla|
|       7350|    Gautam|       Palla|H.No. 31, Kashyap...| 172013|           0|2020-01-04|      3220|      Gautam Palla|
|       7350|    Gautam|       P

In [14]:
customer_mart.groupBy('full_name', F.month(F.col('sales_date')).alias('Month'))\
    .agg(F.sum(F.col('total_cost')).alias('Monthly_Purchase'))\
    .distinct().orderBy(F.col('Monthly_Purchase').desc()).show()

+------------------+-----+----------------+
|         full_name|Month|Monthly_Purchase|
+------------------+-----+----------------+
|      Gautam Palla|    5|           30388|
|       Zarna Ratti|    9|           23460|
|        Riya Bassi|    4|           20700|
|        Aachal Rau|    8|           17940|
|Zinal Venkataraman|    5|           16560|
| Balveer Choudhury|   10|           16560|
|     Jagdish Sodhi|    4|           16278|
| Balveer Choudhury|   12|           15180|
|      Gautam Palla|    7|           13800|
|      Jagdish Shan|    3|           12420|
|       Faris Nigam|    6|           12388|
|     Jagdish Sodhi|   10|           11424|
|      Rushil Datta|    2|           11040|
|      Gautam Palla|    1|            9660|
|      Indali Naidu|    9|            9660|
|      Kalpit Divan|    6|            9128|
|        Aadhya Jha|    8|            8512|
|        Aachal Rau|    6|            8440|
|       Faris Nigam|    2|            7847|
|        Aachal Rau|    4|      

In [19]:
from pyspark.sql.window import Window

# Define the window partition
windowSpec = Window.partitionBy('full_name'
                                , F.month(F.col('sales_date'))
                                , F.year(F.col('sales_date'))
                                )

# Use the window to select non-aggregated columns along with your groupBy and aggregation
customer_mart = customer_mart.withColumn('Monthly_Purchase', F.sum('total_cost').over(windowSpec)) \
    .withColumn('Month', F.month(F.col('sales_date')))\
    .withColumn('Year', F.year(F.col('sales_date')))

# Now you can select the non-aggregated columns (like customer_id) and aggregated data
customer_mart.select('customer_id', 'full_name', 'Month', 'Year',\
                     'address', 'phone_number', 'Monthly_Purchase')\
    .distinct().orderBy(F.col('Monthly_Purchase').desc()).show()

+-----------+------------------+-----+----+--------------------+------------+----------------+
|customer_id|         full_name|Month|Year|             address|phone_number|Monthly_Purchase|
+-----------+------------------+-----+----+--------------------+------------+----------------+
|       7350|      Gautam Palla|    5|2023|H.No. 31, Kashyap...|           0|           30388|
|       9456|       Zarna Ratti|    9|2020|13, Balan Street,...|           0|           23460|
|       6317|        Riya Bassi|    4|2021|23/159, Parmar Zi...|           0|           20700|
|       8164|        Aachal Rau|    8|2020|H.No. 52, Nanda C...|           0|           17940|
|       8576|Zinal Venkataraman|    5|2023|832, Majumdar Roa...|           0|           16560|
|       4986| Balveer Choudhury|   10|2020|31/30, Devan Naga...|           0|           16560|
|       4986| Balveer Choudhury|   12|2021|31/30, Devan Naga...|           0|           15180|
|       7350|      Gautam Palla|    7|2023|H.No. 3

## Customer Data Completed


## Starting Sales Team Data

In [59]:
import logging

# clean_path = '''C:\\Users\\yugant.shekhar\\OneDrive - Blue Altair\\Desktop\\Douments\\Spark\\Retail Project\\Data\\actual_data\\cleaned'''
logger = logging.getLogger(__name__)

read = Reader(spark, logger)
sales_mart = read.reader('parquet', 'C:\\Users\\yugant.shekhar\\OneDrive - Blue Altair\\Desktop\\Douments\\Spark\\Retail Project\\Data\\actual_data\\sales_team_data_mart')

sales_mart.show()

+--------+---------------+-----------------------+----------------------+------------------+----------+----------+--------------------+--------------------+----------+----------+-----------+
|store_id|sales_person_id|sales_person_first_name|sales_person_last_name|store_manager_name|manager_id|is_manager|sales_person_address|sales_person_pincode|sales_date|total_cost|sales_month|
+--------+---------------+-----------------------+----------------------+------------------+----------+----------+--------------------+--------------------+----------+----------+-----------+
|     100|             19|                 Nitara|             Kuruvilla|      Arjun Pathak|       8.0|         N|96, Padmanabhan Z...|              729506|2020-10-24|      4543|    2020-10|
|      16|              9|                 Jairaj|             Zachariah|      Chameli Nagi|      null|         Y|H.No. 34, Oommen ...|              304699|2022-01-01|      4301|    2022-01|
|      16|             12|                 Mu

In [42]:
sales_mart.printSchema()

root
 |-- store_id: integer (nullable = true)
 |-- sales_person_id: integer (nullable = true)
 |-- sales_person_first_name: string (nullable = true)
 |-- sales_person_last_name: string (nullable = true)
 |-- store_manager_name: string (nullable = true)
 |-- manager_id: double (nullable = true)
 |-- is_manager: string (nullable = true)
 |-- sales_person_address: string (nullable = true)
 |-- sales_person_pincode: integer (nullable = true)
 |-- sales_date: date (nullable = true)
 |-- total_cost: integer (nullable = true)
 |-- sales_month: string (nullable = true)



In [34]:
# sales_mart.select('store_id', 'sales_month').distinct().show()

In [60]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

sales_mart = sales_mart.withColumn(
    'full_name', F.concat_ws(' ', F.col('sales_person_first_name'), F.col('sales_person_last_name'))
)

windowSpec = Window.partitionBy( 'full_name', 'sales_month')

# Use the window to select non-aggregated columns along with your groupBy and aggregation
sales_mart = sales_mart.withColumn('monthly_sales', F.sum('total_cost').over(windowSpec))

# Define the window to rank salespeople by their monthly sales
rank_window = Window.partitionBy("full_name").orderBy(F.col("monthly_sales").desc())

# Rank the sales and calculate incentives
sales_mart = sales_mart \
    .withColumn("rnk", F.rank().over(rank_window)) \
    .withColumn("incentive", F.when(F.col("rnk") == 1, F.col("monthly_sales") * 0.01).otherwise(F.lit(0))) \
    .withColumn("incentive", F.round(F.col("incentive"), 2)) \
    .select("store_id", "sales_person_id", 'sales_person_address',
             "full_name", "sales_month", "monthly_sales", "incentive", "rnk").distinct()

# Show the result
sales_mart.where(F.col('rnk')==1).drop('rnk').show()


+--------+---------------+--------------------+------------------+-----------+-------------+---------+
|store_id|sales_person_id|sales_person_address|         full_name|sales_month|monthly_sales|incentive|
+--------+---------------+--------------------+------------------+-----------+-------------+---------+
|      16|             10|00/70, Dash Path,...|     Anita Gokhale|    2023-04|        13800|    138.0|
|      43|             18|84/59, Rajan Zila...|        Anvi Verma|    2023-01|         3080|     30.8|
|      96|             14|59/90, Kuruvilla ...|        Arjun Char|    2020-04|         5782|    57.82|
|      63|              8|H.No. 070, Khosla...|    Bachittar Shah|    2021-12|        15180|    151.8|
|      43|             11|H.No. 14, Banik C...|     Dalbir Khatri|    2021-05|         6520|     65.2|
|      63|             15|94, Goda Ganj, Ti...|     Darpan Bhalla|    2020-02|         7847|    78.47|
|      43|              7|H.No. 148, Sampat...|       Dayita Dave|    202

In [44]:
# sales_mart.select("store_id", "sales_month").distinct().show()


In [45]:
# sales_mart.groupBy("store_id", "sales_month")\
#            .agg(F.sum("monthly_sales").alias("total_sales"))\
#            .distinct().show()   


In [48]:
rank_window = Window.partitionBy("full_name", "sales_month").orderBy(F.col("monthly_sales").desc())

sales_mart = sales_mart \
    .withColumn("rnk", F.rank().over(rank_window)) \
    .withColumn("incentive", F.when(F.col("rnk") == 1, F.col("monthly_sales") * 0.01).otherwise(F.lit(0))) \
    .withColumn("incentive", F.round(F.col("incentive"), 2)) \
    .select("store_id", "sales_person_id", 'sales_person_address',
             "full_name", "sales_month", "monthly_sales", "incentive", "rnk").distinct()


# print(sales_mart.show())

# # Now you can select the non-aggregated columns (like customer_id) and aggregated data
# sales_mart.select('store_id', 'sales_person_id', 'full_name', 'sales_month',\
#                      'sales_person_address', 'monthly_sales', 'incentive')\
#     .distinct().orderBy(F.col('Monthly_Sales').desc()).show()

+--------+---------------+------------------+-----------+--------------------+-------------+---------+
|store_id|sales_person_id|         full_name|sales_month|sales_person_address|monthly_sales|incentive|
+--------+---------------+------------------+-----------+--------------------+-------------+---------+
|      93|              3|      Varenya Naik|    2023-05|913, Mall Road, T...|        24840|    248.4|
|      72|             19|  Nitara Kuruvilla|    2020-09|96, Padmanabhan Z...|        23460|    234.6|
|      43|             20|      Lakshmi Kaul|    2021-04|32, Pandey Nagar,...|        20700|    207.0|
|      16|             20|      Lakshmi Kaul|    2020-08|32, Pandey Nagar,...|        17940|    179.4|
|      22|             12|      Mugdha Kohli|    2020-10|64/132, Andra Zil...|        16560|    165.6|
|      72|              4|      Vritti Golla|    2023-05|60, Varghese Circ...|        16560|    165.6|
|      63|              8|    Bachittar Shah|    2021-12|H.No. 070, Khosl