In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import *


In [13]:
# Start (or reuse) the Spark session and load the raw sales CSV, treating the
# first line as the header and letting Spark infer each column's type.
filepath = "Sales Data.csv"
spark = SparkSession.builder.appName("q2").getOrCreate()
df = spark.read.options(header=True, inferSchema=True).csv(filepath)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Sales: double (nullable = true)
 |-- City: string (nullable = true)
 |-- Hour: integer (nullable = true)



In [3]:
# Count missing values per column.
# NULL can occur in a column of any type, but NaN only exists for float/double
# columns, so isnan() is applied to those columns only. Calling isnan() on a
# string column relies on an implicit string-to-double cast, and it is not
# applicable to types such as date/timestamp/boolean.
nan_capable = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

missing_counts = df.select([
    count(
        when((isnull(c) | isnan(c)) if c in nan_capable else isnull(c), c)
    ).alias(c)
    for c in df.columns
])
missing_counts.show()

+---+--------+-------+----------------+----------+----------+----------------+-----+-----+----+----+
|_c0|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|Month|Sales|City|Hour|
+---+--------+-------+----------------+----------+----------+----------------+-----+-----+----+----+
|  0|       0|      0|               0|         0|         0|               0|    0|    0|   0|   0|
+---+--------+-------+----------------+----------+----------+----------------+-----+-----+----+----+



In [6]:
# 1. Use PySpark to identify columns with missing values and decide whether to
# fill or remove them. For numerical columns like Sales or Quantity Ordered,
# missing values can be filled with the mean or median, or the rows dropped.
#
# Approach taken here:
#   - Sales: fill missing values with the column mean.
#   - Order ID / Product / Order Date: drop the row when any of them is missing.
#
# Each transformation returns a new DataFrame, so the result must be assigned
# back to `df` for the change to take effect.
mean_value = df.select(mean(col("Sales"))).first()[0]

# mean() yields None when every Sales value is null, and None is not a valid
# fill value, so only impute when a mean actually exists.
if mean_value is not None:
    df = df.fillna({"Sales": mean_value})

# Column names are spelled exactly as in the schema ("Order ID", not "Order Id")
# so the lookup does not depend on Spark's case-insensitive name resolution.
critical_columns = ["Order ID", "Product", "Order Date"]
df = df.dropna(subset=critical_columns)

Row(avg(Sales)=185.49375999620375)


In [7]:
# 2. Remove rows that are exact duplicates across every column.
# NOTE(review): the schema includes `_c0`, which looks like a per-row index;
# if it is unique for every row, no two rows are ever identical and this step
# removes nothing -- confirm whether `_c0` should be excluded from the comparison.
df = df.distinct()

In [9]:
# 3. Ensure the numerical columns are correctly formatted: store both
# "Quantity Ordered" and "Price Each" as doubles, then show the resulting schema.
for numeric_column in ("Quantity Ordered", "Price Each"):
    df = df.withColumn(numeric_column, col(numeric_column).cast("double"))
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: double (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Sales: double (nullable = true)
 |-- City: string (nullable = true)
 |-- Hour: integer (nullable = true)



In [10]:
# 4. Sales, Price Each and Quantity Ordered represent amounts, prices and
# quantities, so they should never be negative. Keep only the rows in which
# all three columns are >= 0; applying the three conditions one after another
# is the same as AND-ing them in a single filter.
for amount_column in ("Sales", "Price Each", "Quantity Ordered"):
    df = df.where(col(amount_column) >= 0)

In [11]:
# Number of rows currently in df.
df.count()

185950

In [14]:
# 5. Total sales per product: group by Product, sum the Sales column, and name
# the aggregated column "Total Sales".
sales_by_product = df.groupBy("Product").sum("Sales")
ans = sales_by_product.withColumnRenamed("sum(Sales)", "Total Sales")
ans.show()

+--------------------+------------------+
|             Product|       Total Sales|
+--------------------+------------------+
|    Wired Headphones|246651.93375999495|
|  Macbook Pro Laptop|         8037600.0|
|Apple Airpods Hea...|         2349150.0|
|              iPhone|         4794300.0|
|Lightning Chargin...|347094.14999997814|
|Bose SoundSport H...|1345565.4299999585|
|USB-C Charging Cable|286674.79375998117|
|AAA Batteries (4-...| 92740.82999999724|
|        20in Monitor|454148.71000000305|
|    27in FHD Monitor| 1132424.499999984|
|     Vareebadd Phone|          827200.0|
|34in Ultrawide Mo...| 2355558.009999966|
|            LG Dryer|          387600.0|
|AA Batteries (4-p...|106300.05375999937|
|        Google Phone|         3319200.0|
|       Flatscreen TV|         1445700.0|
|  LG Washing Machine|          399600.0|
|27in 4K Gaming Mo...|2435097.5599999656|
|     ThinkPad Laptop| 4129958.699999971|
+--------------------+------------------+



In [None]:
# Export the cleaned data to a single CSV file via pandas.
# `output_path` was not defined anywhere in the notebook, so it is set here;
# change the file name/location as needed.
output_path = "Sales Data Cleaned.csv"

# toPandas() collects every row onto the driver, so this is only suitable for
# data that fits in driver memory.
df_pandas = df.toPandas()
df_pandas.to_csv(output_path, index=False)