In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
import pandas as pd

In [3]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("q2")
    .getOrCreate()
)

# Load the sales CSV: the first row is the header and Spark infers column types.
file_path = "Sales data.csv"
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

# Print the inferred schema, then a preview of the loaded rows.
df.printSchema()
df.show()

root
 |-- _c0: integer (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Sales: double (nullable = true)
 |-- City: string (nullable = true)
 |-- Hour: integer (nullable = true)

+----+--------+--------------------+----------------+----------+----------------+--------------------+-----+------+--------------+----+
| _c0|Order ID|             Product|Quantity Ordered|Price Each|      Order Date|    Purchase Address|Month| Sales|          City|Hour|
+----+--------+--------------------+----------------+----------+----------------+--------------------+-----+------+--------------+----+
| 297|  295941|     ThinkPad Laptop|               1|    999.99|31-12-2019 16:24|64 Dogwood St, Po...|   12|999.99|      Portland|  16|
| 464

In [4]:
# Verify that all columns are properly loaded, and check for any unexpected null or
# malformed entries during loading.
# NaN only exists for float/double columns, so isnan() is applied to those alone;
# calling it on string or integer columns relies on an implicit cast to double.
floating_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}


def _is_missing(name):
    """Return the 'missing' condition for column `name`: null, or NaN for floating columns."""
    if name in floating_columns:
        return isnull(name) | isnan(name)
    return isnull(name)


# One output column per input column, holding the number of missing entries in it.
missing_values = df.select([count(when(_is_missing(c), c)).alias(c) for c in df.columns])
missing_values.show()

+---+--------+-------+----------------+----------+----------+----------------+-----+-----+----+----+
|_c0|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|Month|Sales|City|Hour|
+---+--------+-------+----------------+----------+----------+----------------+-----+-----+----+----+
|  0|       0|      0|               0|         0|         0|               0|    0|    0|   0|   0|
+---+--------+-------+----------------+----------+----------+----------------+-----+-----+----+----+



In [1]:
# 1. Use PySpark to identify columns with missing values and decide whether to fill or
# remove them. For numerical columns like Sales or Quantity Ordered, you may want
# to fill missing values with the mean or median, or drop rows with missing values.

# Impute missing Sales with the column mean. mean() yields None when there is no
# non-null Sales value to average, and fillna() cannot take None as a replacement,
# so the fill is skipped in that case.
mean_value = df.select(mean(col('Sales'))).first()[0]
if mean_value is not None:
    df = df.fillna({'Sales': mean_value})

# Rows lacking any of these fields are dropped rather than imputed.
critical_columns = ["Order ID", "Product", "Order Date"]
df = df.dropna(subset=critical_columns)


NameError: name 'df' is not defined

In [17]:
# Number of rows currently in the DataFrame.
df.count()

185950

In [16]:
# Drop rows that are exact duplicates. No subset is given, so every column takes part
# in the comparison, including `_c0`.
# NOTE(review): if `_c0` is a per-row index from the CSV, rows identical in all other
# columns are not treated as duplicates here — confirm that is intended.
df = df.dropDuplicates()

In [18]:
# Print the current schema (column types and nullability).
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Sales: double (nullable = false)
 |-- City: string (nullable = true)
 |-- Hour: integer (nullable = true)



In [8]:
# Pin the numeric columns to explicit Spark types, one column at a time.
target_types = {
    "Quantity Ordered": "integer",
    "Price Each": "double",
    "Sales": "double",
}
for column_name, spark_type in target_types.items():
    df = df.withColumn(column_name, col(column_name).cast(spark_type))

# Show the schema after casting.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Sales: double (nullable = true)
 |-- City: string (nullable = true)
 |-- Hour: integer (nullable = true)



In [10]:
# Keep only rows whose monetary and quantity fields are all non-negative.
sales_ok = col("Sales") >= 0
price_ok = col("Price Each") >= 0
quantity_ok = col("Quantity Ordered") >= 0
df = df.filter(sales_ok & price_ok & quantity_ok)

In [21]:
# Number of rows currently in the DataFrame.
df.count()

185950

In [24]:
# Total revenue per product. `sum` here is the Spark aggregate brought in by the
# wildcard import from pyspark.sql.functions, not Python's builtin.
grouped_by_product = df.groupBy("Product")
total_sales_by_product = grouped_by_product.agg(
    sum("Sales").alias("Total Sales")
)
total_sales_by_product.show()

+--------------------+------------------+
|             Product|       Total Sales|
+--------------------+------------------+
|    Wired Headphones|246651.93375999868|
|  Macbook Pro Laptop|         8037600.0|
|Apple Airpods Hea...|         2349150.0|
|              iPhone|         4794300.0|
|Lightning Chargin...|347094.15000000864|
|Bose SoundSport H...|1345565.4300000193|
|USB-C Charging Cable|286674.79376000515|
|AAA Batteries (4-...| 92740.82999999696|
|        20in Monitor|454148.71000000136|
|    27in FHD Monitor|1132424.5000000084|
|     Vareebadd Phone|          827200.0|
|34in Ultrawide Mo...|  2355558.00999999|
|            LG Dryer|          387600.0|
|AA Batteries (4-p...| 106300.0537599985|
|        Google Phone|         3319200.0|
|       Flatscreen TV|         1445700.0|
|  LG Washing Machine|          399600.0|
|27in 4K Gaming Mo...| 2435097.559999989|
|     ThinkPad Laptop|4129958.6999999797|
+--------------------+------------------+



In [26]:
# Export the cleaned data to its own file. Writing to "Sales data.csv" would overwrite
# the raw input that the Spark DataFrame `df` was read from, destroying the original
# data and changing what any later Spark action on `df` reads.
output_path = "Sales data cleaned.csv"

# Collect the Spark DataFrame to the driver as a pandas DataFrame, then write it out
# without the pandas index column.
pandas_df = df.toPandas()
pandas_df.to_csv(output_path, index=False)

In [35]:
# Per-column count of null values in the exported pandas DataFrame.
null_counts = pandas_df.isnull().sum()
print(null_counts)

_c0                 0
Order ID            0
Product             0
Quantity Ordered    0
Price Each          0
Order Date          0
Purchase Address    0
Month               0
Sales               0
City                0
Hour                0
dtype: int64
