In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Optimized Spark session with logging disabled
spark = SparkSession.builder     .appName("OptimizedGlobalSales")     .master("local[*]")     .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")


In [3]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving sample_sales_data.csv to sample_sales_data.csv
User uploaded file "sample_sales_data.csv" with length 3927 bytes


In [5]:
import os

# Directory where CSVs are located
data_dir = "/content/"

# Read all CSV files into a unified DataFrame
df = spark.read.option("header", True).csv(os.path.join(data_dir, "*.csv"))
print(f"Total records: {df.count()}")
df.show(5)

Total records: 150
+-------+-------------+--------+
|Country|    Continent| Revenue|
+-------+-------------+--------+
|  Kenya|       Africa|  9205.0|
| Mexico|North America|27471.51|
|  Kenya|       Africa|25485.73|
|  Egypt|       Africa|46383.55|
| Brazil|South America| 8608.24|
+-------+-------------+--------+
only showing top 5 rows



In [6]:
df_clean = df.withColumn("Revenue", col("Revenue").cast("float"))              .dropna(subset=["Country", "Continent", "Revenue"])

print(f"Cleaned records: {df_clean.count()}")
df_clean.show(5)


Cleaned records: 150
+-------+-------------+--------+
|Country|    Continent| Revenue|
+-------+-------------+--------+
|  Kenya|       Africa|  9205.0|
| Mexico|North America|27471.51|
|  Kenya|       Africa|25485.73|
|  Egypt|       Africa|46383.55|
| Brazil|South America| 8608.24|
+-------+-------------+--------+
only showing top 5 rows



In [7]:
continent_summary = df_clean.groupBy("Continent")                             .agg(spark_sum("Revenue").alias("Total_Revenue"))                             .orderBy("Continent")

continent_summary.show()

+-------------+-----------------+
|    Continent|    Total_Revenue|
+-------------+-----------------+
|       Africa|  438301.27734375|
|         Asia|751389.8803710938|
|       Europe|495253.0085449219|
|North America|610309.3798828125|
|      Oceania|553106.0433349609|
|South America|   503744.8828125|
+-------------+-----------------+



In [8]:
spark.stop()