In [3]:
# ───────────────────────────────────────────────
#          PySpark CSV + Common Operations
#           (2025–2026 style - most used patterns)
# ───────────────────────────────────────────────

# === 1. Basic session creation (Spark 3.x) ================================
from pyspark.sql import SparkSession

# The builder chain lives inside one pair of parentheses, so no line relies on
# a trailing backslash. getOrCreate() returns the already-running session when
# one exists and only builds a new one otherwise.
spark = (
    SparkSession.builder
    .appName("CSV_cheatsheet_example")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# sc = spark.sparkContext   # uncomment when direct SparkContext access is needed

# === 2. Reading CSV files ===============================================
# The three reads below are alternatives. Each one rebinds `df`, so the later
# sections operate on whatever the LAST assignment produced.

# Shortest form: first line is the header, column types are inferred.
df = spark.read.csv(
    "path/to/file.csv",
    header=True,
    inferSchema=True,
)

# Same idea with every reader option spelled out. Quote and escape are both
# the double-quote character, and PERMISSIVE is selected as the parse mode.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("encoding", "UTF-8")
    .option("escape", '"')
    .option("quote", '"')
    .option("mode", "PERMISSIVE")
    .csv("data/*.csv")
)

# Glob over gzip-compressed CSV files on S3. inferSchema is not set here, so
# the reader default applies (per the Spark CSV docs every column is then
# loaded as a string).
df = (
    spark.read
    .option("header", True)
    .csv("s3://bucket/folder/*.csv.gz")
)

# === 3. Quick inspection ================================================
# Print the column names together with their data types.
df.printSchema()

# First 5 rows with cell text cut at 40 characters. Flip vertical to True to
# print one field per line, which reads better on wide tables.
df.show(
    5,
    truncate=40,
    vertical=False,
)

# Bring at most 20 rows to the driver as a pandas DataFrame. Keep the limit
# small: toPandas() collects everything it is handed.
df.select("*").limit(20).toPandas()

# Row count - this is an action, so it runs a job.
df.count()

# === 4. Most useful column operations ===================================
from pyspark.sql.functions import col, column

# Projection with a rename. The result is not assigned, so `df` is unchanged.
df.select(
    "id",
    "name",
    col("salary").alias("monthly_salary"),
)

# The same filter twice: first as a SQL string, then as Column expressions.
# Neither result is assigned, so `df` is unchanged.
df.filter("age > 30 AND salary < 80000")
df.filter(
    (col("age") > 30)
    & (col("salary") < 80000)
)

# Add / replace columns
from pyspark.sql.functions import lit, when, concat_ws, lower, upper

# One chained expression instead of four separate rebinds of `df`.
df = (
    df
    # constant value on every row
    .withColumn("country", lit("Kenya"))
    # boolean flag; the otherwise() branch supplies False when age >= 35 does not hold
    .withColumn("senior", when(col("age") >= 35, True).otherwise(False))
    # join the two name columns with a single space
    .withColumn("full_name", concat_ws(" ", "first_name", "last_name"))
    # lower-cased copy of the email column
    .withColumn("email_lower", lower(col("email")))
)

# === 5. Handling nulls / missing values ================================
from pyspark.sql.functions import coalesce, isnan, when, count

# The three clean-up steps run in the same order as before, written as one chain.
df = (
    df
    # replace nulls with a per-column default
    .na.fill({"salary": 0, "age": -1})
    # drop a row when ANY of its columns is null
    .na.drop("any")
    # drop a row only when ALL of the listed columns are null
    .na.drop("all", subset=["email", "phone"])
)

# Null count per column: when() yields a non-null value only for null cells,
# and count() counts just those non-null values.
df.select(
    [
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

# === 6. GroupBy + Aggregations (most frequent task) ====================
# Spark's aggregate functions are reached through the `F` namespace on purpose.
# `from pyspark.sql.functions import sum, min, max, round` would rebind Python's
# built-in sum/min/max/round to the Spark versions for the rest of the session,
# so a later plain-Python call such as sum([1, 2]) would fail.
import pyspark.sql.functions as F

# One row per (department, city), largest total salary first.
result = (
    df.groupBy("department", "city")
    .agg(
        F.count("*").alias("headcount"),
        F.countDistinct("employee_id").alias("unique_employees"),
        F.sum("salary").alias("total_salary"),
        F.avg("salary").alias("avg_salary"),
        F.max("age").alias("oldest"),
    )
    .orderBy("total_salary", ascending=False)
)

result.show(truncate=False)

# Aggregates can be wrapped (round) or used in arithmetic (/ 1000000) before
# being aliased. The result is not assigned here; bind it or call an action
# such as .show() to use it.
df.groupBy("department").agg(
    F.round(F.avg("salary"), 0).alias("avg_salary_round"),
    (F.sum("salary") / 1000000).alias("salary_millions"),
).orderBy("avg_salary_round", ascending=False)

# === 7. Joins (most common types) =======================================
orders = spark.read.csv(
    "orders.csv",
    header=True,
    inferSchema=True,
)
customers = spark.read.csv(
    "customers.csv",
    header=True,
    inferSchema=True,
)

# Inner join (the default join type), written with explicit keywords.
df_joined = orders.join(customers, on="customer_id", how="inner")

# Other join types are selected through `how`: left / right / full / anti / cross.
df_left = orders.join(customers, on="customer_id", how="left")
# left_anti keeps the rows of `orders` that have no match in `customers`.
df_anti = orders.join(customers, on="customer_id", how="left_anti")

# Multi-column join, where the condition is built from Column comparisons:
# df.join(other_df,
#         (df.customer_id == other_df.id) & (df.country == other_df.country),
#         "left")

# === 8. Window functions (ranking, running totals, etc) ================
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Per-department window, highest salary first.
window_spec = (
    Window.partitionBy("department")
    .orderBy(F.desc("salary"))
)

# Three ranking flavours computed over the same window.
df_with_rank = (
    df
    .withColumn("rank", F.rank().over(window_spec))
    .withColumn("dense_rank", F.dense_rank().over(window_spec))
    .withColumn("row_number", F.row_number().over(window_spec))
)

# Running total: the frame runs from the first row of the partition up to and
# including the current row, in hire_date order.
window_cum = (
    Window.partitionBy("department")
    .orderBy("hire_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df = df.withColumn(
    "cumulative_salary",
    F.sum("salary").over(window_cum),
)

# === 9. Writing results =================================================
# Parquet, replacing whatever already exists at the target path.
df.write.mode("overwrite").parquet("s3://bucket/results/employees.parquet/")

# Parquet, appended and partitioned by two columns.
# NOTE(review): assumes `df` has "year" and "month" columns - nothing earlier
# in this script creates them; confirm against the real data.
(
    df.write
    .mode("append")
    .partitionBy("year", "month")
    .parquet("output/")
)

# CSV output (less common in big data, but still used): header row, gzip-compressed.
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .option("compression", "gzip")
    .csv("output/my_result_csv/")
)

# Single output file: coalesce(1) reduces the data to one partition before the
# write, so use it for small data only.
(
    df.coalesce(1)
    .write
    .mode("overwrite")
    .csv("small_result/", header=True)
)

# === 10. Quick one-liners you use all the time ========================
# Mark the DataFrame for caching (.persist() is the configurable variant);
# worthwhile when the same data is reused across several actions.
df.cache()
# Release the cached data again once it is no longer needed.
df.unpersist()

# Register the DataFrame under a name so it can be queried with SQL.
df.createOrReplaceTempView("employees")
spark.sql(
    "SELECT department, AVG(salary) FROM employees GROUP BY department"
)

# Without arguments explain() prints the physical plan only.
df.explain()
# "extended" additionally prints the logical plans ahead of the physical one.
df.explain("extended")

# Stop the session when finished.
# (usually done automatically in notebooks)
spark.stop()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8
0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
1,-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000
2,-118.300000,34.260000,43.000000,1510.000000,310.000000,809.000000,277.000000,3.599000,176500.000000
3,-117.810000,33.780000,27.000000,3589.000000,507.000000,1484.000000,495.000000,5.793400,270500.000000
4,-118.360000,33.820000,28.000000,67.000000,15.000000,49.000000,11.000000,6.135900,330000.000000
