# Chapter 1

### Spark dataframe

```
# Create a DataFrame from an existing RDD; `schema` supplies the column names
spark_df = spark.createDataFrame(RDD, schema=colname_list)
# Load a file (passing a folder name makes Spark load all files in that folder in parallel)
df = spark.read.csv("file.csv", header=True, inferSchema=True) # .json, .txt, .load for parquet
df.show(3) # Print the first 3 rows
df.printSchema() # See schema information
df.describe().show() # Summary stats
df.createOrReplaceTempView("table_name") # Register DataFrame as a temporary view
result = spark.sql("SELECT * FROM table_name") # Run query on table
result.columns # See result table columns (only valid once `result` exists)
spark_df = spark.table("table_name") # Start using a Spark table as a Spark DataFrame
# Add a new column computed from an existing one
df = df.withColumn("new_col", df.old_col + 10)
# Selecting columns: three interchangeable ways
from pyspark.sql.functions import col
df = df.select("col1", "col2", "col3") # way1: column names as strings
df.select(df.col1, df.col2) # way2: Column attributes of the DataFrame
df.select(col('col1'), col('col2')) # way3: the col() function
# Selecting a computed column: build the Column expression, name it with alias()
calculated_col = (df.col1/(df.col2/60)).alias("another_col")
df = df.select("col1", "col2", "col3", calculated_col)
# The same computation written as a SQL expression string
df = df.selectExpr("col1", "col2", "col3", "col1/(col2/60) as another_col")
# Rename a column while selecting
df = df.select(col('col1').alias('col1_renamed'), 'col2')
# Filtering (both forms produce the same result)
df.filter("col_name > 120").show() # condition as a SQL expression string
df.filter(df.col_name > 120).show() # condition as a Column expression
# Chaining filters: each filter() narrows the previous result, so the
# two conditions are combined with AND
filterA = df.col1 == "SEA"
filterB = df.col2 == "PDX"
result = df.filter(filterA).filter(filterB)
from pyspark.sql.functions import monotonically_increasing_id
# Creating an id column (values are unique and increasing, but not consecutive)
df = df.withColumn("idx", monotonically_increasing_id())
df.groupBy("col_name").count().show() # Group by and count
df.orderBy("col_name").show(3) # Order by a column and show the first 3 rows
# Aggregation: groupBy() with no columns aggregates over the whole (filtered) DataFrame
df.filter(df.col == 'value').groupBy().max("another_col").show()
df = df.na.drop(subset=["col_name"]) # Drop rows where col_name is null
df = df.dropDuplicates() # Drop duplicate rows
# Rename column
df = df.withColumnRenamed("old_col_name", "new_col_name")

# Casting / converting a column's type (the two withColumn lines are equivalent;
# both overwrite col_name with its value cast to float)
from pyspark.sql.functions import col
df = df.withColumn("col_name", col("col_name").cast("float"))
df = df.withColumn("col_name", df.col_name.cast("float"))
# Repartitioning based on the values of a column for distributed computing
df = df.repartition(4, 'some_col') # create 4 partitions
print(df.rdd.getNumPartitions()) # check how many partitions the DataFrame now has
# SQL with a DataFrame: register it as a view, then query the view by name
df.createOrReplaceTempView("table_name")
df2 = spark.sql("SELECT * FROM table_name")
result = df2.collect() # DataFrame as a list of rows that you can iterate over

## Visualization : Pyspark_dist_explore, pandas (NOT RECOMMENDED), HandySpark(RECOMMENDED)
# Convert to a pandas DataFrame
pandas_df = spark_df.toPandas()
# NOTE(review): toHandy() is not defined anywhere in this snippet; it presumably
# comes from the handyspark package and needs its import first — confirm.
handy_df = spark_df.toHandy() # Convert to handyspark dataframe
handy_df.cols["col_name"].hist() # Histogram of a single column
spark_df = handy_df.to_spark() # Convert to pyspark dataframe
```

### Spark SQL

```
df = spark.read.csv("filename.csv", header=True) # first line of the file supplies the column names
df.createOrReplaceTempView("table_name") # make the DataFrame queryable from SQL as table_name
result = spark.sql("SELECT * FROM table_name") # simple query, result saved as dataframe
result.show()
# See table information; the name must match the view registered above
result = spark.sql("DESCRIBE table_name")

# Window functions
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# The query reads a view called "schedule", so register the DataFrame under that name
df.createOrReplaceTempView("schedule")
# Number the rows of each train_id, ordered by time
query = """
SELECT *,
ROW_NUMBER() OVER(PARTITION BY train_id ORDER BY time) AS id
FROM schedule
"""
spark.sql(query)

# equivalent dot notation
window = Window.partitionBy('train_id').orderBy('time')
dfx = df.withColumn('id', row_number().over(window))

# CASE.. WHEN
from pyspark.sql.functions import when

# The query reads a view called "df", so register the DataFrame under that name
df.createOrReplaceTempView("df")
# Conditions are tested top to bottom; the first one that matches sets the title
query = """
SELECT id,
    CASE
        WHEN id < 25000 THEN 'Preface'
        WHEN id < 50000 THEN 'Chapter 1'
        WHEN id < 75000 THEN 'Chapter 2'
        ELSE 'Chapter 3'
    END AS title
FROM df
"""
spark.sql(query)

# equivalent dot notation
# (withColumn keeps every existing column of df, whereas the query above
# returns only id and title)
df2 = df.withColumn(
    'title',
    when(df.id < 25000, 'Preface')
    .when(df.id < 50000, 'Chapter 1')
    .when(df.id < 75000, 'Chapter 2')
    .otherwise('Chapter 3'),
)

```

# Chapter 2

### Spark regex

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, regexp_replace

# Start (or reuse) the Spark session used by this example
spark = SparkSession.builder.appName("Regex Example").getOrCreate()

# (name, free text) rows; only some of the texts contain phone numbers
data = [
    ("John", "Contact me at (123) 456-7890 or (456) 789 0123"),
    ("Alice", "No phone number in this text"),
    ("Bob", "Call me at (555) 555-5555 or (555) 123-4567"),
]
df = spark.createDataFrame(data, ["name", "text"])

# Expose the two-column DataFrame to Spark SQL under the view name "df"
df.createOrReplaceTempView("df")

# Pattern matching: rlike() yields True when the text contains a phone number
# written as "(ddd) ddd-dddd" or "(ddd) ddd dddd"
df = df.withColumn("has_phone_number", df["text"].rlike(r'\(\d{3}\) \d{3}[- ]\d{4}'))

# Group matching: the same pattern with three capture groups.
# regexp_extract() returns the requested group of the first match in the text.
phone_pattern = r'\((\d{3})\) (\d{3})[- ](\d{4})'
group_columns = ("area_code", "exchange_code", "subscriber_number")
for group_idx, column_name in enumerate(group_columns, start=1):
    df = df.withColumn(column_name, regexp_extract("text", phone_pattern, group_idx))

# Replacement: swap phone numbers in the text for a fixed placeholder
df = df.withColumn(
    "redacted_text",
    regexp_replace("text", phone_pattern, "PHONE_NUMBER_REDACTED"),
)

# To inspect the result, call df.show(truncate=False) here, while the session is still running

# Release the session's resources
spark.stop()


### Pyspark string

```
from pyspark.sql.functions import col, explode, lower, regexp_replace, split

# lowercase the strings in a column
df = df.select(lower(col('col_name')))
# replace string or characters
df = df.select(regexp_replace('col_name', 'old', 'new').alias('new_col'))
# Split a string on space
df = df.select(split('string_col', '[ ]').alias('word_list'))
# Split a string on a space or any of the given symbols.
# Raw string: the backslashes are part of the regex character class, not
# Python escape sequences (the pattern text is the same as before).
punctuation = r"""_|.\?\!",'\[\]\*()"""
df = df.select(split('string_col', '[ %s]' % punctuation).alias('word_list'))
# Explode the string list column so that each row contains one value of the list
df = df.select(explode('word_list').alias('word'))
# Filter out the empty strings produced by the split. This has to run after
# explode: word_list is an array column and cannot be compared with ''.
df = df.filter(col('word') != '')

```