# Chapter 1

### Spark dataframe

```
# Create dataframe from RDD
spark_df = spark.createDataFrame(RDD, schema=colname_list)
# Loading file
df = spark.read.csv("file.csv", header=True, inferSchema=True) # Other readers: spark.read.json(...), spark.read.text(...)
df.show(3)
df.printSchema() # See schema information
df.columns # List the DataFrame's column names
df.describe().show() # Summary stats
df.createOrReplaceTempView("table_name") # Register DataFrame as a temporary view
result = spark.sql("SELECT * FROM table_name") # Run query on table
spark_df = spark.table("table_name") # Load a registered table/view as a Spark DataFrame
# Add a new column
df = df.withColumn("new_col",df.old_col+10)
# Selecting column
df = df.select("col1", "col2", "col3") # way 1: column names as strings
df.select(df.col1, df.col2) # way 2: column attributes
from pyspark.sql.functions import col # way 3: col() expressions
df.select(col('col1'), col('col2'))
calculated_col = (df.col1/(df.col2/60)).alias("another_col")
df = df.select("col1", "col2", "col3", calculated_col)
df = df.selectExpr("col1", "col2", "col3", "col1/(col2/60) as another_col")
df = df.select(col('col1').alias('col1_renamed'), 'col2')
# Filtering (both forms produce the same result)
df.filter("col_name > 120").show()
df.filter(df.col_name > 120).show()
# Chaining filters
filterA = df.col1 == "SEA"
filterB = df.col2 == "PDX"
result = df.filter(filterA).filter(filterB)

df.groupBy("col_name").count().show() # Group by and count
df.orderBy("col_name").show(3) # Order by column and show the first 3 rows
# Aggregation
df.filter(df.col == 'value').groupBy().max("another_col").show()
df = df.na.drop(subset=["col_name"]) # Drop nulls
df = df.dropDuplicates() # Drop duplicates
# Rename column
df = df.withColumnRenamed("old_col_name", "new_col_name")

# Casting / Converting column type
from pyspark.sql.functions import col
df = df.withColumn("col_name", col("col_name").cast("float"))
df = df.withColumn("col_name", df.col_name.cast("float"))

# SQL with dataframe
df.createOrReplaceTempView("table_name")
df2 = spark.sql("SELECT * FROM table_name")
result = df2.collect() # Collect the DataFrame into a list of Row objects that you can iterate over

## Visualization : Pyspark_dist_explore, pandas (NOT RECOMMENDED), HandySpark(RECOMMENDED)
pandas_df = spark_df.toPandas() # Collects all rows to the driver; only use on small data
handy_df = spark_df.toHandy() # Convert to handyspark dataframe
handy_df.cols["col_name"].hist()
spark_df = handy_df.to_spark() # Convert to pyspark dataframe
```

### Spark SQL

```
df = spark.read.csv("filename.csv", header=True)
df.createOrReplaceTempView("table_name")
result = spark.sql("SELECT * FROM table_name") # simple query, result saved as dataframe
result.show()
result = spark.sql("DESCRIBE table_name") # See table information

# Window functions
query = """
SELECT *,
ROW_NUMBER() OVER(PARTITION BY train_id ORDER BY time) AS id
FROM schedule
"""
spark.sql(query)
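# Equivalent dot notation (requires these imports)
from pyspark.sql import Window
from pyspark.sql.functions import row_number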
window = Window.partitionBy('train_id').orderBy('time')
dfx = df.withColumn('id', row_number().over(window))


```