# Chapter 1

### pyspark visualization

```
# Create a pandas dataframe from a pyspark dataframe for visualization.
# toPandas() pulls every row onto the driver, so sample first to keep the result small.
sample_df = large_df.sample(withReplacement=False, fraction=0.3, seed=42) # Randomly sample roughly 30% of the rows
sample_pandas_df = sample_df.toPandas() # Convert to pandas dataframe
sns.histplot(sample_pandas_df[numeric_column], kde=True, color='skyblue') # See distribution plot

# Visualization : Pyspark_dist_explore, pandas (NOT RECOMMENDED), HandySpark(RECOMMENDED)
pandas_df = spark_df.toPandas()
# NOTE(review): toHandy() is expected to be patched onto DataFrame by `from handyspark import *` - confirm
handy_df = spark_df.toHandy() # Convert to handyspark dataframe
handy_df.cols["col_name"].hist()
spark_df = handy_df.notHandy() # Convert back to a plain pyspark dataframe
```

# Chapter 2

### Deal with missing values

```
# No of missing values
df.where(df['col_name'].isNull()).count()
# Visualise missing values with heatmap
pandas_df = spark_df.toPandas()
sns.heatmap(data=pandas_df.isnull())
# Drop any records with NULL values
df = df.dropna()
# Drop records only if both columns are NULL (e.g. LISTPRICE and SALESCLOSEPRICE)
df = df.dropna(how='all', subset=['col1', 'col2'])
# Keep only records that have at least two non-NULL values (thresh overrides `how`)
df = df.dropna(thresh=2)
# Drop columns with >30% missing values (col_list holds the names of those columns)
df = df.drop(*col_list)
# Replace missing values (DataFrames are immutable, so keep the returned frame)
col_mean = df.agg({'col_name': 'mean'}).collect()[0][0]
df = df.fillna(col_mean, subset=['col_name'])
# Drop duplicates (rows with the same value in the listed columns)
df = df.dropDuplicates(['col_name'])
```

### Scaling

```
# Compute all four statistics in a single aggregation pass instead of four separate jobs
from pyspark.sql import functions as F
max_val, min_val, mean_val, std_val = df.agg(
    F.max('col_name'),
    F.min('col_name'),
    F.mean('col_name'),
    F.stddev('col_name'),
).first()
# Min-max scaling: maps the column onto [0, 1]
df = df.withColumn("min_max_scaled_col", (df['col_name'] - min_val) / (max_val - min_val))
# Standard scaling: zero mean, unit (sample) standard deviation
df = df.withColumn("standard_scaled_col", (df['col_name'] - mean_val) / std_val)
# Feature scaling: divide by the maximum value
df = df.withColumn("feature_scaled_col", df['col_name'] / max_val)
# Log transformation
from pyspark.sql.functions import log
df = df.withColumn('log_col', log(df['col_name']))
```

# Chapter 3

### pySpark Dataframe

```
# Create dataframe from RDD
spark_df = spark.createDataFrame(RDD, schema=colname_list)

# Loading file (folder name will make the spark load all files in that folder in parallel mode)
from pyspark.sql.types import StructType, StructField, StringType
dataSchema = StructType([StructField('col1', StringType(), nullable=True),
                         StructField('col2', StringType(), nullable=False)])
df = spark.read.csv("file.csv", header=True, schema=dataSchema, comment='#', sep=',') # also .json, .text, .parquet
# Alternative way : generic reader; the path is the first argument of load()
df = spark.read.format('csv').options(header=True).load('filename.csv') # schema=dataSchema
df.write.parquet('filename.parquet', mode='overwrite') # Save file (parquet is more efficient, binary format for big data)
# Alternative way : generic writer; overwrite so an existing output path does not raise
df.write.format('parquet').mode('overwrite').save('filename.parquet')
# Functions used by the snippets below
from pyspark.sql.functions import array_contains, col, explode, lit, monotonically_increasing_id, split
df.show(3) # Show first 3 rows
df.collect() # Return all rows to the driver as a list of Row objects
df.limit(3) # Returns a new DataFrame with at most 3 rows (lazy; unlike show, prints nothing)
df.dtypes # See datatype of each column
df.printSchema() # See schema information
result.columns # See result table columns
df.filter(~ col('col').isNull()) # Keep only the rows where 'col' is not NULL
df = df.na.drop(subset=["col_name"]) # Drop rows with nulls in col_name
df = df.drop("col_name") # Drop column (drop takes column names, not subset=)
df = df.dropDuplicates() # Drop duplicates
df = df.withColumn("col_name", col("col_name").cast("float"))  # Way 1 : Casting a column to another data type
df = df.withColumn("col_name", df.col_name.cast("float")) # Way 2 : Casting a column to another data type
from pyspark.sql.types import IntegerType # Remember: ArrayType is homogeneous, use StructType([]) for heterogeneity
df = df.withColumn('casted_col', df['col_name'].cast(IntegerType())) # Way 3 : Casting
df.describe().show() # Summary stats
df.agg({'col_name':'max'}).first()[0] # Maximum value of a column
df = df.repartition(4, 'some_col') # Create 4 partitions; rows with the same some_col value land in the same partition
print(df.rdd.getNumPartitions()) # See no of partitions of the dataset
df = df.coalesce(num_partitions) # Reduce the number of partitions without a full shuffle
df_split = df.withColumn("DataSplit", split(df["Data"], ",")) # Split values and store as list in a column
df_explode = df_split.withColumn("ExplodedData", explode(df_split["DataSplit"])) # Store each element of list in separate rows
df_pivot = df_explode.groupBy("Key").pivot("ExplodedData").count().fillna(0) # pivot count of each element in separate columns
df = df.select(df.col1, df.col2, df.col3) # way1 : select column from dataframe
df = df.select("col1", "col2") # way2 : select column from dataframe
df.select(col('col1'), col('col2')) # way3 : select column from dataframe,  import col from sql.functions
df = df.withColumn("new_col",df.old_col+10) # Add a new result column
df = df.withColumnRenamed("old_col_name", "new_col_name") # Rename column
df = df.select(col('col1').alias('col1_renamed'), 'col2') # Rename while selecting
df = df.selectExpr("col1", "col2", "col3", "col1/(col2/60) as another_col") # Select with SQL expressions
df = df.withColumn("idx", monotonically_increasing_id()) # Creating id column (unique and increasing, not consecutive)
df.where(array_contains('col', 'abc')) # Check if an element is inside an array
df1 = df1.withColumn("source", lit("df1")) # Adding constants in a column

from pyspark.sql.functions import broadcast
df_vertical = df1.union(df2) # Vertical join (append rows vertically; columns are matched by position)
df_horizontal = df1.join(df2, on=['common_col1', 'common_col2'], how="left") # Append columns horizontally with join
joined_df = df1.join(df2, df1["colx"] == df2["coly"], how="inner") # Alternative way : Join
# Broadcast the smaller frame: each node gets its own copy, which avoids excess communication between nodes
combined_df = df1.join(broadcast(df2), on=['common_col1', 'common_col2'])
df_cross = df1.crossJoin(df2) # Cross Join (Horizontally appending columns of possible combinations)

# Filtering (filter and where are aliases; SQL-string and Column conditions produce the same results)
df.filter("col_name > 120").show() # show() only prints and returns None, so do not assign its result
df = df.where("Value > 120")
df.filter(df.col_name > 120).show()
df = df.where(df.Value > 120)
filterA = df.col1 == "SEA"
filterB = df.col2 == "PDX"
result = df.filter(filterA).filter(filterB) # Chaining filters (rows must satisfy both)
df.groupBy("col_name").count() # Group by and count
df.orderBy("col_name") # order by
df.filter(df.col == 'value').groupBy().max("another_col") # Multiple chaining aggregation

df.createOrReplaceTempView("table_name") # Register DataFrame as a temporary table in catalog
spark.catalog.listTables() # See all table information in the catalog
spark.catalog.dropTempView('table_name') # Remove temp table from catalog
spark_df = spark.table("table_name") # Start using a spark table as spark dataframe
result = spark.sql("SELECT * FROM table_name") # Run query on table

# Using PYSPARK CUSTOM FUNCTION
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
def double_val(col):
    """Return ``col`` multiplied by 2, passing ``None`` through unchanged.

    A function wrapped as a UDF is called once per row and receives ``None``
    for NULL cells, so NULLs are returned as-is instead of raising TypeError.
    Make sure the returned value matches the return type declared for the UDF.
    """
    if col is None:
        return None
    return col * 2
double_val_udf = udf(double_val, IntegerType()) # Register UDF with custom function and return type
df = df.withColumn("DoubledCol", double_val_udf(df["col"])) # Apply the registered UDF to the column

## Visualization : Pyspark_dist_explore, pandas (NOT RECOMMENDED), HandySpark(RECOMMENDED)
pandas_df = spark_df.toPandas() # Collects every row onto the driver
# NOTE(review): toHandy() is expected to be patched onto DataFrame by `from handyspark import *` - confirm
handy_df = spark_df.toHandy() # Convert to handyspark dataframe
handy_df.cols["col_name"].hist()
spark_df = handy_df.notHandy() # Convert back to a plain pyspark dataframe

## NOTE
# Dense array: [1.0, 0.0, 0.0, 3.0]
# Same values as a sparse vector: (4, [0, 3], [1.0, 3.0]) = (size, [indices of non-zero values], [non-zero values])
```

### PySpark Feature engineering

```
# Binarizing (create column with value to 0 or 1)
from pyspark.ml.feature import Binarizer
df = df.withColumn('val', df['val'].cast('double')) # Binarizer needs a double (or vector) input column
# Values greater than the threshold become 1.0, all others 0.0
binarizer = Binarizer(threshold=0.0, inputCol='val', outputCol='binary_col') # Not named `bin`: that would shadow the builtin
df = binarizer.transform(df)

# Bucketing
from pyspark.ml.feature import Bucketizer
# n+1 split points define n buckets: [0,1), [1,2), [2,3), [3,4), [4,Inf]
splits = [0, 1, 2, 3, 4, float('Inf')]
# Create bucketing transformer
buck = Bucketizer(splits=splits, inputCol='BATHSTOTAL', outputCol='baths')
# Apply transformer
df = buck.transform(df)

# One-hot encoding (Can be used with PYSPARK PIPELINE and PYSPARK MACHINE LEARNING model)
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
string_indexer = StringIndexer(inputCol='cat_col', outputCol='Cat_Index') # Map strings to numbers with string indexer
indexed_df = string_indexer.fit(df).transform(df) # Fit learns the string-to-index mapping, transform adds Cat_Index
encoder = OneHotEncoder(inputCol='Cat_Index', outputCol='Onehot_feature') # Onehot encode indexed values
encoded_df = encoder.fit(indexed_df).transform(indexed_df) # Adds Onehot_feature as a vector column

# Using Pipeline to do many steps at once
from pyspark.ml import Pipeline
features_cols = list(df.columns) # Start from the full list of column names
features_cols.remove('some_null_col') # Remove a column that must not be a feature (e.g. the dependent variable or a null-heavy column)
df = df.fillna(-1) # Vector Assembler should not take in any nulls
vec_assembler = VectorAssembler(inputCols=["feature1", "feature2", "Onehot_feature"], outputCol="features") # or inputCols=features_cols
pipeline = Pipeline(stages=[string_indexer, encoder, vec_assembler]) # Last stage is model: eg : stages=[.., model]
pipeline_model = pipeline.fit(df) # Fits every stage in order on df
transformed_df = pipeline_model.transform(df) # Runs all fitted stages and adds the "features" column

# SPLIT DATA (train_df / test_df must exist before this point; see "pyspark split data" in Chapter 4)

# Create Model
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features", labelCol="SALESCLOSEPRICE",
                    predictionCol="Prediction_Price", seed=42 )
model = rf.fit(train_df) # Train model
predictions = model.transform(test_df) # Adds the Prediction_Price column to test_df
model.save('rfr_model') # Save model
from pyspark.ml.regression import RandomForestRegressionModel
model2 = RandomForestRegressionModel.load('rfr_model') # Load the model
# Evaluate Model
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="SALESCLOSEPRICE", predictionCol="Prediction_Price")
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"}) # Override the metric per call: root mean squared error
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"}) # Same evaluator, R-squared this time

# Feature importance
import pandas as pd
# Convert feature importances to a pandas column
importance_df = pd.DataFrame(model.featureImportances.toArray(), columns=['importance'])
# Create a new column to hold feature names.
# NOTE(review): features_cols must list the assembled features in vector order (a one-hot column
# expands to several slots) - confirm it matches the VectorAssembler input.
importance_df['features'] = pd.Series(features_cols)
importance_df.sort_values(by=['importance'], ascending=False, inplace=True) # Sort the data based on feature importance
```

# Chapter 4

### pyspark split data

```
########### Splitting non-sequence data
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

########### Splitting sequence data with inherent sequence (eg : Time Series)
# Assumes df['DATE'] is a date/timestamp column - TODO confirm for your data
from datetime import timedelta
# Collect the minimum and maximum date to the driver as plain Python values
min_date = df.agg({'DATE': 'min'}).collect()[0][0]
max_date = df.agg({'DATE': 'max'}).collect()[0][0]
# Find how many days our data spans (plain Python arithmetic: datediff() would return a Column,
# which cannot be passed to the builtin round())
range_in_days = (max_date - min_date).days
# Find the date to split the dataset on
split_in_days = round(range_in_days * 0.8) # Find 80% date split point
split_date = min_date + timedelta(days=split_in_days) # Add split point to the minimum date to get the split date
# Split the data into 80% train, 20% test (by time span, not by row count)
train_df = df.where(df['DATE'] < split_date) # Use filtering with split date to take only training data
test_df = df.where(df['DATE'] >= split_date) # Use filtering with split date to take only testing data
```