# Chapter 1

### Spark Session

```
from pyspark.sql import SparkSession

# Create a SparkSession.
# The builder chain is wrapped in parentheses rather than joined with
# backslashes: a backslash continuation cannot be followed by a comment.
spark = (
    SparkSession.builder
    .master('local[*]')  # Location of cluster, use all cores of local computer
    .appName("Load and Query CSV with SQL")
    .getOrCreate()
)
# Define schema (the type classes live in pyspark.sql.types)
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructField, StructType

schema = StructType([
    StructField("col1", StringType()),
    StructField("col2", IntegerType()),
    StructField("col3", DoubleType()),
])
# Load the CSV file into a DataFrame.
# Column types are inferred here; to use the explicit schema defined above,
# pass schema=schema in place of inferSchema=True.
df = spark.read.csv("file.csv", sep=',', header=True, inferSchema=True, nullValue='NA')  # schema=schema
# Check column types
df.printSchema()
df.dtypes
# Make the DataFrame queryable from SQL by registering it as a temporary view
df.createOrReplaceTempView("my_table")
# Show which tables/views the catalog currently knows about
catalog_tables = spark.catalog.listTables()
print(catalog_tables)
# Query the view with plain SQL and display the matching rows
sql_text = "SELECT * FROM my_table WHERE column_name = 'value'"
query_result = spark.sql(sql_text)
query_result.show()

# Moving between SparkSession and SparkContext
sc = spark.sparkContext  # The SparkContext behind the current session
spark = SparkSession(sc)  # Build a SparkSession on top of an existing SparkContext
spark.stop()  # Shut the SparkSession down
```

# Chapter 2

### String Manipulation

```
from pyspark.sql.functions import col, explode, lower, regexp_replace, split

# Lowercase the strings in a column
df = df.select(lower(col('col_name')))
# Replace a string or characters (the pattern is a regular expression)
df = df.select(regexp_replace('col_name', 'old', 'new').alias('new_col'))
# Split a string on space
df = df.select(split('string_col', '[ ]').alias('word_list'))
# Split string using any given symbol.
# Backslashes are doubled so the literal contains no invalid escape sequences;
# the resulting string value is unchanged.
punctuation = "_|.\\?\\!\",'\\[\\]\\*()"
df = df.select(split('string_col', '[ %s]' % punctuation).alias('word_list'))
# Explode the string list column so that each row contains one value of list
df = df.select(explode('word_list').alias('word'))
# Filter out empty strings produced by the split. This runs after the explode
# because 'word_list' is an array column and cannot be compared to a string.
df = df.filter(col('word') != '')
# Count occurrences of each word per (col1, col2) group, one column per word.
# NOTE(review): the select() calls above keep only the selected column, so
# 'col1' and 'col2' must be carried through for this line to run as written.
pivot_df = df.groupBy('col1', 'col2').pivot('word').count()

### NLP feature preparation
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF

# Swap unwanted characters (commas and hyphens) for a space
REGEX = '[,\\-]'
cleaned_text = regexp_replace(df.text, REGEX, ' ')
df = df.withColumn('text', cleaned_text)

# Break the text into tokens
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
df = tokenizer.transform(df)

# Drop stop words from the token lists
stopwords = StopWordsRemover(inputCol='tokens', outputCol='words')
stopwords.getStopWords()  # Inspect the stop-word list in use (also works on a bare StopWordsRemover())
df = stopwords.transform(df)

# Hash the remaining words into a fixed-size feature vector
hasher = HashingTF(inputCol="words", outputCol="hash", numFeatures=32)
df = hasher.transform(df)

# Re-weight the hashed term frequencies with IDF (TF-IDF)
idf_model = IDF(inputCol="hash", outputCol="features").fit(df)
df = idf_model.transform(df)


```

# Chapter 3

### Feature Engineering

```
# Binarizing (create column with value to 0 or 1)
from pyspark.ml.feature import Binarizer
# Cast the input column to double before handing it to the Binarizer
df = df.withColumn('val', df['val'].cast('double'))
# Named `binarizer` rather than `bin` so the builtin bin() is not shadowed
binarizer = Binarizer(threshold=0.0, inputCol='val', outputCol='binary_col')
df = binarizer.transform(df)

# Bucketing: map a continuous column onto discrete bucket indices
from pyspark.ml.feature import Bucketizer
# Bucket boundaries 0, 1, 2, 3, 4 plus an open-ended upper bound
splits = [*range(5), float('Inf')]
# Build the transformer that reads 'BATHSTOTAL' and writes 'baths'
buck = Bucketizer(inputCol='BATHSTOTAL', outputCol='baths', splits=splits)
# Run it over the DataFrame
df = buck.transform(df)

# One-hot encoding (usable as stages in a PySpark Pipeline and with PySpark ML models)
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
# Step 1: map each category string to a numeric index
string_indexer = StringIndexer(inputCol='cat_col', outputCol='Cat_Index')
indexer_model = string_indexer.fit(df)
indexed_df = indexer_model.transform(df)
# Step 2: one-hot encode the numeric indices
encoder = OneHotEncoder(inputCol='Cat_Index', outputCol='Onehot_feature')
encoder_model = encoder.fit(indexed_df)
encoded_df = encoder_model.transform(indexed_df)

# Chain several steps together with a Pipeline
from pyspark.ml import Pipeline
# Candidate feature columns: every DataFrame column except 'some_null_col'
features_cols = list(df.columns)
features_cols.remove('some_null_col')
# Replace nulls with -1 first: the VectorAssembler should not be fed nulls
df = df.fillna(-1)
# Columns to assemble into the feature vector (features_cols is the alternative)
assembler_inputs = ["feature1", "feature2", "Onehot_feature"]
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
# A model can be appended as the last stage, e.g. stages=[..., model]
pipeline_stages = [string_indexer, encoder, vec_assembler]
pipeline = Pipeline(stages=pipeline_stages)
pipeline_model = pipeline.fit(df)
transformed_df = pipeline_model.transform(df)

# Split the assembled data into training and test sets.
# NOTE(review): the 80/20 ratio and the seed are placeholders; adjust them
# for your data. Assumes transformed_df still carries the label column.
train_df, test_df = transformed_df.randomSplit([0.8, 0.2], seed=42)

# Create Model
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features", labelCol="SALESCLOSEPRICE",
                           predictionCol="Prediction_Price", seed=42)
model = rf.fit(train_df)  # Train model
predictions = model.transform(test_df)  # Adds the 'Prediction_Price' column
# Save model. save() fails if the path already exists;
# model.write().overwrite().save('rfr_model') replaces an earlier copy.
model.save('rfr_model')
from pyspark.ml.regression import RandomForestRegressionModel
model2 = RandomForestRegressionModel.load('rfr_model')  # Load the model
# Evaluate Model: one evaluator, with the metric overridden per call
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="SALESCLOSEPRICE", predictionCol="Prediction_Price")
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

# Feature importance
import pandas as pd
# Convert feature importances to a pandas column
importance_df = pd.DataFrame(model.featureImportances.toArray(), columns=['importance'])
# Create a new column to hold feature names. The list built earlier is named
# `features_cols`.
# NOTE(review): the names line up with the importances only when the
# VectorAssembler was built with inputCols=features_cols and each input
# contributes a single slot; a one-hot vector expands into several slots.
importance_df['features'] = pd.Series(features_cols)
# Sort the data based on feature importance, most important first
importance_df = importance_df.sort_values(by=['importance'], ascending=False)
```

### Machine Learning

```
# One-hot encoding
from pyspark.ml.feature import StringIndexer, OneHotEncoder
# StringIndexer assigns an index to each category. handleInvalid="keep" puts
# categories not seen during fit into an extra bucket, so an unseen category
# in the test set does not raise an error.
string_indexer1 = StringIndexer(inputCol="cat_col", outputCol="string_index1", handleInvalid="keep")
# One-hot encoding using the category indices
one_hot_encoder1 = OneHotEncoder(inputCol="string_index1", outputCol="onehot_feature1")
# Second categorical column: the pipeline and the assembler below expect two
# encoded features. 'cat_col2' is a placeholder column name.
string_indexer2 = StringIndexer(inputCol="cat_col2", outputCol="string_index2", handleInvalid="keep")
one_hot_encoder2 = OneHotEncoder(inputCol="string_index2", outputCol="onehot_feature2")
# Combine all features; the one-hot names must match the encoders' outputCol values
from pyspark.ml.feature import VectorAssembler
vec_assembler = VectorAssembler(inputCols=["feature1", "feature2", "feature3", "onehot_feature1", "onehot_feature2"], outputCol="features")

# Define the model
from pyspark.ml.classification import (
    LogisticRegression,
    MultilayerPerceptronClassifier,
    RandomForestClassifier,
)
from pyspark.ml.clustering import KMeans
from pyspark.ml.regression import LinearRegression

model_rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=10)
model_lr1 = LogisticRegression(featuresCol="features", labelCol="label")
# elasticNetParam = 0 is ridge regression, elasticNetParam = 1 is lasso regression (regularization parameters)
model_lr2 = LinearRegression(featuresCol="features", labelCol="label", elasticNetParam=0, regParam=0.1)
model_kmeans = KMeans(featuresCol="features", predictionCol="kmeans_prediction", k=3)
# Deep learning
# Numeric feature columns fed to the assembler.
# NOTE(review): the "+ 2" assumes each of the two one-hot features adds exactly
# one slot; the input layer size must equal the length of the assembled
# "features" vector, so confirm this against your data.
feature_cols = ["feature1", "feature2", "feature3"]
layers = [len(feature_cols) + 2, 5, 2]  # Input layer size, hidden layer sizes, output layer size
model_dl = MultilayerPerceptronClassifier(layers=layers, labelCol="label", featuresCol="features", seed=123)
# Pipeline: the last stage is the model. model_lr1 is used here; swap in any
# of the models defined above.
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[string_indexer1, one_hot_encoder1, string_indexer2, one_hot_encoder2, vec_assembler, model_lr1])

# Create the parameter grid
import numpy as np
import pyspark.ml.tuning as tune
# The chain is wrapped in parentheses so every .addGrid() line is part of the
# same expression. The grid refers to the params of the logistic regression
# estimator (model_lr1), which must be the same instance that is tuned.
paramGrid = (
    tune.ParamGridBuilder()
    .addGrid(model_lr1.regParam, np.arange(0, .1, .01))  # 0.00, 0.01, ..., 0.09
    .addGrid(model_lr1.elasticNetParam, [0, 1])  # 0 = ridge, 1 = lasso
    .build()
)

# Evaluation metric
import pyspark.ml.evaluation as evals
from pyspark.ml.tuning import CrossValidator
evaluator_logistic = evals.BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
evaluator_reg = evals.RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_rf_dl = evals.MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# Create cross-validator. Pass the evaluator that matches the model being
# tuned; evaluator_logistic is used here for binary classification.
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator_logistic,
                    numFolds=3)

# Split the data into training and test sets.
# The raw DataFrame is split, not transformed_df: the pipeline does its own
# indexing, encoding and assembling, and its VectorAssembler cannot write a
# "features" column into data that already has one.
training, test = df.randomSplit([.6, .4])

cvModel = cv.fit(training)  # Fit the dataframe
bestModel = cvModel.bestModel  # Best model (a fitted pipeline)
final_stage = bestModel.stages[-1]  # The fitted model is the last pipeline stage
bestParams = final_stage.extractParamMap()  # See best parameters
test_results = bestModel.transform(test)  # Use the model to predict the test set
predictions = cvModel.transform(test)  # Predict using testing set
# Accuracy comes from the multiclass evaluator configured with metricName="accuracy"
accuracy = evaluator_rf_dl.evaluate(predictions)
test_results.groupBy("label", "prediction").count().show()  # Confusion matrix
# Only some fitted models (e.g. random forests) expose featureImportances;
# this is None when the final stage has no such attribute.
feature_importances = getattr(final_stage, "featureImportances", None)
```