# Chapter 1

### Spark session

```python
from pyspark.sql import SparkSession

# Create a SparkSession (getOrCreate reuses an existing one if present).
# The builder chain is wrapped in parentheses instead of being joined with
# backslashes: a comment after a line-continuation backslash is a SyntaxError.
spark = (
    SparkSession.builder
    .master('local[*]')  # Location of cluster, use all cores of local computer
    .appName("Load and Query CSV with SQL")
    .getOrCreate()
)
# Define schema
# The schema classes live in pyspark.sql.types and must be imported before use.
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# One StructField (column name, column type) per column
schema = StructType([
    StructField("col1", StringType()),
    StructField("col2", IntegerType()),
    StructField("col3", DoubleType()),
])
# Read the CSV file into a DataFrame; reader options are gathered in one dict
# (an explicit schema could be supplied with schema=schema instead of inferring it)
csv_options = {
    'sep': ',',
    'header': True,
    'inferSchema': True,
    'nullValue': 'NA',
}
df = spark.read.csv("file.csv", **csv_options)

# Inspect the column types
df.printSchema()
df.dtypes

# Make the DataFrame queryable from SQL as a temporary view
df.createOrReplaceTempView("my_table")

# Show the tables currently listed in the catalog
tables = spark.catalog.listTables()
print(tables)

# Query the view with SQL and display the result
sql_text = "SELECT * FROM my_table WHERE column_name = 'value'"
query_result = spark.sql(sql_text)
query_result.show()

# Access the SparkContext from the SparkSession
sc = spark.sparkContext
# Create a SparkSession from that SparkContext
spark = SparkSession(sc)
# Stop the SparkSession
spark.stop()
```

# Chapter 2

```python
from pyspark.sql.functions import col, explode, lower, regexp_replace, split

# NOTE: these look like independent recipes, each assuming `df` already has
# the column it names — confirm before running them back to back.

# lowercase the strings in a column
df = df.select(lower(col('col_name')))
# replace string or characters
df = df.select(regexp_replace('col_name', 'old', 'new').alias('new_col'))
# Split a string on space
df = df.select(split('string_col', '[ ]').alias('word_list'))
# Split string using any given symbol
# Raw string: same value as before, but without invalid escape sequences
# such as "\?" that Python warns about in a normal string literal.
punctuation = r"""_|.\?\!",'\[\]\*()"""
df = df.select(split('string_col', '[ %s]' % punctuation).alias('word_list'))
# Explode the string list column so that each row contains one value of list
df = df.select(explode('word_list').alias('word'))
# Filter out empty strings; this has to run after the explode, because
# 'word_list' is an array column and cannot be compared with '' directly
df = df.filter(col('word') != '')

### dealing with NLP related features
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, Tokenizer

# 1. Replace unwanted characters (commas and hyphens) with a space
REGEX = '[,\\-]'
cleaned_text = regexp_replace(df.text, REGEX, ' ')
df = df.withColumn('text', cleaned_text)

# 2. Tokenize the text column into the "tokens" column
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
df = tokenizer.transform(df)

# 3. Remove stop words from the tokens, writing the result to "words"
stopwords = StopWordsRemover(inputCol='tokens', outputCol='words')
# Take a look at the list of stop words when stopwords = StopWordsRemover()
stopwords.getStopWords()
df = stopwords.transform(df)

# 4. Hash the words into the "hash" column using 32 features
hasher = HashingTF(inputCol="words", outputCol="hash", numFeatures=32)
df = hasher.transform(df)

# 5. Normalize the text features (TF-IDF): fit the IDF model, then apply it
idf_model = IDF(inputCol="hash", outputCol="features").fit(df)
df = idf_model.transform(df)

```