In [1]:
# Print the version string of the installed PySpark package
import pyspark
print(pyspark.__version__)

2.4.3


sub-modules

* Structured Data - __pyspark.sql__
* Streaming Data - __pyspark.streaming__
* Machine Learning - __pyspark.ml__  (__pyspark.mllib__ is the older RDD-based API, now in maintenance mode)

Spark URL

* remote cluster 
    * spark://13.59.151.161:7077
* local cluster
    * local - 1 core
    * local[4] - 4 cores
    * local[*] - all available cores
    
Spark Session
`from pyspark.sql import SparkSession`


In [None]:
# Import the PySpark module
from pyspark.sql import SparkSession

In [2]:
# Get a session on a local master using all available cores;
# each chained call is joined to the next with a `\` line continuation.
spark = SparkSession\
    .builder\
    .master('local[*]')\
    .appName('sparkApp1')\
    .getOrCreate()

Place a multi-line command inside parentheses to avoid the line-continuation symbol `\`.

In [6]:
# Same builder chain, wrapped in parentheses so no `\` continuations are needed
spark = (
    SparkSession
    .builder
    .master('local[*]')
    .appName('sparkApp1')
    .getOrCreate()
)

In [7]:
# Evaluate the session object so the notebook displays it
spark

In [8]:
# Print the Spark version reported by the running session
print(spark.version)

2.4.3


In [4]:
# End the Spark session (this stops the underlying SparkContext), then start
# a fresh one: the cells below still need a live `spark`, and without it a
# top-to-bottom run fails on the first action after the stop.
spark.stop()
spark = (
    SparkSession
    .builder
    .master('local[*]')
    .appName('sparkApp1')
    .getOrCreate()
)

In [9]:
# Build a one-column DataFrame ('id') holding the integers 0 through 9
df = spark.range(10)

In [10]:
# Show the Python class of the object returned by spark.range
type(df)

pyspark.sql.dataframe.DataFrame

In [11]:
# Number of rows in the DataFrame
df.count()

10

In [12]:
# Print the rows as a text table
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [13]:
# Print each column's name, type and nullability
df.printSchema()

root
 |-- id: long (nullable = false)



In [15]:
# (column name, type) pairs, followed by the plain list of column names
df.dtypes, df.columns

([('id', 'bigint')], ['id'])

#### Read data from a CSV file

In [None]:
# Load cars.csv: the first row is the header, column types are inferred,
# fields are comma-separated and the literal 'NA' is read as null.
cars = spark.read.csv(
    'cars.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

# Preview the loaded rows
cars.show()

In [None]:
# The schema classes must be imported before they are used here; without this
# import the cell raises NameError on a fresh kernel.
from pyspark.sql.types import StructType, StructField, StringType

# specify column types
# NOTE(review): every column is declared as a string, including ones that look
# numeric (cyl, weight, length, ...) — confirm this is intended.
# NOTE(review): "rmp" looks like a typo for "rpm" — confirm against cars.csv
# before renaming, since the schema name becomes the column name.
schema = StructType([
    StructField("maker", StringType()),
    StructField("model", StringType()),
    StructField("origin", StringType()),
    StructField("type", StringType()),
    StructField("cyl", StringType()),
    StructField("size", StringType()),
    StructField("weight", StringType()),
    StructField("length", StringType()),
    StructField("rmp", StringType()),
    StructField("consumption", StringType())
])

# Re-read the file with the explicit schema instead of inferring types
cars = spark.read.csv('cars.csv', header=True, schema=schema, sep=',', nullValue='NA')

#### Exercise

In [None]:
# Load the flights data from CSV: header row present, types inferred,
# comma-separated, 'NA' treated as null
flights = spark.read.csv(
    'flights.csv', sep=',', header=True, inferSchema=True, nullValue='NA'
)

# Report how many rows were loaded
record_count = flights.count()
print("The data contain %d records." % record_count)

# Preview the first five rows
flights.show(5)

# Column names paired with their inferred types
flights.dtypes

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Column names and types, in file order
sms_fields = [
    StructField("id", IntegerType()),
    StructField("text", StringType()),
    StructField("label", IntegerType()),
]
schema = StructType(sms_fields)

# Read the semicolon-delimited file (no header row) using that schema
sms = spark.read.csv('sms.csv', header=False, sep=';', schema=schema)

# Show the resulting column layout
sms.printSchema()

### Data Preparation

In [None]:
# Remove the 'flight' column
flights = flights.drop('flight')

# Number of records with missing 'delay' values
# (printed explicitly: a bare expression is only displayed when it is the
# last line of a cell, so the count would otherwise be silently discarded)
print(flights.filter('delay IS NULL').count())

# Remove records with missing 'delay' values
flights = flights.filter('delay IS NOT NULL')

# Remove records with missing values in any column and get the number of remaining rows
flights = flights.dropna()
print(flights.count())

In [None]:
# Spark's column-wise round (note: this name shadows Python's built-in
# round() for the rest of the notebook)
from pyspark.sql.functions import round

# Distance in kilometres, rounded to 0 decimal places
distance_km = round(flights.mile * 1.60934, 0)

# Add the 'km' column and discard the original 'mile' column
flights_km = (
    flights
    .withColumn('km', distance_km)
    .drop('mile')
)

# 'label' is 1 when the delay is at least 15, otherwise 0
is_delayed = (flights_km.delay >= 15).cast('integer')
flights_km = flights_km.withColumn('label', is_delayed)

# Preview the first five rows
flights_km.show(5)

#### Categorical columns

In the flights data there are two columns, carrier and org, which hold categorical data. You need to transform those columns into indexed numerical values.

In [None]:
from pyspark.ml.feature import StringIndexer

# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the other categorical feature
flights_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

In [None]:
# Import the necessary class
from pyspark.ml.feature import VectorAssembler

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration'
], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights_assembled.select('features', 'delay').show(5, truncate=False)

```
+-----------------------------------------+-----+
|features                                 |delay|
+-----------------------------------------+-----+
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |30   |
|[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |-8   |
|[9.0,13.0,1.0,1.0,0.0,1989.0,10.33,195.0]|-5   |
|[5.0,2.0,1.0,0.0,1.0,885.0,7.98,102.0]   |2    |
|[7.0,2.0,6.0,1.0,0.0,1180.0,10.83,135.0] |54   |
+-----------------------------------------+-----+
```

## Classification

### Decision Tree

### Logistic Regression

### Turning Text into Tables

## Regression

## Ensembles & Pipelines

### Pipeline

### Cross-Validation

### Grid Search

### Ensemble