# Analyzing data using spark

## Prepare environment variables

In [20]:
%env PROJECT=elite-caster-125113

env: PROJECT=elite-caster-125113


In [21]:
%env GS_TZCORR_FLIGHT=flights/output/events-00000-of-00004

env: GS_TZCORR_FLIGHT=flights/output/events-00000-of-00004


## Check spark version

In [37]:
%bash
spark-submit --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.3.4
      /_/
                        
Using Scala version 2.11.8, OpenJDK 64-Bit Server VM, 1.8.0_252
Branch HEAD
Compiled by user  on 2020-04-01T12:17:00Z
Revision d52bf2785a4363f4954c32bc612ba484dd385ccb
Url https://bigdataoss-internal.googlesource.com/third_party/apache/spark
Type --help for more information.


## Prepare required libraries

In [38]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

## Configure spark
Skip this phase if the environment was already initialized with the Google initialization script below. I wrote this section for practice.

gs://dataproc-initialization-actions/datalab/datalab.sh

In [39]:
"""
SparkConf  
https://spark.apache.org/docs/SPARKVERSION/api/python/pyspark.html#pyspark.SparkConf

Property list
https://spark.apache.org/docs/SPARKVERSION/configuration.html
"""

# Spark properties as (key, value) pairs; add further settings to this list.
conf = [
    ("spark.app.name", "Bayes classification using Spark"),
    ("spark.master", "local"),
]

# Start from a default SparkConf and apply every pair from the list above.
s_conf = SparkConf()
s_conf.setAll(conf)

## Initialize Spark session
We will use SparkSession as the entry point to the Dataset and DataFrame APIs.

In [7]:
"""
SparkSession
https://spark.apache.org/docs/SPARKVERSION/api/python/pyspark.sql.html#pyspark.sql.SparkSession
"""
# Build the session from s_conf; getOrCreate() hands back an already-running
# session when one exists instead of starting a second one.
s_session = (
    SparkSession.builder
    .config(conf=s_conf)
    .getOrCreate()
)

## Define schema for parsing csv lines extracted from spark

In [40]:
"""
StructField
https://spark.apache.org/docs/SPARKVERSION/api/python/pyspark.sql.html#pyspark.sql.types.StructField
"""

def get_structfield(colname):
  """Return the nullable StructField describing one CSV column.

  ARR_DELAY, DEP_DELAY and DISTANCE are declared as FloatType so Spark
  treats them as numbers; every other column name gets StringType.
  """
  numeric_columns = ('ARR_DELAY', 'DEP_DELAY', 'DISTANCE')
  field_type = FloatType() if colname in numeric_columns else StringType()
  return StructField(colname, field_type, True)

## Import sample flight data

In [29]:

# Build the Cloud Storage URI of the flight events file from the PROJECT and
# GS_TZCORR_FLIGHT variables set with %env at the top of the notebook.
# os.environ[...] raises KeyError here if either variable is missing, rather
# than silently producing the path 'gs://None/None'.
project = os.environ['PROJECT']
gs_path = os.environ['GS_TZCORR_FLIGHT']

# Named `inputs` (not `input`) so the builtin input() is not shadowed and the
# name matches the one passed to the reader below.
inputs = 'gs://{}/{}'.format(project, gs_path)

# Read through s_session, the SparkSession created earlier in this notebook.
# No schema is passed here, so the reader falls back to its defaults.
flights = s_session.read.csv(inputs)