In [92]:
import pandas as pd
import numpy as np
import findspark
findspark.init('C:\spark\spark-3.1.2-bin-hadoop3.2')
import pyspark

from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Spark And SQL Queries

In [93]:
#Defining dataframe
bus = spark.read.option("inferschema","true").option("header","true").csv("bus.csv")
bus.show(5)
bus.printSchema()

+-----------+---------------+-----------------+------+------------+----------------+-------------------+----------+
|School_Year|Busbreakdown_ID|         Run_Type|Bus_No|Route_Number|Schools_Serviced|        Occurred_On|      Town|
+-----------+---------------+-----------------+------+------------+----------------+-------------------+----------+
|       2015|        1227538|Special Ed AM Run|  2621|        J711|           75003|2015-11-05 08:10:00|New Jersey|
|       2015|        1227539|Special Ed AM Run|  1260|        M351|            6716|2015-11-05 08:10:00| Manhattan|
|       2015|        1227540|         Pre-K/EI|   418|           3|            C445|2015-11-05 08:09:00|     Bronx|
|       2015|        1227541|Special Ed AM Run|  4522|        M271|            2699|2015-11-05 08:12:00| Manhattan|
|       2015|        1227542|Special Ed AM Run|  3124|        M373|            2116|2015-11-05 08:13:00| Manhattan|
+-----------+---------------+-----------------+------+------------+-----

In [94]:
#SQL Table
bus.createOrReplaceTempView("BUS")

In [95]:
#How many breakdowns occurred in total
bus.select(count(col("Busbreakdown_ID"))).show()

spark.sql('SELECT COUNT(Busbreakdown_ID) FROM BUS').show()

+----------------------+
|count(Busbreakdown_ID)|
+----------------------+
|                   101|
+----------------------+

+----------------------+
|count(Busbreakdown_ID)|
+----------------------+
|                   101|
+----------------------+



In [71]:
#How many breakdowns occurred in total in each town
bus.select(col("Town"),col("Busbreakdown_ID")).groupBy(col("Town")).count().show()

spark.sql('SELECT Town, COUNT(Busbreakdown_ID) AS Count FROM BUS GROUP BY Town').show()

+---------------+-----+
|           Town|count|
+---------------+-----+
|         Queens|   14|
|  Nassau County|    2|
|           null|    7|
|       Brooklyn|   14|
|  Staten Island|    4|
|    Westchester|    4|
|     New Jersey|    1|
|Rockland County|    1|
|      Manhattan|   22|
|          Bronx|   32|
+---------------+-----+

+---------------+-----+
|           Town|Count|
+---------------+-----+
|         Queens|   14|
|  Nassau County|    2|
|           null|    7|
|       Brooklyn|   14|
|  Staten Island|    4|
|    Westchester|    4|
|     New Jersey|    1|
|Rockland County|    1|
|      Manhattan|   22|
|          Bronx|   32|
+---------------+-----+



In [77]:
#How many breakdowns occurred in total in each town per year
bus.select(col("School_Year"),col("Town"),col("Busbreakdown_ID")).groupBy("School_Year","Town").count().orderBy("School_Year").show()

#spark.sql('SELECT School_Year, Town, Count(Busbreakdown_ID) AS Count FROM BUS GROUP BY Town').show()

+-----------+---------------+-----+
|School_Year|           Town|count|
+-----------+---------------+-----+
|       2015|Rockland County|    1|
|       2015|       Brooklyn|    1|
|       2015|           null|    2|
|       2015|          Bronx|    4|
|       2015|  Nassau County|    1|
|       2015|      Manhattan|    7|
|       2015|     New Jersey|    1|
|       2015|         Queens|    1|
|       2015|    Westchester|    3|
|       2016|         Queens|    3|
|       2016|       Brooklyn|    2|
|       2016|          Bronx|    5|
|       2016|      Manhattan|    5|
|       2016|    Westchester|    1|
|       2017|           null|    2|
|       2017|          Bronx|    7|
|       2017|       Brooklyn|    2|
|       2017|         Queens|    4|
|       2017|      Manhattan|    1|
|       2018|          Bronx|    5|
+-----------+---------------+-----+
only showing top 20 rows



In [6]:
#How many breakdowns occurred in total in each route number per year
bus.select(col("School_Year"),col("Route_Number"),col("Busbreakdown_ID")).groupBy("School_Year","Route_Number").count().orderBy("School_Year").show()

+-----------+------------+-----+
|School_Year|Route_Number|count|
+-----------+------------+-----+
|       2015|        M351|    1|
|       2015|          10|    1|
|       2015|        U901|    1|
|       2015|        J711|    1|
|       2015|        W991|    1|
|       2015|        P464|    1|
|       2015|           8|    1|
|       2015|        M922|    1|
|       2015|        N588|    1|
|       2015|        W796|    1|
|       2015|        M373|    1|
|       2015|        L524|    1|
|       2015|        M126|    1|
|       2015|        M678|    1|
|       2015|        P877|    1|
|       2015|        P662|    1|
|       2015|        W633|    1|
|       2015|           3|    1|
|       2015|        L531|    1|
|       2015|        M490|    1|
+-----------+------------+-----+
only showing top 20 rows



In [80]:
#how many breakdowns occur each year in brooklyn
#bus.filter(col("Town")=="Brooklyn").select("School_Year","Busbreakdown_ID").groupBy("School_Year").count().show()
#bus.select("School_Year",sum("Busbreakdown_ID").alias("Total")).filter(col("Town")=="Brooklyn").groupBy("School_Year").show()
bus.filter(col("Town")=="Brooklyn").groupBy("School_Year").agg(sum("Busbreakdown_ID").alias("Total")).orderBy("School_Year").show()

#spark.sql('SELECT School_Year,COUNT(Busbreakdown_ID) FROM BUS WHERE Town = Brooklyn').show()

+-----------+-------+
|School_Year|  Total|
+-----------+-------+
|       2015|1227554|
|       2016|2455163|
|       2017|2455188|
|       2018|6136775|
|       2020|5295244|
+-----------+-------+



# Timestamp

In [102]:
bus.select(hour("Occurred_On").alias("Hour")).show(5)
#minute,second,year,quarter,current_date,date_format,next_day,last_day,window

spark.sql('SELECT HOUR(Occurred_On) AS Hour FROM BUS').show()
#spark.sql('SELECT YEAR(Occurred_On) AS Year FROM BUS').show()

#Convert String to timestamp
#df.withColumn("timestamp",to_timestamp("input_timestamp")).show(truncate=False)

+----+
|Hour|
+----+
|   8|
|   8|
|   8|
|   8|
|   8|
+----+
only showing top 5 rows

+----+
|Hour|
+----+
|   8|
|   8|
|   8|
|   8|
|   8|
|   7|
|   8|
|   8|
|   7|
|   8|
|   8|
|   8|
|   8|
|   8|
|   8|
|   8|
|   8|
|   6|
|   8|
|   8|
+----+
only showing top 20 rows



# sort min max sum agg count

In [82]:
#sort min max sum agg count
#bus.select("Busbreakdown_ID").sort("Busbreakdown_ID").show()
#bus.agg({'Busbreakdown_ID' : 'max'}).show()
#bus.select(max('Busbreakdown_ID')).show()
#bus.select(sum('Busbreakdown_ID')).show()
#bus.select(count('Busbreakdown_ID')).show()
#bus.agg({'Busbreakdown_ID' : 'count'}).show()

#spark.sql('SELECT MAX(Busbreakdown_ID) AS Max FROM BUS').show()
#spark.sql('SELECT MIN(Busbreakdown_ID) AS Min FROM BUS').show()

+-------+
|    Max|
+-------+
|1215511|
+-------+



# mean variance standard deviation

In [91]:
#mean variance standard deviation
#bus.agg({'Busbreakdown_ID' : 'mean'}).show()
#bus.select(mean('Busbreakdown_ID')).show()
#spark.sql('SELECT AVG(Busbreakdown_ID) AS Mean FROM BUS').show()
#spark.sql('SELECT MEAN(Busbreakdown_ID) AS Mean FROM BUS').show()


#bus.agg({'Busbreakdown_ID' : 'stddev'}).show() #stddev_pop
#bus.select(stddev('Busbreakdown_ID')).show()
#spark.sql('SELECT STDDEV(Busbreakdown_ID) AS stddev FROM BUS').show()

#bus.select(variance('Busbreakdown_ID')).show()
#spark.sql('SELECT VARIANCE(Busbreakdown_ID) AS Var FROM BUS').show()
#var_pop

#sum

#sort
bus.select("School_Year","Town","Busbreakdown_ID").sort("School_Year").show()

spark.sql('SELECT School_Year,Town,Busbreakdown_ID FROM BUS SORT BY School_Year ASC').show()

+-----------+---------------+---------------+
|School_Year|           Town|Busbreakdown_ID|
+-----------+---------------+---------------+
|       2015|     New Jersey|        1227538|
|       2015|      Manhattan|        1227539|
|       2015|          Bronx|        1227540|
|       2015|      Manhattan|        1227541|
|       2015|      Manhattan|        1227542|
|       2015|    Westchester|        1227543|
|       2015|    Westchester|        1227544|
|       2015|      Manhattan|        1227545|
|       2015|      Manhattan|        1227546|
|       2015|      Manhattan|        1227547|
|       2015|      Manhattan|        1227551|
|       2015|          Bronx|        1227553|
|       2015|       Brooklyn|        1227554|
|       2015|Rockland County|        1227555|
|       2015|  Nassau County|        1227556|
|       2015|    Westchester|        1227557|
|       2015|           null|        1227558|
|       2015|           null|        1227560|
|       2015|          Bronx|     

In [28]:
#aggreagte multiple groupby orderby columns
bus.groupBy("School_Year").agg(count("Busbreakdown_ID").alias("Breakdown"), count("Run_Type").alias("Run"), count("Town").alias("Town")).orderBy("School_Year").show()

+-----------+---------+---+----+
|School_Year|Breakdown|Run|Town|
+-----------+---------+---+----+
|       2015|       21| 21|  19|
|       2016|       16| 16|  16|
|       2017|       16| 16|  14|
|       2018|       20| 20|  19|
|       2019|       16| 16|  14|
|       2020|       12| 12|  12|
+-----------+---------+---+----+



# function to return dataframe

In [53]:
#function to return dataframe
def calculate_mean(dataframe):
    new = dataframe.select([mean(i) for i in dataframe.columns])
    return new.show()

def calculate_variance(dataframe):
    new = dataframe.select([variance(i) for i in dataframe.columns])
    return new.show()

def calculate_stddev(dataframe):
    new = dataframe.select([stddev(i).alias(i) for i in dataframe.columns])
    return new.show()

In [54]:
df = bus.select("School_Year","Busbreakdown_ID","Run_Type")

In [57]:
calculate_mean(df)
calculate_variance(df)
calculate_stddev(df)

+------------------+--------------------+-------------+
|  avg(School_Year)|avg(Busbreakdown_ID)|avg(Run_Type)|
+------------------+--------------------+-------------+
|2017.2970297029703|  1236901.5346534653|         null|
+------------------+--------------------+-------------+

+---------------------+-------------------------+------------------+
|var_samp(School_Year)|var_samp(Busbreakdown_ID)|var_samp(Run_Type)|
+---------------------+-------------------------+------------------+
|    2.830891089109823|      1.589206950211282E9|              null|
+---------------------+-------------------------+------------------+

+-----------------+-----------------+--------+
|      School_Year|  Busbreakdown_ID|Run_Type|
+-----------------+-----------------+--------+
|1.682525212027987|39864.85858762429|    null|
+-----------------+-----------------+--------+



In [67]:
#Calculate average by row
bus.select(((col("School_Year") + col("Busbreakdown_ID"))/2).alias("Mean")).orderBy("School_Year").show()

+--------+
|    Mean|
+--------+
|614776.5|
|614777.0|
|614777.5|
|614778.0|
|614778.5|
|614779.0|
|614779.5|
|614780.0|
|614780.5|
|614781.0|
|614783.0|
|614784.0|
|614784.5|
|614785.0|
|614785.5|
|614786.0|
|614786.5|
|614787.5|
|614788.0|
|614789.5|
+--------+
only showing top 20 rows



# Practice