In [1]:
# Notebook bootstrap: must be the first cell of every new notebook.
# Remember to change the app name passed to appName() below.
import findspark
# Point findspark at the local Spark install. This call sits before the
# pyspark imports, so keep that order.
# NOTE(review): absolute, machine-specific path -- adjust on other hosts.
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# Get (or create) the Spark session, registered under the app name 'BDAS'.
spark = SparkSession.builder.appName('BDAS').getOrCreate()

In [None]:
# Load the Aotizhongxin station CSV: the first line is treated as the header
# and Spark infers the column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../DataSet/PRSA_Data_Aotizhongxin_20130301-20170228.csv')
)


In [None]:
# Visualise the DataFrame: print the first rows as a text table, then leave
# the total row count as the cell's output value.
df.show()
df.count()

In [None]:
# Select rows: keep the years 2015-2017, expressed as year > 2014.
# NOTE(review): the query has no upper bound; presumably the data ends in
# 2017 (per the file name) -- confirm.
# Register df under the SQL name 'dataset' so spark.sql can query it.
df.createOrReplaceTempView('dataset')
results = spark.sql("SELECT * FROM dataset WHERE year > 2014")
results.show()
results.count()


In [None]:
# Example: take a 100-row subset of the 'dataset' view.
# NOTE(review): the name new_results is assigned again by a later cell (the
# year > 2014 query on 'new_dataset'), which overwrites this value.
new_results = spark.sql("SELECT * FROM dataset  limit 100")
new_results.show()
new_results.count()

In [None]:
# Check the columns of the filtered result: preview the rows, list the column
# names, then print how many columns there are.
results.show()
print(results.columns)
print("Number of columns: ",len(results.columns))

In [None]:
# Attribute selection: remove the "No" and "PM10" columns from the filtered
# rows, then display the remaining columns and their count.
afterDrop = results.drop("No", "PM10")
afterDrop.show()
print(afterDrop.columns)
print("Number of columns: ", len(afterDrop.columns))

In [None]:
# Rename the abbreviated columns. "PM2.5" becomes "PM25", i.e. a name without
# a dot.
# NOTE(review): "DewPointTempeature" and "WindSpend" look like typos for
# "DewPointTemperature" / "WindSpeed"; they are kept unchanged here because
# other cells may refer to these exact names.
_column_renames = [
    ("DEWP", "DewPointTempeature"),
    ("wd", "WindDirection"),
    ("WSPM", "WindSpend"),
    ("PM2.5", "PM25"),
]
rename_df = afterDrop
for _old_name, _new_name in _column_renames:
    rename_df = rename_df.withColumnRenamed(_old_name, _new_name)
rename_df.show()

In [2]:
# Explicit schema for the CSV, so column types are declared up front instead
# of being inferred.
from pyspark.sql.types import *

# (column name, Spark type class) in file order; every column is nullable.
_schema_columns = [
    ('No', IntegerType),
    ('year', IntegerType),
    ('month', IntegerType),
    ('day', IntegerType),
    ('hour', IntegerType),
    ('PM2.5', FloatType),
    ('PM10', FloatType),
    ('SO2', FloatType),
    ('NO2', FloatType),
    ('CO', FloatType),
    ('O3', FloatType),
    ('TEMP', FloatType),
    ('PRES', FloatType),
    ('DEWP', FloatType),
    ('RAIN', FloatType),
    ('wd', StringType),
    ('WSPM', FloatType),
    ('station', StringType),
]
schema = StructType([StructField(_col_name, _col_type(), True)
                     for _col_name, _col_type in _schema_columns])

In [3]:
# Import the dataset with the relevant types, using the explicit `schema`
# instead of type inference. The first line of the file is the header.
# NOTE(review): this reads testv1.csv, a different file from the one loaded
# into `df` -- confirm that is intended.
df_with_schema = spark.read.csv("../DataSet/testv1.csv", header=True, schema=schema)
df_with_schema.show()

+---+----+-----+---+----+-----+----+----+----+-----+----+----+------+-----+----+---+----+-------------+
| No|year|month|day|hour|PM2.5|PM10| SO2| NO2|   CO|  O3|TEMP|  PRES| DEWP|RAIN| wd|WSPM|      station|
+---+----+-----+---+----+-----+----+----+----+-----+----+----+------+-----+----+---+----+-------------+
|  1|2013|    3|  1|   0|  9.0| 9.0| 6.0|17.0|200.0|62.0| 0.3|1021.9|-19.0| 0.0|WNW| 2.0|Wanshouxigong|
|  2|2013|    3|  1|   1| 11.0|11.0| 7.0|14.0|200.0|66.0|-0.1|1022.4|-19.3| 0.0|WNW| 4.4|Wanshouxigong|
|  3|2013|    3|  1|   2|  8.0| 8.0|null|16.0|200.0|59.0|-0.6|1022.6|-19.7| 0.0|WNW| 4.7|Wanshouxigong|
|  4|2013|    3|  1|   3|  8.0| 8.0| 3.0|16.0| null|null|-0.7|1023.5|-20.9| 0.0| NW| 2.6|Wanshouxigong|
|  5|2013|    3|  1|   4|  8.0| 8.0| 3.0|null|300.0|36.0|-0.9|1024.1|-21.7| 0.0|WNW| 2.5|Wanshouxigong|
|  6|2013|    3|  1|   5| 10.0|10.0| 4.0| 8.0|200.0|64.0|-1.6|1024.7|-21.1| 0.0| NE| 2.0|Wanshouxigong|
|  7|2013|    3|  1|   6|  8.0| 8.0| 6.0|13.0|300.0|61.0|-2.4|10

In [None]:
# Print the schema of the typed DataFrame to confirm the declared types.
df_with_schema.printSchema()

In [4]:
# Register the typed DataFrame under the SQL name 'new_dataset', then keep
# only the rows with year > 2014. The view must exist before the query runs.
df_with_schema.createOrReplaceTempView('new_dataset')
new_results = spark.sql("SELECT * FROM new_dataset WHERE year > 2014")
new_results.show()
# Row count of the filtered result, left as the cell's output value.
new_results.count()

+-----+----+-----+---+----+-----+-----+----+-----+------+----+----+------+-----+----+---+----+-------------+
|   No|year|month|day|hour|PM2.5| PM10| SO2|  NO2|    CO|  O3|TEMP|  PRES| DEWP|RAIN| wd|WSPM|      station|
+-----+----+-----+---+----+-----+-----+----+-----+------+----+----+------+-----+----+---+----+-------------+
|16105|2015|    1|  1|   0|  3.0| 21.0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|-23.0| 0.0| NW| 0.9|Wanshouxigong|
|16106|2015|    1|  1|   1|  3.0| 13.0|11.0| 17.0| 400.0|53.0| 0.0|1025.0|-22.9| 0.0| NW| 2.7|Wanshouxigong|
|16107|2015|    1|  1|   2|  3.0| 13.0|10.0| 15.0| 400.0|55.0| 0.0|1027.0|-22.9| 0.0| NW| 2.4|Wanshouxigong|
|16108|2015|    1|  1|   3|  4.0| 18.0|13.0| 13.0| 400.0|57.0| 0.0|1028.0|-24.4| 0.0| NW| 2.4|Wanshouxigong|
|16109|2015|    1|  1|   4|  4.0| 15.0|15.0| 12.0| 400.0|58.0| 0.0|1030.0|-24.4| 0.0| NW| 2.4|Wanshouxigong|
|16110|2015|    1|  1|   5|  3.0| 15.0|13.0| 15.0| 500.0|55.0|-1.0|1024.0|-24.4| 0.0|WNW| 3.2|Wanshouxigong|
|16111|2015|    1| 

18960

In [6]:
# Attribute selection on the typed data: remove the "No" and "PM10" columns,
# then display the remaining columns and their count.
new_afterDrop = new_results.drop("No", "PM10")
new_afterDrop.show()
print(new_afterDrop.columns)
print("Number of columns: ", len(new_afterDrop.columns))

+----+-----+---+----+-----+----+-----+------+----+----+------+-----+----+---+----+-------------+
|year|month|day|hour|PM2.5| SO2|  NO2|    CO|  O3|TEMP|  PRES| DEWP|RAIN| wd|WSPM|      station|
+----+-----+---+----+-----+----+-----+------+----+----+------+-----+----+---+----+-------------+
|2015|    1|  1|   0|  3.0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|-23.0| 0.0| NW| 0.9|Wanshouxigong|
|2015|    1|  1|   1|  3.0|11.0| 17.0| 400.0|53.0| 0.0|1025.0|-22.9| 0.0| NW| 2.7|Wanshouxigong|
|2015|    1|  1|   2|  3.0|10.0| 15.0| 400.0|55.0| 0.0|1027.0|-22.9| 0.0| NW| 2.4|Wanshouxigong|
|2015|    1|  1|   3|  4.0|13.0| 13.0| 400.0|57.0| 0.0|1028.0|-24.4| 0.0| NW| 2.4|Wanshouxigong|
|2015|    1|  1|   4|  4.0|15.0| 12.0| 400.0|58.0| 0.0|1030.0|-24.4| 0.0| NW| 2.4|Wanshouxigong|
|2015|    1|  1|   5|  3.0|13.0| 15.0| 500.0|55.0|-1.0|1024.0|-24.4| 0.0|WNW| 3.2|Wanshouxigong|
|2015|    1|  1|   6|  3.0|10.0| 30.0| 500.0|39.0|-4.0|1029.0|-24.3| 0.0|  W| 0.9|Wanshouxigong|
|2015|    1|  1|   7|  3.0|10.

In [7]:
# Rename the abbreviated columns of the typed data. "PM2.5" becomes "PM25",
# i.e. a name without a dot.
# NOTE(review): "DewPointTempeature" and "WindSpend" look like typos for
# "DewPointTemperature" / "WindSpeed"; they are kept unchanged here because
# other cells may refer to these exact names.
_new_column_renames = [
    ("DEWP", "DewPointTempeature"),
    ("wd", "WindDirection"),
    ("WSPM", "WindSpend"),
    ("PM2.5", "PM25"),
]
new_rename_df = new_afterDrop
for _source_name, _target_name in _new_column_renames:
    new_rename_df = new_rename_df.withColumnRenamed(_source_name, _target_name)
new_rename_df.show()

+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+
|year|month|day|hour|PM25| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|      station|
+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+
|2015|    1|  1|   0| 3.0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0| 0.0|           NW|      0.9|Wanshouxigong|
|2015|    1|  1|   1| 3.0|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9| 0.0|           NW|      2.7|Wanshouxigong|
|2015|    1|  1|   2| 3.0|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9| 0.0|           NW|      2.4|Wanshouxigong|
|2015|    1|  1|   3| 4.0|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|
|2015|    1|  1|   4| 4.0|15.0| 12.0| 400.0|58.0| 0.0|1030.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|
|2015|  

In [8]:
# Preview the renamed DataFrame.
# NOTE(review): repeats the show() that ends the rename cell; likely removable.
new_rename_df.show()

+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+
|year|month|day|hour|PM25| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|      station|
+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+
|2015|    1|  1|   0| 3.0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0| 0.0|           NW|      0.9|Wanshouxigong|
|2015|    1|  1|   1| 3.0|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9| 0.0|           NW|      2.7|Wanshouxigong|
|2015|    1|  1|   2| 3.0|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9| 0.0|           NW|      2.4|Wanshouxigong|
|2015|    1|  1|   3| 4.0|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|
|2015|    1|  1|   4| 4.0|15.0| 12.0| 400.0|58.0| 0.0|1030.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|
|2015|  

In [15]:
# Row count before rows with missing values are removed.
new_rename_df.count()

18960

In [19]:
# Discard every row that has a null in any column.
after_delete = new_rename_df.dropna()

In [21]:
# Preview the cleaned data and compare row counts before and after the
# null-row removal.
after_delete.show()
print("Before, number of rows:", new_rename_df.count())
print("After, number of rows:" , after_delete.count())

+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+
|year|month|day|hour|PM25| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|      station|
+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+
|2015|    1|  1|   0| 3.0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0| 0.0|           NW|      0.9|Wanshouxigong|
|2015|    1|  1|   1| 3.0|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9| 0.0|           NW|      2.7|Wanshouxigong|
|2015|    1|  1|   2| 3.0|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9| 0.0|           NW|      2.4|Wanshouxigong|
|2015|    1|  1|   3| 4.0|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|
|2015|    1|  1|   4| 4.0|15.0| 12.0| 400.0|58.0| 0.0|1030.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|
|2015|  

In [24]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
def NationalQuanlityRemark(value):
    """Return the air-quality remark for a numeric reading.

    Bands use inclusive upper bounds and are checked in ascending order:
    <= 50 'Good', <= 100 'Satisfactory', <= 200 'Moderate', <= 300 'Poor',
    <= 400 'Very Poor'. Anything that matches no band is 'Severe'.
    """
    remark_bands = (
        (50, 'Good'),
        (100, "Satisfactory"),
        (200, 'Moderate'),
        (300, 'Poor'),
        (400, 'Very Poor'),
    )
    for upper_bound, remark in remark_bands:
        if value <= upper_bound:
            return remark
    return 'Severe'

In [37]:
# Wrap the remark classifier as a string-returning Spark UDF and use it to add
# an "NQR" column computed from the PM25 readings.
udfsomefunc = F.udf(NationalQuanlityRemark, StringType())
NQR = after_delete.withColumn("NQR", udfsomefunc(F.col("PM25")))
NQR.show()

+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+------------+
|year|month|day|hour|PM25| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|      station|         NQR|
+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+------------+
|2015|    1|  1|   0| 3.0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0| 0.0|           NW|      0.9|Wanshouxigong|        Good|
|2015|    1|  1|   1| 3.0|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9| 0.0|           NW|      2.7|Wanshouxigong|        Good|
|2015|    1|  1|   2| 3.0|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9| 0.0|           NW|      2.4|Wanshouxigong|        Good|
|2015|    1|  1|   3| 4.0|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|        Good|
|2015|    1|  1|   4| 4.0|15.0| 12.0| 400

In [51]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
def season(value):
    """Return the season name for a month number.

    Args:
        value: Month number (1-12), or None for a missing value.

    Returns:
        'Spring' for months 4-5, 'Summer' for 6-8, 'Autumn' for 9-11 and
        'Winter' for every other number; None when value is None.
    """
    # A missing month has no season; returning None avoids comparing None
    # with an int.
    if value is None:
        return None
    # NOTE(review): March (3) falls into 'Winter' under these bounds -- confirm
    # that is the intended definition of spring.
    if 4 <= value < 6:
        return 'Spring'
    if 6 <= value < 9:
        return "Summer"
    # The lower bound is inclusive so that September (9) is Autumn rather than
    # falling through to the Winter fallback.
    if 9 <= value <= 11:
        return 'Autumn'
    return 'Winter'

In [52]:
# Wrap season() as a string-returning Spark UDF and add a "Season" column.
# season() compares its argument against month numbers (4-11), so it must be
# fed the "month" column; a year value such as 2015 would always fall through
# to 'Winter'.
Seasonfunc = F.udf(season, StringType())
new_data = NQR.withColumn("Season", Seasonfunc("month"))
new_data.show()

+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+------------+------+
|year|month|day|hour|PM25| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|      station|         NQR|Season|
+----+-----+---+----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+------------+------+
|2015|    1|  1|   0| 3.0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0| 0.0|           NW|      0.9|Wanshouxigong|        Good|Winter|
|2015|    1|  1|   1| 3.0|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9| 0.0|           NW|      2.7|Wanshouxigong|        Good|Winter|
|2015|    1|  1|   2| 3.0|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9| 0.0|           NW|      2.4|Wanshouxigong|        Good|Winter|
|2015|    1|  1|   3| 4.0|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|        Good|