In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
import pandas as pd
from pyspark.ml import Pipeline

In [2]:
# Create the SQLContext used for all DataFrame reads in this notebook.
# NOTE(review): `sc` is never defined in the notebook itself; presumably the
# PySpark kernel/shell injects the SparkContext under that name — confirm
# before running on a plain Python kernel.
sqlContext = SQLContext(sc)

In [3]:
# Load the daily weather CSV via the spark-csv data source. The `header` and
# `inferSchema` options are handed to that source unchanged.
dataDF = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('file:///home/cloudera/coursera/courseraDataSimulation/course4-ML/daily_weather.csv')
)

In [4]:
# List the column names of the loaded DataFrame (the CSV was read with header='true').
dataDF.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [5]:
# Summary statistics (count, mean, ...) for every column. A column whose count
# is lower than the `number` column's count has missing values.
dataDF.describe().show()

+-------+------------------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|            number|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1095|              1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|             1092|                 1095|                 1095|
|   mean|             547.0| 918.8825513138097| 64.933001412

In [6]:
# Drop the `number` column (it is only a row number), then list the remaining
# columns to confirm it is gone.
dataDF = dataDF.drop('number') # row number
dataDF.columns

['air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

# Hands On 1: Data Exploration

In [7]:
# Display the first two rows as a list of Row objects
dataDF.head(2)

[Row(air_pressure_9am=918.0600000000087, air_temp_9am=74.82200000000041, avg_wind_direction_9am=271.1, avg_wind_speed_9am=2.080354199999768, max_wind_direction_9am=295.39999999999986, max_wind_speed_9am=2.863283199999908, rain_accumulation_9am=0.0, rain_duration_9am=0.0, relative_humidity_9am=42.42000000000046, relative_humidity_3pm=36.160000000000494),
 Row(air_pressure_9am=917.3476881177097, air_temp_9am=71.40384263106537, avg_wind_direction_9am=101.93517935618371, avg_wind_speed_9am=2.4430092157340217, max_wind_direction_9am=140.47154847112498, max_wind_speed_9am=3.5333236016106238, rain_accumulation_9am=0.0, rain_duration_9am=0.0, relative_humidity_9am=24.328697291802207, relative_humidity_3pm=19.4265967985621)]

### Summary Statistics

In [8]:
# Number of columns in the DataFrame
len(dataDF.columns)

10

In [9]:
# Number of rows in the DataFrame
dataDF.count()

1095

In [10]:
# Show summary statistics for a single column
dataDF.describe("air_pressure_9am").show()

# Note the count: it is 1092 while the DataFrame has 1095 rows, so some rows
# have a missing air_pressure_9am value.

+-------+------------------+
|summary|  air_pressure_9am|
+-------+------------------+
|  count|              1092|
|   mean| 918.8825513138097|
| stddev|3.1841611803868353|
|    min| 907.9900000000024|
|    max| 929.3200000000012|
+-------+------------------+



In [11]:
# Remove every row whose air_pressure_9am is missing, then summarize all
# columns of the rows that remain.
M = dataDF.dropna(subset=['air_pressure_9am'])
M.describe().show()

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am| rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|  count|              1092|              1087|                  1088|              1089|                  1089|              1088|                 1086|              1089|                 1092|                 1092|
|   mean| 918.8825513138097| 64.96896753146554|    142.09831025625775| 5.504611894676135|    148.78720908602853| 7.014192386313907| 

In [12]:
# Re-check the air_pressure_9am statistics on M, the frame from which rows
# with a missing air_pressure_9am were already removed (nothing is dropped here).
M.select("air_pressure_9am").describe().show()

+-------+------------------+
|summary|  air_pressure_9am|
+-------+------------------+
|  count|              1092|
|   mean| 918.8825513138097|
| stddev|3.1841611803868353|
|    min| 907.9900000000024|
|    max| 929.3200000000012|
+-------+------------------+



# Pairwise Correlation

In [15]:
# Correlation coefficient between rain accumulation and rain duration.
# NOTE(review): both columns contain missing values (their describe() counts
# are 1089 and 1092 out of 1095 rows); confirm how this Spark version treats
# nulls when computing the correlation.
dataDF.corr("rain_accumulation_9am", "rain_duration_9am")

0.7337968783310981