## Packages

In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
from pyspark import SparkConf, SparkContext

In [2]:
conf = SparkConf().setMaster("local").setAppName("daily_water_classification")
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

In [3]:
df = sqlContext.read.load("../datasets/daily_weather.csv", 
                          format="com.databricks.spark.csv",
                          header=True, inferSchema="true")

In [4]:
df.select("number").show(5)

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
+------+
only showing top 5 rows



In [5]:
print(df.columns)
print(len(df.columns))

['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am', 'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am', 'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am', 'relative_humidity_3pm']
11


The next list will allow us to define the columns in the weather data that 
we are going to use

In [6]:
featureColumns = ['air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am']
print(len(featureColumns))

8


In [7]:
df = df.drop("number") # number is a column that isn't going to use

In [8]:
df = df.na.drop() # remove all the rows with missing values

In [9]:
df.count(), len(df.columns)

(1064, 10)

A new qualitative variable, let's create to denote if the humidity is not low. If the value is less than 25%, then we want the categorical value to be 0, otherwise the categorical value should be 1.

In [10]:
binarizer = Binarizer(threshold=24.99999, inputCol="relative_humidity_3pm", outputCol="label")
binarizerDF = binarizer.transform(df)
binarizerDF.columns

['air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm',
 'label']

In [11]:
binarizerDF.select("relative_humidity_3pm", "label").show(4)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
+---------------------+-----+
only showing top 4 rows



Using VectorAssembler we can aggregate the features that we will use to make predictions

In [12]:
assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")
assembled = assembler.transform(binarizerDF)
assembled

DataFrame[air_pressure_9am: double, air_temp_9am: double, avg_wind_direction_9am: double, avg_wind_speed_9am: double, max_wind_direction_9am: double, max_wind_speed_9am: double, rain_accumulation_9am: double, rain_duration_9am: double, relative_humidity_9am: double, relative_humidity_3pm: double, label: double, features: vector]

In [13]:
type(assembled)

pyspark.sql.dataframe.DataFrame

In [14]:
(trainingData, testData) = assembled.randomSplit([0.7, 0.3], seed=13234)

In [15]:
trainingData.count(), testData.count()

(742, 322)

In [16]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=5, 
                            minInstancesPerNode=20, impurity="gini")

The way to train our model is using a Pipeline

In [17]:
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

In [18]:
predictions = model.transform(testData)

In [19]:
predictions.select("prediction", "label").show(20)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  0.0|
|       1.0|  1.0|
|       1.0|  0.0|
+----------+-----+
only showing top 20 rows



In [20]:
# predictions.select("prediction", "label").write.save(path="predictions.csv",
#                                                     format="com.databricks.spark.csv",
#                                                     header=True)

let's load the predictions csv

In [21]:
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [22]:
!dir

 El volumen de la unidad D es DATOS
 El n£mero de serie del volumen es: 3449-1924

 Directorio de D:\Usuarios\rhaps\Documents\GitHub\Apache-Spark\python

08/10/2019  09:14 p. m.    <DIR>          .
08/10/2019  09:14 p. m.    <DIR>          ..
01/10/2019  11:00 p. m.    <DIR>          .ipynb_checkpoints
08/10/2019  09:14 p. m.            10.449 classification.ipynb
15/09/2019  09:43 p. m.               887 customer-orders.py
01/10/2019  11:14 p. m.            21.449 Data Exploration - Pyspark.ipynb
15/09/2019  09:43 p. m.             4.400 degrees-of-separation.py
15/09/2019  09:43 p. m.               800 friends-by-age.py
15/09/2019  09:43 p. m.               946 max-temperatures.py
15/09/2019  09:43 p. m.               966 min-temperatures.py
15/09/2019  09:43 p. m.             1.001 most-popular-marveleheroe.py
15/09/2019  09:43 p. m.             3.712 movie-similarities-1m.py
15/09/2019  09:43 p. m.             3.769 movie-similarities.py
15/09/2019  09:43 p. m.             1.484 po

In [23]:
sqlContext = SQLContext(sc)
predictions = sqlContext.read.load("predictions.csv", 
                                  format="com.databricks.spark.csv",
                                  header="true",
                                  inferSchema="true")

In [24]:
# the first two arguments specify the names of the label and prediction columns,
# and the last one specifies that we want the overall precision
evaluator = MulticlassClassificationEvaluator(labelCol="label", 
                                             predictionCol="prediction",
                                             metricName="precision")

In [34]:
predictions.rdd.take(2)

[Row(prediction=1.0, label=1.0), Row(prediction=1.0, label=1.0)]

In [35]:
predictions.rdd.map(tuple).take(2)

[(1.0, 1.0), (1.0, 1.0)]

In [36]:
metrics = MulticlassMetrics(predictions.rdd.map(tuple))

In [45]:
metrics.confusionMatrix().toArray().transpose()

array([[89., 26.],
       [19., 85.]])

In [39]:
print("Accuracy = %g" %(metrics.accuracy))

Accuracy = 0.794521


In [43]:
print("Accuracy = %.2g" %(metrics.accuracy*100))

Accuracy = 79
