In [1]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown('# <span style="color:red">'+string+'</span>'))


if ('sc' in locals() or 'sc' in globals()):
    printmd('<<<<<!!!!! It seems that you are running in a IBM Watson Studio Apache Spark Notebook. Please run it in an IBM Watson Studio Default Runtime (without Apache Spark) !!!!!>>>>>')


Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20200515174526-0000
KERNEL_ID = a866dfcf-63be-42d2-a327-535aa78999b8


# <span style="color:red"><<<<<!!!!! It seems that you are running in a IBM Watson Studio Apache Spark Notebook. Please run it in an IBM Watson Studio Default Runtime (without Apache Spark) !!!!!>>>>></span>

In [None]:
!pip install pyspark==2.4.5

In [None]:
try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    printmd('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')

In [None]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

In [2]:
# delete files from previous runs
!rm -f hmp.parquet*

# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

--2020-05-15 17:45:39--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/IBM/skillsnetwork/raw/master/hmp.parquet [following]
--2020-05-15 17:45:39--  https://github.com/IBM/skillsnetwork/raw/master/hmp.parquet
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/skillsnetwork/master/hmp.parquet [following]
--2020-05-15 17:45:40--  https://raw.githubusercontent.com/IBM/skillsnetwork/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving 

Since this is supervised learning, let’s split our data into train (80%) and test (20%) set.

In [3]:
splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [6]:
df_train.show()

+---+---+---+--------------------+-------------+
|  x|  y|  z|              source|        class|
+---+---+---+--------------------+-------------+
|  0| 11| 38|Accelerometer-201...|Sitdown_chair|
|  0| 12| 39|Accelerometer-201...|Sitdown_chair|
|  0| 15| 39|Accelerometer-201...|  Brush_teeth|
|  0| 16| 31|Accelerometer-201...|    Getup_bed|
|  0| 17| 36|Accelerometer-201...|  Brush_teeth|
|  0| 23| 36|Accelerometer-201...|  Brush_teeth|
|  0| 24| 35|Accelerometer-201...|Sitdown_chair|
|  0| 25| 30|Accelerometer-201...|  Brush_teeth|
|  0| 25| 40|Accelerometer-201...|  Brush_teeth|
|  0| 26| 15|Accelerometer-201...| Climb_stairs|
|  0| 26| 42|Accelerometer-201...|  Brush_teeth|
|  0| 27| 31|Accelerometer-201...|Sitdown_chair|
|  0| 27| 37|Accelerometer-201...|  Brush_teeth|
|  0| 27| 39|Accelerometer-201...|  Brush_teeth|
|  0| 27| 41|Accelerometer-201...|  Brush_teeth|
|  0| 28| 28|Accelerometer-201...|  Brush_teeth|
|  0| 29| 17|Accelerometer-201...|    Getup_bed|
|  0| 29| 25|Acceler

Again, we can re-use our feature engineering pipeline

In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer


indexer = StringIndexer(inputCol="class", outputCol="label")

vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")

normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

Now we use LogisticRegression, a simple and basic linear classifier to obtain a classification performance baseline.

In [5]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer,lr])
model = pipeline.fit(df_train)#fitting train
prediction = model.transform(df_test)#ttransformig test set as for prediction

If we look at the schema of the prediction dataframe we see that there is an additional column called prediction which contains the best guess for the class our model predicts.

In [7]:
prediction.printSchema()

root
 |-- x: integer (nullable = true)
 |-- y: integer (nullable = true)
 |-- z: integer (nullable = true)
 |-- source: string (nullable = true)
 |-- class: string (nullable = true)
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- features_norm: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [8]:
prediction.show()

+---+---+---+--------------------+--------------+-----+---------------+--------------------+--------------------+--------------------+----------+
|  x|  y|  z|              source|         class|label|       features|       features_norm|       rawPrediction|         probability|prediction|
+---+---+---+--------------------+--------------+-----+---------------+--------------------+--------------------+--------------------+----------+
|  0| 10| 28|Accelerometer-201...|     Getup_bed|  1.0|[0.0,10.0,28.0]|[0.0,0.2631578947...|[1.25882473808995...|[0.20737737860732...|       0.0|
|  0| 27| 33|Accelerometer-201...|     Getup_bed|  1.0|[0.0,27.0,33.0]|     [0.0,0.45,0.55]|[1.25882473808995...|[0.20737737860732...|       0.0|
|  0| 28| 48|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,28.0,48.0]|[0.0,0.3684210526...|[1.25882473808995...|[0.20737737860732...|       0.0|
|  0| 29| 32|Accelerometer-201...|Descend_stairs| 10.0|[0.0,29.0,32.0]|[0.0,0.4754098360...|[1.25882473808995...|[0.20737737

Let’s evaluate performance by using a build-in functionality of Apache SparkML.

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
MulticlassClassificationEvaluator().setMetricName("accuracy").evaluate(prediction) 

0.20349567245167946

So we get 20% right. This is not bad for a baseline. Note that random guessing would give us only 7%. Of course we need to improve. You might have notices that we’re dealing with a time series here. And we’re not making use of that fact right now as we look at each training example only individually. But this is ok for now. More advanced courses like “Advanced Machine Learning and Signal Processing” (https://www.coursera.org/learn/advanced-machine-learning-signal-processing/) will teach you how to improve accuracy to the nearly 100% by using algorithms like Fourier transformation or wavelet transformation. But let’s skip this for now. In the following cell, please use the RandomForest classifier (you might need to play with the “numTrees” parameter) in the code cell below. You should get an accuracy of around 44%. More on RandomForest can be found here:

https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier


In [32]:
indexer = StringIndexer(inputCol="class", outputCol="label").fit(df)

vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")

normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)



In [34]:
from pyspark.ml.classification import RandomForestClassifier
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features_norm", numTrees=10)

from pyspark.ml.feature import IndexToString
#at the end we convert predicted labels to the corresponding string ones(good to observe for humans)
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=indexer.labels)#for 'indexer.labels' does'nt work untill you fit 'indexer' to whole data

In [35]:
pipeline = Pipeline(stages=[indexer, vectorAssembler,normalizer, rf, labelConverter])

In [36]:
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train)

In [37]:
# Make predictions.
predictions = model.transform(df_test)

In [38]:
predictions.select("predictedLabel", "label", "features").show(5)

+--------------+-----+---------------+
|predictedLabel|label|       features|
+--------------+-----+---------------+
|     Getup_bed|  1.0|[0.0,10.0,28.0]|
|          Walk|  1.0|[0.0,27.0,33.0]|
|     Getup_bed|  6.0|[0.0,28.0,48.0]|
|          Walk| 10.0|[0.0,29.0,32.0]|
|          Walk|  6.0|[0.0,29.0,37.0]|
+--------------+-----+---------------+
only showing top 5 rows



In [40]:
# Select (prediction, true label) and compute test error(total(1)-accuracy)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.576651


In [42]:
#accuracy:---
print("accuracy=",accuracy)
#as guessed above 

accuracy= 0.42334858065384096


In [41]:
rfModel = model.stages[3]
print(rfModel) 

RandomForestClassificationModel (uid=RandomForestClassifier_4e47bbc8901665e64fec) with 10 trees
