In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=d0ca93e540083d947636ecb5f1c27778366b7710decefa810faa7407abdf622c
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from sklearn.metrics import accuracy_score

# Reading the Dataset

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('mobileprice_lr')
    .getOrCreate()
)

# Load the CSV: the first row supplies column names and column types are
# inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('dataset.csv')
)

# Preview the loaded rows.
df.show()

+-------------+----+-----------+--------+---+------+----------+-----+---------+-------+---+---------+--------+----+----+----+---------+-------+------------+----+-----------+
|battery_power|blue|clock_speed|dual_sim| fc|four_g|int_memory|m_dep|mobile_wt|n_cores| pc|px_height|px_width| ram|sc_h|sc_w|talk_time|three_g|touch_screen|wifi|price_range|
+-------------+----+-----------+--------+---+------+----------+-----+---------+-------+---+---------+--------+----+----+----+---------+-------+------------+----+-----------+
|          842|   0|        2.2|       0|  1|     0|         7|  0.6|      188|      2|  2|       20|     756|2549|   9|   7|       19|      0|           0|   1|          1|
|         1021|   1|        0.5|       1|  0|     1|        53|  0.7|      136|      3|  6|      905|    1988|2631|  17|   3|        7|      1|           1|   0|          2|
|          563|   1|        0.5|       1|  2|     1|        41|  0.9|      145|      5|  6|     1263|    1716|2603|  11|   2|     

# Preparing Data for Machine Learning



In [None]:
# Columns that are assembled into the single feature vector fed to the models.
# NOTE(review): the dataset also has a 'blue' column that is not listed here —
# confirm whether leaving it out of the features is intentional.
numericCols = ['battery_power', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt',
               'n_cores','pc', 'px_height', 'px_width','ram','sc_h','sc_w','talk_time','three_g','touch_screen','wifi']

# One-stage pipeline: pack the columns above into a 'feature_vector' column.
featurizationPipeline = Pipeline(stages = [VectorAssembler(inputCols=numericCols, outputCol="feature_vector")])

featurizationPipelineModel = featurizationPipeline.fit(df)

# Write the result to a new name instead of overwriting `df`: the raw frame
# stays available, and this cell can be re-run (VectorAssembler rejects an
# input that already contains its output column).
df_features = featurizationPipelineModel.transform(df)

# 80/20 train/test split; the fixed seed keeps the split reproducible.
train, test = df_features.randomSplit([0.8, 0.2], seed = 2018)

df_features.show()


+-------------+----+-----------+--------+---+------+----------+-----+---------+-------+---+---------+--------+----+----+----+---------+-------+------------+----+-----------+--------------------+
|battery_power|blue|clock_speed|dual_sim| fc|four_g|int_memory|m_dep|mobile_wt|n_cores| pc|px_height|px_width| ram|sc_h|sc_w|talk_time|three_g|touch_screen|wifi|price_range|      feature_vector|
+-------------+----+-----------+--------+---+------+----------+-----+---------+-------+---+---------+--------+----+----+----+---------+-------+------------+----+-----------+--------------------+
|          842|   0|        2.2|       0|  1|     0|         7|  0.6|      188|      2|  2|       20|     756|2549|   9|   7|       19|      0|           0|   1|          1|[842.0,2.2,0.0,1....|
|         1021|   1|        0.5|       1|  0|     1|        53|  0.7|      136|      3|  6|      905|    1988|2631|  17|   3|        7|      1|           1|   0|          2|[1021.0,0.5,1.0,0...|
|          563|   1|     

# ML Models


In [None]:
# Logistic Regression: configure the estimator on the assembled features,
# with the optimizer limited to 10 iterations.
lr = LogisticRegression(
    labelCol='price_range',
    featuresCol='feature_vector',
    maxIter=10,
)

# Fit on the training split.
lrModel = lr.fit(train)

# Score the held-out test split; the model adds a 'prediction' column.
predictions = lrModel.transform(test)

In [None]:
# Model Evaluation
# Collect labels and predictions together in ONE Spark job so every true label
# is paired with the prediction from the same row. Two separate toPandas()
# calls would execute the plan twice and rely on both jobs returning rows in
# the same order.
lr_results = predictions.select('price_range', 'prediction').toPandas()

# Fraction of test rows whose predicted class equals the true class.
accuracy = accuracy_score(lr_results['price_range'], lr_results['prediction'])
print("Logistic Regression Accuracy =",accuracy*100,"%")

Logistic Regression Accuracy = 96.40102827763496 %


# Lab Exercise: Implement DecisionTreeClassifier within the existing pipeline


In [None]:
# Decision Tree Classifier Model
# Lab exercise solution: import the classifier used in the cells below.
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
# Decision Tree: configure the classifier on the assembled features, with the
# tree depth limited to 10.
dt = DecisionTreeClassifier(
    labelCol='price_range',
    featuresCol='feature_vector',
    maxDepth=10,
)

# Fit on the training split.
dtModel = dt.fit(train)

# Score the held-out test split; the model adds a 'prediction' column.
predictionsDT = dtModel.transform(test)

In [None]:
# Model Evaluation (Decision Tree)
# Collect labels and predictions together in ONE Spark job so every true label
# is paired with the prediction from the same row. Two separate toPandas()
# calls would execute the plan twice and rely on both jobs returning rows in
# the same order.
dt_results = predictionsDT.select('price_range', 'prediction').toPandas()

# Fraction of test rows whose predicted class equals the true class.
accuracyDT = accuracy_score(dt_results['price_range'], dt_results['prediction'])
# Label the metric with the model it belongs to.
print("Decision Tree Accuracy =",accuracyDT*100,"%")

Logistic Regression Accuracy = 85.08997429305913 %


In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
# Random Forest: configure the classifier on the assembled features, with the
# depth of each tree limited to 16.
rf = RandomForestClassifier(
    labelCol='price_range',
    featuresCol='feature_vector',
    maxDepth=16,
)

# Fit on the training split.
rfModel = rf.fit(train)

# Score the held-out test split; the model adds a 'prediction' column.
predictionsRF = rfModel.transform(test)

In [None]:
# Model Evaluation (Random Forest)
# Collect labels and predictions together in ONE Spark job so every true label
# is paired with the prediction from the same row. Two separate toPandas()
# calls would execute the plan twice and rely on both jobs returning rows in
# the same order.
rf_results = predictionsRF.select('price_range', 'prediction').toPandas()

# Fraction of test rows whose predicted class equals the true class.
accuracyRF = accuracy_score(rf_results['price_range'], rf_results['prediction'])
# Label the metric with the model it belongs to (it previously said
# "Logistic Regression").
print("Random Forest Accuracy =",accuracyRF*100,"%")

Logistic Regression Accuracy = 85.60411311053984 %
