In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
!curl https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv -o tips.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  9729  100  9729    0     0  52692      0 --:--:-- --:--:-- --:--:-- 57229


In [3]:
!head tips.csv

"total_bill","tip","sex","smoker","day","time","size"
16.99,1.01,"Female","No","Sun","Dinner",2
10.34,1.66,"Male","No","Sun","Dinner",3
21.01,3.5,"Male","No","Sun","Dinner",3
23.68,3.31,"Male","No","Sun","Dinner",2
24.59,3.61,"Female","No","Sun","Dinner",4
25.29,4.71,"Male","No","Sun","Dinner",4
8.77,2,"Male","No","Sun","Dinner",2
26.88,3.12,"Male","No","Sun","Dinner",4
15.04,1.96,"Male","No","Sun","Dinner",2


In [4]:
# Parse the CSV: column names come from the first line, column types are inferred by Spark.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("tips.csv")
)

In [5]:
# Preview the first five parsed rows.
df.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [7]:
# Print the column types that inferSchema=True produced when the CSV was read.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [8]:
from pyspark.ml.feature import RFormula

In [9]:
# `sex*smoker` is shorthand for both main effects plus their interaction,
# i.e. sex + smoker + sex:smoker, which corresponds to the model
#   y = b0 + b1*sex + b2*smoker + b3*sex*smoker
rf = RFormula(
    formula="tip ~ sex*smoker",
)

In [12]:
# Fit the formula on df to obtain the fitted model that the next cell applies.
rf_model = rf.fit(df)

In [14]:
# Apply the fitted formula to df; the recorded preview of df2 shows the original
# columns plus the added `features` and `label` columns.
df2 = rf_model.transform(df)

In [15]:
# Preview the transformed frame, including the added `features` and `label` columns.
# NOTE(review): the recorded output has the total_bill value as the first entry of
# `features`, yet the formula "tip ~ sex*smoker" never mentions total_bill. The saved
# output looks like it came from an earlier formula -- re-run the fit/transform cells
# to refresh it (confirm).
df2.show(n=5)

+----------+----+------+------+---+------+----+--------------------+-----+
|total_bill| tip|   sex|smoker|day|  time|size|            features|label|
+----------+----+------+------+---+------+----+--------------------+-----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|[16.99,0.0,1.0,0....| 1.01|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|[10.34,1.0,1.0,0....| 1.66|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|[21.01,1.0,1.0,0....|  3.5|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|[23.68,1.0,1.0,0....| 3.31|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|[24.59,0.0,1.0,0....| 3.61|
+----------+----+------+------+---+------+----+--------------------+-----+
only showing top 5 rows



# Decision Trees

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:
!curl https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt -o sample_libsvm_data.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  102k  100  102k    0     0   634k      0 --:--:-- --:--:-- --:--:--  700k


In [19]:
# Read the LIBSVM file into a DataFrame (columns `label` and `features`) and preview five rows.
data = spark.read.load("sample_libsvm_data.txt", format="libsvm")
data.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
+-----+--------------------+
only showing top 5 rows



In [20]:
# Index the label column. The indexer is fitted on the full dataset (before the split)
# so that every label value present in the data receives an index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# maxCategories=4: features with more than 4 distinct values are treated as continuous.
featureIndexer = (
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
)
# Roughly 70/30 train/test split. The fixed seed keeps the split -- and therefore the
# fitted tree and the predictions shown below -- repeatable across re-runs.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain the two fitted indexers with the tree; fitting trains the tree on the training split.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
model = pipeline.fit(trainingData)


In [23]:
# Run the fitted pipeline on the held-out split, i.e. rows the pipeline was not fitted on.
out_df = model.transform(testData)

In [25]:
# Inspect a few scored rows. The tree was trained with labelCol="indexedLabel", so
# `prediction` should be compared against `indexedLabel`, not the raw `label`
# (in the recorded output, raw label 0.0 carries indexedLabel 1.0).
out_df.show(n=5)

+-----+--------------------+------------+--------------------+-------------+-----------+----------+
|label|            features|indexedLabel|     indexedFeatures|rawPrediction|probability|prediction|
+-----+--------------------+------------+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|         1.0|(692,[98,99,100,1...|   [0.0,30.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[123,124,125...|         1.0|(692,[123,124,125...|   [0.0,30.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[123,124,125...|         1.0|(692,[123,124,125...|   [0.0,30.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[123,124,125...|         1.0|(692,[123,124,125...|   [0.0,30.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[126,127,128...|         1.0|(692,[126,127,128...|   [0.0,30.0]|  [0.0,1.0]|       1.0|
+-----+--------------------+------------+--------------------+-------------+-----------+----------+
only showing top 5 rows

