In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Create the local Spark session, reusing one if it already exists. Constructing
# SparkContext('local') directly raises "Cannot run multiple SparkContexts at
# once" when this cell is executed a second time in the same kernel.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext  # kept for any code that still refers to `sc`

# header=True takes the column names from the first line of the file. No schema
# inference is requested, so every column is loaded as a string.
# NOTE(review): text values come back wrapped in literal single quotes (e.g.
# "'Kelley'"); the reader may need quote="'" - confirm against employee.txt.
emp_df = spark.read.csv('employee.txt', header=True)
emp_df

DataFrame[id: string, last_name: string, email: string, gender: string, department: string, start_date: string, salary: string, job_title: string, region_id: string]

In [2]:
# Display the schema as a StructType; every field is a nullable string here.
emp_df.schema

StructType(List(StructField(id,StringType,true),StructField(last_name,StringType,true),StructField(email,StringType,true),StructField(gender,StringType,true),StructField(department,StringType,true),StructField(start_date,StringType,true),StructField(salary,StringType,true),StructField(job_title,StringType,true),StructField(region_id,StringType,true)))

In [3]:
# Print the same schema as an indented tree, one line per column.
emp_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- department: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- region_id: string (nullable = true)



In [4]:
# List the column names in file order.
emp_df.columns

['id',
 'last_name',
 'email',
 'gender',
 'department',
 'start_date',
 'salary',
 'job_title',
 'region_id']

In [5]:
# Fetch the first five rows as Row objects. All values are strings, and text
# fields still carry their literal single quotes from the file.
emp_df.take(5)

[Row(id='1', last_name="'Kelley'", email="'rkelley0@soundcloud.com'", gender="'Female'", department="'Computers'", start_date="'10/2/2009'", salary='67470', job_title="'Structural Engineer'", region_id='2'),
 Row(id='2', last_name="'Armstrong'", email="'sarmstrong1@infoseek.co.jp'", gender="'Male'", department="'Sports'", start_date="'3/31/2008'", salary='71869', job_title="'Financial Advisor'", region_id='2'),
 Row(id='3', last_name="'Carr'", email="'fcarr2@woothemes.com'", gender="'Male'", department="'Automotive'", start_date="'7/12/2009'", salary='101768', job_title="'Recruiting Manager'", region_id='3'),
 Row(id='4', last_name="'Murray'", email="'jmurray3@gov.uk'", gender="'Female'", department="'Jewelery'", start_date="'12/25/2014'", salary='96897', job_title="'Desktop Support Technician'", region_id='3'),
 Row(id='5', last_name="'Ellis'", email="'jellis4@sciencedirect.com'", gender="'Female'", department="'Grocery'", start_date="'9/19/2002'", salary='63702', job_title="'Software

In [6]:
# Total number of employee rows.
emp_df.count()

1000

In [7]:
# Draw a ~10% sample without replacement. The fraction is a per-row probability,
# so the resulting count is approximate rather than exactly 100. A fixed seed
# makes the sample (and its count) repeatable across runs.
sample_df = emp_df.sample(withReplacement=False, fraction=0.1, seed=1)
sample_df.count()

85

In [8]:
# Keep employees earning at least 100000. `salary` is a string column, so the
# numeric comparison relies on Spark's implicit cast of the column.
emp_mgrs_df = emp_df.where(emp_df['salary'] >= 100000)
emp_mgrs_df.count()

478

In [9]:
# Show the salary column of the filtered rows; show() prints the first 20 rows.
emp_mgrs_df.select('salary').show()

+------+
|salary|
+------+
|101768|
|118497|
|108657|
|108093|
|121966|
|141139|
|106659|
|148952|
|109890|
|115274|
|144724|
|126103|
|144965|
|113507|
|120579|
|107222|
|125668|
|113857|
|108378|
|133424|
+------+
only showing top 20 rows



In [10]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

# Toy dataset: three rows, each an id paired with a dense 3-element vector.
features_df = spark.createDataFrame(
    [
        (1, Vectors.dense(10.0, 10000.0, 1.0)),
        (2, Vectors.dense(20.0, 30000.0, 2.0)),
        (3, Vectors.dense(30.0, 40000.0, 3.0)),
    ],
    schema=['id', 'features'],
)

features_df.head(1)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]))]

In [11]:
# Fit a min-max scaler on the vectors and add the rescaled result as
# `sfeatures`; with no min/max given, each dimension is mapped onto [0, 1].
feature_scaler = MinMaxScaler().setInputCol('features').setOutputCol('sfeatures')
smodel = feature_scaler.fit(features_df)
sfeatures_df = smodel.transform(features_df)
sfeatures_df.head(1)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]), sfeatures=DenseVector([0.0, 0.0, 0.0]))]

In [12]:
# Compare the original and min-max-scaled vectors side by side.
sfeatures_df.select('features', 'sfeatures').show()

+------------------+--------------------+
|          features|           sfeatures|
+------------------+--------------------+
|[10.0,10000.0,1.0]|       [0.0,0.0,0.0]|
|[20.0,30000.0,2.0]|[0.5,0.6666666666...|
|[30.0,40000.0,3.0]|       [1.0,1.0,1.0]|
+------------------+--------------------+



In [13]:
from pyspark.ml.feature import StandardScaler

# Rebuild the same three-row toy dataset for the standardization example.
features_df = spark.createDataFrame(
    [
        (1, Vectors.dense(10.0, 10000.00, 1.0)),
        (2, Vectors.dense(20.0, 30000.00, 2.0)),
        (3, Vectors.dense(30.0, 40000.00, 3.0)),
    ],
    schema=['id', 'features'],
)
features_df.head(1)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]))]

In [14]:
# Standardize each dimension: subtract the mean (withMean) and divide by the
# standard deviation (withStd), writing the result to `sfeatures`.
feature_stand_scaler = (
    StandardScaler()
    .setInputCol('features')
    .setOutputCol('sfeatures')
    .setWithStd(True)
    .setWithMean(True)
)
stand_smodel = feature_stand_scaler.fit(features_df)
stand_sfeatures_df = stand_smodel.transform(features_df)
stand_sfeatures_df.head(1)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]), sfeatures=DenseVector([-1.0, -1.0911, -1.0]))]

In [15]:
# Display ids with the original and standardized vectors.
stand_sfeatures_df.show()

+---+------------------+--------------------+
| id|          features|           sfeatures|
+---+------------------+--------------------+
|  1|[10.0,10000.0,1.0]|[-1.0,-1.09108945...|
|  2|[20.0,30000.0,2.0]|[0.0,0.2182178902...|
|  3|[30.0,40000.0,3.0]|[1.0,0.8728715609...|
+---+------------------+--------------------+



In [16]:
from pyspark.ml.feature import Bucketizer
# Boundaries for four buckets, open-ended at both extremes.
splits = [float('-inf'), -10.0, 0.0, 10.0, float('inf')]

# One single-column row per sample value.
b_data = [(value,) for value in (-800.0, -10.5, -1.7, 0.0, 8.2, 90.1)]
b_df = spark.createDataFrame(b_data, schema=['features'])
b_df.show()

+--------+
|features|
+--------+
|  -800.0|
|   -10.5|
|    -1.7|
|     0.0|
|     8.2|
|    90.1|
+--------+



In [17]:
from pyspark.ml.feature import Bucketizer
# Map each value to the index of the bucket it falls in. Buckets include their
# lower bound: in the recorded output 0.0 lands in bucket 2, not bucket 1.
bucketizer = (
    Bucketizer()
    .setSplits(splits)
    .setInputCol('features')
    .setOutputCol('bfeatures')
)
bucketed_df = bucketizer.transform(b_df)
bucketed_df.show()

+--------+---------+
|features|bfeatures|
+--------+---------+
|  -800.0|      0.0|
|   -10.5|      0.0|
|    -1.7|      1.0|
|     0.0|      2.0|
|     8.2|      2.0|
|    90.1|      3.0|
+--------+---------+



In [18]:
from pyspark.ml.feature import Tokenizer

# Three short example sentences keyed by id, used by the text-feature cells.
sentences_df = spark.createDataFrame(
    [
        (1, 'This is an introduction to Spark MLlib'),
        (2, 'MLlib includes libraries for classification and regression'),
        (3, 'It also contains supporting tools for pipelines'),
    ],
    schema=['id', 'sentence'],
)
sentences_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  1|This is an introd...|
|  2|MLlib includes li...|
|  3|It also contains ...|
+---+--------------------+



In [19]:
# Split every sentence into a `words` array; the recorded output shows the
# tokens lowercased.
sent_token = Tokenizer().setInputCol('sentence').setOutputCol('words')
sent_tokenized_df = sent_token.transform(sentences_df)
sent_tokenized_df.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|
|  2|MLlib includes li...|[mllib, includes,...|
|  3|It also contains ...|[it, also, contai...|
+---+--------------------+--------------------+



In [20]:
from pyspark.ml.feature import HashingTF, IDF

# Re-display the untokenized sentences DataFrame (column names and types only).
sentences_df

DataFrame[id: bigint, sentence: string]

In [21]:
# First raw sentence row, before tokenization.
sentences_df.take(1)

[Row(id=1, sentence='This is an introduction to Spark MLlib')]

In [22]:
# The same row after tokenization, now carrying the `words` list.
sent_tokenized_df.take(1)

[Row(id=1, sentence='This is an introduction to Spark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'])]

In [23]:
# Hash each token into one of 20 slots and count hits per slot (term
# frequencies). With so few slots, distinct words can collide: sentence 1 has
# seven different words but slot 1 ends up with a count of 2.0.
hashingTF = (
    HashingTF()
    .setInputCol('words')
    .setOutputCol('rawFeatures')
    .setNumFeatures(20)
)
sent_hfTF_df = hashingTF.transform(sent_tokenized_df)
sent_hfTF_df.show()

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|(20,[1,5,6,8,12,1...|
|  2|MLlib includes li...|[mllib, includes,...|(20,[1,6,9,12,13,...|
|  3|It also contains ...|[it, also, contai...|(20,[0,8,10,12,15...|
+---+--------------------+--------------------+--------------------+



In [24]:
# Fit inverse-document-frequency weights on the hashed term counts and apply
# them, producing the TF-IDF vectors in `idf_features`.
idf = IDF().setInputCol('rawFeatures').setOutputCol('idf_features')
idfModel = idf.fit(sent_hfTF_df)
tfidf_df = idfModel.transform(sent_hfTF_df)
tfidf_df.show()

+---+--------------------+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|        idf_features|
+---+--------------------+--------------------+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|(20,[1,5,6,8,12,1...|(20,[1,5,6,8,12,1...|
|  2|MLlib includes li...|[mllib, includes,...|(20,[1,6,9,12,13,...|(20,[1,6,9,12,13,...|
|  3|It also contains ...|[it, also, contai...|(20,[0,8,10,12,15...|(20,[0,8,10,12,15...|
+---+--------------------+--------------------+--------------------+--------------------+



In [25]:
# Inspect one full row to see the untruncated raw and IDF-weighted vectors.
tfidf_df.take(1)

[Row(id=1, sentence='This is an introduction to Spark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'], rawFeatures=SparseVector(20, {1: 2.0, 5: 1.0, 6: 1.0, 8: 1.0, 12: 1.0, 13: 1.0}), idf_features=SparseVector(20, {1: 0.5754, 5: 0.6931, 6: 0.2877, 8: 0.2877, 12: 0.0, 13: 0.2877}))]

In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Load the clustering data: the first line supplies column names and Spark
# infers the column types (three int columns in the recorded run).
cluster_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('clustering_dataset.csv')
)
cluster_df

DataFrame[col1: int, col2: int, col3: int]

In [27]:
# Print up to 75 rows of the clustering data instead of the default 20.
cluster_df.show(75)

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   7|   4|   1|
|   7|   7|   9|
|   7|   9|   6|
|   1|   6|   5|
|   6|   7|   7|
|   7|   9|   4|
|   7|  10|   6|
|   7|   8|   2|
|   8|   3|   8|
|   4|  10|   5|
|   7|   4|   5|
|   7|   8|   4|
|   2|   5|   1|
|   2|   6|   2|
|   2|   3|   8|
|   3|   9|   1|
|   4|   2|   9|
|   1|   7|   1|
|   6|   2|   3|
|   4|   1|   9|
|   4|   8|   5|
|   6|   6|   7|
|   4|   6|   2|
|   8|   1|   1|
|   7|   5|  10|
|  17|  25|  21|
|  15|  23|  32|
|  42|  25|  45|
|  41|  47|  21|
|  37|  20|  27|
|  40|  18|  26|
|  41|  28|  50|
|  32|  25|  40|
|  24|  29|  35|
|  47|  18|  47|
|  36|  42|  45|
|  49|  29|  15|
|  47|  39|  22|
|  38|  27|  25|
|  45|  23|  40|
|  23|  36|  19|
|  47|  40|  50|
|  37|  30|  40|
|  42|  48|  41|
|  29|  31|  21|
|  36|  39|  48|
|  50|  24|  31|
|  42|  44|  37|
|  37|  39|  46|
|  22|  40|  30|
|  17|  29|  41|
|  85| 100|  69|
|  68|  76|  67|
|  76|  70|  93|
|  62|  66|  91|
|  83|  93|  7

In [29]:
# Pack the three numeric columns into one `features` vector column, the input
# format the clustering estimators below are fitted on.
vectorAssembler = (
    VectorAssembler()
    .setInputCols(['col1', 'col2', 'col3'])
    .setOutputCol('features')
)
vcluster_df = vectorAssembler.transform(cluster_df)
vcluster_df.show()

+----+----+----+--------------+
|col1|col2|col3|      features|
+----+----+----+--------------+
|   7|   4|   1| [7.0,4.0,1.0]|
|   7|   7|   9| [7.0,7.0,9.0]|
|   7|   9|   6| [7.0,9.0,6.0]|
|   1|   6|   5| [1.0,6.0,5.0]|
|   6|   7|   7| [6.0,7.0,7.0]|
|   7|   9|   4| [7.0,9.0,4.0]|
|   7|  10|   6|[7.0,10.0,6.0]|
|   7|   8|   2| [7.0,8.0,2.0]|
|   8|   3|   8| [8.0,3.0,8.0]|
|   4|  10|   5|[4.0,10.0,5.0]|
|   7|   4|   5| [7.0,4.0,5.0]|
|   7|   8|   4| [7.0,8.0,4.0]|
|   2|   5|   1| [2.0,5.0,1.0]|
|   2|   6|   2| [2.0,6.0,2.0]|
|   2|   3|   8| [2.0,3.0,8.0]|
|   3|   9|   1| [3.0,9.0,1.0]|
|   4|   2|   9| [4.0,2.0,9.0]|
|   1|   7|   1| [1.0,7.0,1.0]|
|   6|   2|   3| [6.0,2.0,3.0]|
|   4|   1|   9| [4.0,1.0,9.0]|
+----+----+----+--------------+
only showing top 20 rows



In [30]:
# K-means with three clusters; the fixed seed keeps the initialization, and
# hence the fitted centres, repeatable.
kmeans = KMeans(k=3, seed=1)
kmodel = kmeans.fit(vcluster_df)

In [31]:
# Coordinates of the three fitted cluster centres, one array per cluster.
centers = kmodel.clusterCenters()
centers

[array([35.88461538, 31.46153846, 34.42307692]),
 array([5.12, 5.84, 4.84]),
 array([80.        , 79.20833333, 78.29166667])]

In [32]:
from pyspark.ml.clustering import BisectingKMeans
# Bisecting k-means on the same vectors with the same k and seed, so its
# centres can be compared with the plain k-means centres.
bkmeans = BisectingKMeans(k=3, seed=1)
bkmodel = bkmeans.fit(vcluster_df)
bkcenters = bkmodel.clusterCenters()
bkcenters

[array([5.12, 5.84, 4.84]),
 array([35.88461538, 31.46153846, 34.42307692]),
 array([80.        , 79.20833333, 78.29166667])]

In [33]:
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer

# No header option is set, so Spark assigns the positional names _c0.._c4;
# inferSchema types the four measurements as doubles and the species as string.
iris_df = spark.read.option('inferSchema', True).csv('iris.data.txt')
iris_df.head(1)

[Row(_c0=5.1, _c1=3.5, _c2=1.4, _c3=0.2, _c4='Iris-setosa')]

In [34]:
# Give the five positional columns meaningful names. toDF renames by position,
# so this cell can be re-run safely; selecting _c0.._c4 by name failed on a
# second run because iris_df had already been rebound to the renamed frame.
iris_df = iris_df.toDF(
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
)
iris_df.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa')]

In [35]:
# Assemble the four measurements into a single `features` vector column.
# Note: this rebinds `vectorAssembler`, previously configured for col1..col3.
vectorAssembler = VectorAssembler(
    inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    outputCol='features',
)

viris_df = vectorAssembler.transform(iris_df)
viris_df.head(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]))]

In [36]:
# Encode the species string as a numeric `label` column for the classifiers.
indexer = StringIndexer().setInputCol('species').setOutputCol('label')
iviris_df = indexer.fit(viris_df).transform(viris_df)
iviris_df.show(n=2)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
only showing top 2 rows



In [37]:
# Confirm the column types: four doubles, the species string, the assembled
# vector and the double-valued label.
iviris_df

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string, features: vector, label: double]

In [38]:
# One complete row, including the `features` vector and numeric `label`.
iviris_df.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]), label=0.0)]

In [39]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Seeded 60/40 random split of the labelled iris data.
# Note: `splits` previously held the Bucketizer boundaries; it is rebound here.
splits = iviris_df.randomSplit(weights=[0.6, 0.4], seed=1)

In [40]:
# First part is the training set, second part the held-out test set.
train_df, test_df = splits

In [41]:
# Size of the training part; the split weights are approximate, not exact.
train_df.count()

92

In [42]:
# Size of the held-out test part.
test_df.count()

58

In [43]:
# Train a multinomial Naive Bayes classifier and score the test set.
# NOTE(review): the features are continuous measurements, and the accuracy
# recorded below (~0.59) is far lower than the other classifiers in this
# notebook - the multinomial model may be a poor fit here; confirm.
nb = NaiveBayes().setModelType('multinomial')
nbmodel = nb.fit(train_df)

predictions_df = nbmodel.transform(test_df)

In [44]:
# Inspect one scored row: the model adds rawPrediction, probability and
# prediction columns to the input columns.
predictions_df.take(1)

[Row(sepal_length=4.5, sepal_width=2.3, petal_length=1.3, petal_width=0.3, species='Iris-setosa', features=DenseVector([4.5, 2.3, 1.3, 0.3]), label=0.0, rawPrediction=DenseVector([-10.3605, -11.0141, -11.7112]), probability=DenseVector([0.562, 0.2924, 0.1456]), prediction=0.0)]

In [45]:
# Accuracy of the Naive Bayes predictions against the true labels.
evaluator = (
    MulticlassClassificationEvaluator()
    .setLabelCol('label')
    .setPredictionCol('prediction')
    .setMetricName('accuracy')
)
nbaccuracy = evaluator.evaluate(predictions_df)
nbaccuracy

0.5862068965517241

In [46]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Network shape: 4 inputs (one per measurement), two hidden layers of 5 units,
# and 3 outputs (one per class).
layers = [4, 5, 5, 3]
mlp = MultilayerPerceptronClassifier().setLayers(layers).setSeed(1)
mlp_model = mlp.fit(train_df)
mlp_predictions = mlp_model.transform(test_df)
# Only the metric is configured, so the evaluator's default label and
# prediction column names are used.
mlp_evaluator = MulticlassClassificationEvaluator().setMetricName('accuracy')
mlp_accuracy = mlp_evaluator.evaluate(mlp_predictions)
mlp_accuracy

0.9482758620689655

In [48]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision-tree classifier on the same train/test split, scored by accuracy.
dt = DecisionTreeClassifier().setLabelCol('label').setFeaturesCol('features')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
dt_accuracy = dt_evaluator.evaluate(dt_predictions)
dt_accuracy

0.9310344827586207

In [51]:
from pyspark.ml.regression import LinearRegression
# Load the power-plant data with header names and inferred (double) types.
pp_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('power_plant.csv')
)
pp_df.show()

+-----+-----+-------+-----+------+
|   AT|    V|     AP|   RH|    PE|
+-----+-----+-------+-----+------+
|14.96|41.76|1024.07|73.17|463.26|
|25.18|62.96|1020.04|59.08|444.37|
| 5.11| 39.4|1012.16|92.14|488.56|
|20.86|57.32|1010.24|76.64|446.48|
|10.82| 37.5|1009.23|96.62| 473.9|
|26.27|59.44|1012.23|58.77|443.67|
|15.89|43.96|1014.02|75.24|467.35|
| 9.48|44.71|1019.12|66.43|478.42|
|14.64| 45.0|1021.78|41.25|475.98|
|11.74|43.56|1015.14|70.72| 477.5|
|17.99|43.72|1008.64|75.04|453.02|
|20.14|46.93|1014.66|64.22|453.99|
|24.34| 73.5|1011.31|84.15|440.29|
|25.71|58.59|1012.77|61.83|451.28|
|26.19|69.34|1009.48|87.59|433.99|
|21.42|43.79|1015.76|43.08|462.19|
|18.21| 45.0|1022.86|48.84|467.54|
|11.04|41.74| 1022.6|77.51| 477.2|
|14.45|52.75|1023.97|63.59|459.85|
|13.97|38.47|1015.15|55.28| 464.3|
+-----+-----+-------+-----+------+
only showing top 20 rows



In [52]:
# Confirm that all five columns were inferred as doubles.
pp_df

DataFrame[AT: double, V: double, AP: double, RH: double, PE: double]

In [53]:
# Assemble the four predictor columns into `features`; PE is left out because
# it is used as the regression label below.
vectorAssembler = (
    VectorAssembler()
    .setInputCols(['AT', 'V', 'AP', 'RH'])
    .setOutputCol('features')
)
vpp_df = vectorAssembler.transform(pp_df)
vpp_df.head(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [54]:
# Show the original columns next to the assembled (truncated) feature vectors.
vpp_df.show()

+-----+-----+-------+-----+------+--------------------+
|   AT|    V|     AP|   RH|    PE|            features|
+-----+-----+-------+-----+------+--------------------+
|14.96|41.76|1024.07|73.17|463.26|[14.96,41.76,1024...|
|25.18|62.96|1020.04|59.08|444.37|[25.18,62.96,1020...|
| 5.11| 39.4|1012.16|92.14|488.56|[5.11,39.4,1012.1...|
|20.86|57.32|1010.24|76.64|446.48|[20.86,57.32,1010...|
|10.82| 37.5|1009.23|96.62| 473.9|[10.82,37.5,1009....|
|26.27|59.44|1012.23|58.77|443.67|[26.27,59.44,1012...|
|15.89|43.96|1014.02|75.24|467.35|[15.89,43.96,1014...|
| 9.48|44.71|1019.12|66.43|478.42|[9.48,44.71,1019....|
|14.64| 45.0|1021.78|41.25|475.98|[14.64,45.0,1021....|
|11.74|43.56|1015.14|70.72| 477.5|[11.74,43.56,1015...|
|17.99|43.72|1008.64|75.04|453.02|[17.99,43.72,1008...|
|20.14|46.93|1014.66|64.22|453.99|[20.14,46.93,1014...|
|24.34| 73.5|1011.31|84.15|440.29|[24.34,73.5,1011....|
|25.71|58.59|1012.77|61.83|451.28|[25.71,58.59,1012...|
|26.19|69.34|1009.48|87.59|433.99|[26.19,69.34,1

In [1]:
# Fit a linear regression predicting PE from the assembled features.
# This cell needs LinearRegression (imported in the cell that loads
# power_plant.csv) and vpp_df from the cells above; the NameError recorded
# below comes from running it in a kernel where that import had not been run.
lr = LinearRegression().setFeaturesCol('features').setLabelCol('PE')
lr_model = lr.fit(vpp_df)

NameError: name 'LinearRegression' is not defined

In [56]:
# Intercept term of the fitted linear model.
lr_model.intercept

454.6092744523414

In [57]:
# RMSE taken from the model's training summary, i.e. measured on the same data
# the model was fitted on (vpp_df), not on a held-out set.
lr_model.summary.rootMeanSquaredError

4.557126016749488

In [58]:
# Persist the fitted model. Plain save() raises if 'lr1.model' already exists,
# which breaks every run after the first; overwrite() replaces the old copy.
lr_model.write().overwrite().save('lr1.model')

In [60]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Reload the power-plant data for the tree-based regressors (same file and
# options as the earlier load).
pp_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('power_plant.csv')
)
pp_df.head(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26)]

In [62]:
# Rebuild the `features` vector from the four predictor columns.
vectorAssembler = VectorAssembler(
    inputCols=['AT', 'V', 'AP', 'RH'],
    outputCol='features',
)
vpp_df = vectorAssembler.transform(pp_df)
vpp_df.head(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [63]:
# 70/30 train/test split of the power-plant data. The seed pins the partition
# so the RMSE values computed from it are repeatable, matching the seeded
# split used for the iris data.
# Note: this rebinds `splits`, `train_df` and `test_df` from the iris section.
splits = vpp_df.randomSplit([0.7, 0.3], seed=1)
train_df, test_df = splits

In [67]:
# Decision-tree regression of PE, evaluated by RMSE on the held-out split.
# Note: this rebinds the `dt*` names used by the iris classifier earlier.
dt = DecisionTreeRegressor().setFeaturesCol('features').setLabelCol('PE')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = (
    RegressionEvaluator()
    .setLabelCol('PE')
    .setPredictionCol('prediction')
    .setMetricName('rmse')
)
rmse = dt_evaluator.evaluate(dt_predictions)
rmse

4.571310350538227

In [68]:
from pyspark.ml.regression import GBTRegressor
# Gradient-boosted trees on the same split, again scored by test-set RMSE.
# Note: `rmse` is overwritten here, replacing the decision-tree value.
gbt = GBTRegressor().setFeaturesCol('features').setLabelCol('PE')
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_evaluator = (
    RegressionEvaluator()
    .setLabelCol('PE')
    .setPredictionCol('prediction')
    .setMetricName('rmse')
)
rmse = gbt_evaluator.evaluate(gbt_predictions)
rmse

4.137733340128398