In [2]:
import findspark
findspark.init()

In [5]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [4]:
spark = SparkSession.builder.appName("Practice Session").getOrCreate()

## Part 1: Linear Regression

##### Spark Documentation Model

In [6]:
dfTutorial = spark.read.format("libsvm").load("Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/sample_linear_regression_data.txt")
modLinear = LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction')
modFit = modLinear.fit(dfTutorial)

In [18]:
modFit.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [16]:
modFit.intercept

0.14228558260358093

In [19]:
sumTrainingSummary = modFit.summary

In [20]:
sumTrainingSummary.r2

0.027839179518600154

In [21]:
df_all = spark.read.format("libsvm").load("Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/sample_linear_regression_data.txt")
train_data, test_data = df_all.randomSplit([.7, .3])
correct_model = modLinear.fit(train_data)

In [37]:
test_results = correct_model.evaluate(test_data)

In [34]:
unlabeled_data = test_data.select('features')

In [35]:
predictions = correct_model.transform(unlabeled_data)

In [36]:
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...|-1.7943011606013728|
|(10,[0,1,2,3,4,5,...| -0.474180171536825|
|(10,[0,1,2,3,4,5,...|  -1.72228884358204|
|(10,[0,1,2,3,4,5,...|-0.6012011960692427|
|(10,[0,1,2,3,4,5,...|-2.0704794968284044|
|(10,[0,1,2,3,4,5,...| -1.290438454721807|
|(10,[0,1,2,3,4,5,...| -1.053403617300507|
|(10,[0,1,2,3,4,5,...| 0.8667509483373479|
|(10,[0,1,2,3,4,5,...| 2.7701250113771883|
|(10,[0,1,2,3,4,5,...| 1.3912949043305507|
|(10,[0,1,2,3,4,5,...| 1.8549144652984897|
|(10,[0,1,2,3,4,5,...|  2.247770027991773|
|(10,[0,1,2,3,4,5,...|-0.6832450516734077|
|(10,[0,1,2,3,4,5,...|-1.9601749450694204|
|(10,[0,1,2,3,4,5,...|-0.8612668985832878|
|(10,[0,1,2,3,4,5,...| 0.1718594899224636|
|(10,[0,1,2,3,4,5,...|-0.4011853386069817|
|(10,[0,1,2,3,4,5,...|-3.4203699823268114|
|(10,[0,1,2,3,4,5,...| 1.1344043402716903|
|(10,[0,1,2,3,4,5,...|-1.2804214701868104|
+----------

##### Code-Along Example

In [42]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/Ecommerce_Customers.csv', inferSchema=True, header=True)
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [41]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [44]:
assembler = VectorAssembler(inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'], outputCol='features')

In [45]:
output = assembler.transform(data)

In [47]:
output.select('features').show()


+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [48]:
final_data = output.select('features', 'Yearly Amount Spent')

In [50]:
train_data, test_data = final_data.randomSplit([.7, .3])

In [52]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                356|
|   mean| 499.97929120032796|
| stddev|  82.75069304449161|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [53]:
lr = LinearRegression(featuresCol='features', labelCol='Yearly Amount Spent')

In [54]:
lr_model = lr.fit(train_data)

In [55]:
test_result = lr_model.evaluate(test_data)

In [60]:
test_result.r2

0.9814576455263323

In [61]:
unlabeled_data = test_data.select('features')

In [63]:
predictions = lr_model.transform(unlabeled_data)
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[31.0472221394875...|387.56597702271097|
|[31.2681042107507...|428.11446199300553|
|[31.5171218025062...| 279.7729086378197|
|[31.5261978982398...| 418.0850018077051|
|[31.5702008293202...| 564.5457390346467|
|[31.5761319713222...| 543.6109272682261|
|[31.7242025238451...|510.50938370737094|
|[31.7366356860502...|  494.504195820208|
|[31.8279790554652...| 449.7142906785955|
|[31.8512531286083...|465.43310035009904|
|[31.8530748017465...|  461.948657544822|
|[31.8648325480987...|  451.246804593628|
|[31.9120759292006...| 389.8727486456821|
|[31.9262720263601...|380.21694021072744|
|[31.9480174211613...| 457.1894439079233|
|[31.9563005605233...| 565.4963424447301|
|[32.0180740106320...|339.60199525259236|
|[32.0478146331398...| 481.6331436255873|
|[32.0498393904573...| 455.8872466273999|
|[32.0637746203136...| 389.5115938701904|
+--------------------+------------

##### Consulting Project (Hyundai Cruise Ships)

In [66]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv', inferSchema=True, header=True)
data.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [68]:
from pyspark.ml.feature import StringIndexer

In [74]:
indexer = StringIndexer(inputCol='Cruise_line', outputCol='cruise_cat')
indexed = indexer.fit(data).transform(data)
indexed.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0)]

In [75]:
assembler = VectorAssembler(inputCols=['cruise_cat', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density'], outputCol='features')

In [77]:
output = assembler.transform(indexed)
output.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_cat|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[16.0,6.0,30.2769...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[16.0,6.0,30.2769...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|       1.0|[1.0,26.0,47.262,...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|       1.0|[1.0,11.0,110.0,2...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0| 

In [78]:
final_data = output.select(['features', 'crew'])

In [82]:
train_data, test_data = final_data.randomSplit([.7,.3])
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              111|
|   mean|7.718558558558563|
| stddev|3.336715817808353|
|    min|              0.6|
|    max|             19.1|
+-------+-----------------+



In [83]:
ship_lr = LinearRegression(featuresCol='features', labelCol='crew')

In [84]:
trained_ship_model = ship_lr.fit(train_data)

In [85]:
ship_results = trained_ship_model.evaluate(test_data)

In [86]:
ship_results.r2

0.9308769357389736

In [89]:
from pyspark.sql.functions import corr
data.select(corr('crew', 'cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



## Part 2: Logistic Regression

##### Spark Documentation Example

In [97]:
from pyspark.ml.classification import LogisticRegression

In [96]:
data = spark.read.format("libsvm").load('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/sample_libsvm_data.txt')
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [98]:
my_model = LogisticRegression(featuresCol='features', labelCol='label')

In [99]:
fitted_model = my_model.fit(data)

In [103]:
log_summary = fitted_model.summary

In [104]:
log_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [105]:
train_data, test_data = data.randomSplit([.7,.3])

In [107]:
final_model = LogisticRegression(featuresCol='features', labelCol='label')
final_fit = final_model.fit(train_data)

In [108]:
final_evaluation = final_fit.evaluate(test_data)

In [110]:
from pyspark.ml.evaluation import (BinaryClassificationEvaluator, MulticlassClassificationEvaluator)

In [111]:
my_eval = BinaryClassificationEvaluator(labelCol='label')

In [112]:
final_results = my_eval.evaluate(final_evaluation.predictions)

In [113]:
final_results

1.0

##### Code Along Example

In [114]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/titanic.csv', header=True, inferSchema=True)

In [116]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [117]:
cols = data.select(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])

In [118]:
final_data = cols.na.drop()

In [119]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [121]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [122]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')

In [124]:
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'EmbarkedVec', 'Age', 'SibSp', 'Parch', 'Fare'], outputCol='features')

In [125]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [126]:
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [127]:
pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler, log_reg_titanic])

In [128]:
train_data, test_data = final_data.randomSplit([.7,.3])

In [129]:
fit_model = pipeline.fit(train_data)

In [130]:
results = fit_model.transform(test_data)

In [131]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [133]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [134]:
my_eval.evaluate(results)

0.7851745966918522

##### Consulting Example

In [135]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/customer_churn.csv', inferSchema=True, header=True)

In [137]:
from pyspark.ml.feature import VectorAssembler

In [138]:
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'], outputCol='features')

In [139]:
output = assembler.transform(data)

In [140]:
final_data = output.select(['features', 'churn'])

In [141]:
train_data, test_data = final_data.randomSplit([.7, .3])

In [142]:
lr_churn = LogisticRegression(labelCol='churn')

In [143]:
fitted_model = lr_churn.fit(train_data)

In [144]:
training_sum = fitted_model.summary

In [145]:
training_sum.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                633|                633|
|   mean|0.16429699842022116|0.12480252764612954|
| stddev|0.37083789867731537|0.33075623054484365|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [146]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [155]:
pred_and_labels = fitted_model.evaluate(test_data)

In [148]:
evaluator = BinaryClassificationEvaluator(labelCol='churn', rawPredictionCol='prediction')

In [158]:
AUC = evaluator.evaluate(pred_and_labels.predictions)
AUC

0.7835431831595515

In [159]:
final_lr_model = lr_churn.fit(final_data)

In [160]:
new_customers = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/new_customers.csv', header=True, inferSchema=True)

In [161]:
test_new_customers = assembler.transform(new_customers)

In [162]:
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [163]:
final_results = final_lr_model.transform(test_new_customers)

In [164]:
final_results.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+--------------------+--------------------+----------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|            features|       rawPrediction|         probability|prediction|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+--------------------+--------------------+----------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|[37.0,9935.53,1.0...|[2.22168680572547...|[0.90218015921764...|       0.0|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|[23.0,7526.94,1.0...|[-6.2207539991844...|[0.00198380259784...|       

## Part 3: Decision Trees and Random Forests

##### Documentation Example

In [165]:
data = spark.read.format("libsvm").load('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/sample_libsvm_data.txt')

In [167]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [168]:
train_data, test_data = data.randomSplit([.7, .3])

In [169]:
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='label')

In [170]:
rfc = RandomForestClassifier(featuresCol='features', labelCol='label')
gbt = GBTClassifier(featuresCol='features', labelCol='label')

In [171]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [173]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [174]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [181]:
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [184]:
acc_eval.evaluate(dtc_preds)

0.9130434782608695

In [185]:
rfc_model.featureImportances

SparseVector(692, {178: 0.0018, 211: 0.0006, 262: 0.0109, 272: 0.0011, 301: 0.0029, 323: 0.0389, 324: 0.0026, 330: 0.0359, 378: 0.0971, 384: 0.0368, 385: 0.0367, 405: 0.0567, 430: 0.0045, 433: 0.0667, 434: 0.05, 455: 0.0019, 484: 0.0433, 489: 0.0568, 490: 0.0528, 511: 0.0472, 512: 0.0443, 517: 0.0469, 537: 0.0067, 540: 0.1751, 549: 0.0031, 551: 0.0273, 568: 0.0431, 598: 0.0024, 620: 0.0035, 638: 0.0024})

##### Code Along Example

In [186]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/College.csv', inferSchema=True, header=True)

In [189]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [188]:
from pyspark.ml.feature import VectorAssembler

In [190]:
assembler = VectorAssembler(inputCols=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], outputCol='features')

In [192]:
output = assembler.transform(data)

In [193]:
from pyspark.ml.feature import StringIndexer

In [194]:
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [195]:
output_fixed = indexer.fit(output).transform(output)

In [196]:
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [197]:
final_data = output_fixed.select(['features', 'PrivateIndex'])

In [198]:
train_data, test_data = final_data.randomSplit([.7,.3])

In [199]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [200]:
from pyspark.ml import Pipeline

In [201]:
dtc = DecisionTreeClassifier(labelCol='PrivateIndex')
rfc = RandomForestClassifier(labelCol='PrivateIndex')
gbt = GBTClassifier(labelCol='PrivateIndex')

In [202]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [203]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [204]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [205]:
bin_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [209]:
bin_eval.evaluate(rfc_preds)

0.9869046464254535

In [211]:
bin_eval2 = BinaryClassificationEvaluator(labelCol='PrivateIndex', rawPredictionCol='prediction')

In [212]:
bin_eval2.evaluate(gbt_preds)

0.8996022892618101

In [213]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [215]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex', metricName='accuracy')

In [216]:
rfc_acc = acc_eval.evaluate(rfc_preds)
print(rfc_acc)

0.9304347826086956


##### Consulting Example

In [217]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/dog_food.csv', inferSchema=True, header=True)

In [218]:
from pyspark.ml.feature import VectorAssembler

In [219]:
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features')

In [220]:
output = assembler.transform(data)

In [221]:
from pyspark.ml.classification import RandomForestClassifier

In [222]:
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')

In [223]:
final_data = output.select(['features', 'Spoiled'])

In [225]:
rfc_model = rfc.fit(final_data)

In [226]:
rfc_model.featureImportances

SparseVector(4, {0: 0.0192, 1: 0.0166, 2: 0.9544, 3: 0.0098})

## Part 4: K-means Clustering

##### Documentation Example

In [227]:
data = spark.read.format("libsvm").load('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/sample_kmeans_data.txt')

In [228]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [229]:
from pyspark.ml.clustering import KMeans

In [230]:
final_data = data.select('features')

In [232]:
kmeans = KMeans().setK(2).setSeed(1)

In [233]:
model = kmeans.fit(final_data)

In [235]:
centers = model.clusterCenters()

In [239]:
preds = model.transform(final_data)

In [240]:
preds.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



##### Code Along Example

In [241]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/seeds_dataset.csv', inferSchema=True, header=True)

In [242]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [249]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler

In [245]:
assembler = VectorAssembler(inputCols=data.columns, outputCol='features')

In [246]:
final_data = assembler.transform(data)

In [248]:
scaler = StandardScaler(inputCol='features', outputCol='ScaledFeatures')

In [250]:
scaler_model = scaler.fit(final_data)

In [251]:
final_data = scaler_model.transform(final_data)

In [253]:
kmeans = KMeans(featuresCol='ScaledFeatures', k=3)

In [254]:
model = kmeans.fit(final_data)

In [258]:
centers = model.clusterCenters()
centers

[array([ 4.06105916, 10.13979506, 35.80536984, 11.82133095,  7.50395937,
         3.27184732, 10.42126018]),
 array([ 4.87257659, 10.88120146, 37.27692543, 12.3410157 ,  8.55443412,
         1.81649011, 10.32998598]),
 array([ 6.31670546, 12.37109759, 37.39491396, 13.91155062,  9.748067  ,
         2.39849968, 12.2661748 ])]

In [260]:
model.transform(final_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         2|
|         2|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
+----------+
only showing top 20 rows



##### Consulting Project

In [261]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/hack_data.csv', header=True, inferSchema=True)

In [262]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler

In [263]:
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [264]:
feat_cols = ['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed']

In [265]:
assembler = VectorAssembler(inputCols=feat_cols, outputCol='features')

In [266]:
final_data = assembler.transform(data)

In [267]:
scaler = StandardScaler(inputCol='features', outputCol='ScaledFeatures')

In [268]:
scaler_model = scaler.fit(final_data)

In [269]:
final_data = scaler_model.transform(final_data)

In [273]:
kmeans2 = KMeans(featuresCol='ScaledFeatures', k=2)
kmeans3 = KMeans(featuresCol='features', k=3)

In [272]:
model_k2 = kmeans2.fit(final_data)
model_k3 = kmeans3.fit(final_data)

In [274]:
cluster_final_data = final_data

In [282]:
output_k3 = model_k3.transform(cluster_final_data)
output_k3.select('prediction').groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  124|
|         2|  118|
|         0|   92|
+----------+-----+



In [281]:
output_k2 = model_k2.transform(cluster_final_data)
output_k2.select('prediction').groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

