In [5]:
# Import the SparkSession class
from pyspark.sql import SparkSession

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import round

In [2]:
# Create SparkSession object
spark = SparkSession.builder.master("local[*]").appName("test").getOrCreate()

In [3]:
# Convert categorical strings to index values
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode index values
onehot = OneHotEncoder(
    inputCols=['org_idx', 'dow'],
    outputCols=['org_dummy', 'dow_dummy']
)

# Assemble predictors into a single column
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'], outputCol='features')

# A linear regression object
regression = LinearRegression(labelCol='duration')

In [6]:
flights = spark.read.csv(
    "data/flights.csv", sep=",", header=True, inferSchema=True, nullValue="NA"
)

flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                    .drop('mile')

In [7]:
# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=43)

In [8]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Construct a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

In [10]:
sms = spark.read.csv(
    "data/sms.csv", sep=";", header=False, inferSchema=True, nullValue="NA"
)
sms = sms.withColumnRenamed("_c0", "id").withColumnRenamed("_c1", "text").withColumnRenamed("_c2", "label")
sms.show(5)

+---+--------------------+-----+
| id|                text|label|
+---+--------------------+-----+
|  1|Sorry, I'll call ...|    0|
|  2|Dont worry. I gue...|    0|
|  3|Call FREEPHONE 08...|    1|
|  4|Win a 1000 cash p...|    1|
|  5|Go until jurong p...|    0|
+---+--------------------+-----+
only showing top 5 rows



In [11]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

In [13]:

pipeline = Pipeline(stages=[indexer, onehot, assembler])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

flights_train2 = pipeline.transform(flights_train)

In [14]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression


# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Create a cross validator
cv = CrossValidator(estimator=regression, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)

# Train and test model on multiple folds of the training data
cv = cv.fit(flights_train2)

# NOTE: Since cross-validation builds multiple models, the fit() method can take a little while to complete.

In [16]:
# Create an indexer for the org field
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# Create an one-hot encoder for the indexed org field
onehot = OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy'])

# Assemble the km and one-hot encoded fields
assembler = VectorAssembler(inputCols=['km', 'org_dummy'], outputCol='features')

# Create a pipeline and cross-validator.
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator)

In [17]:
# Create parameter grid
params = ParamGridBuilder()

# Add grids for two parameters
params = params.addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0]) \
               .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])

# Build the parameter grid
params = params.build()
print('Number of models to be tested: ', len(params))

# Create cross-validator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)

Number of models to be tested:  12


In [22]:
cv = cv.fit(flights_train)

In [23]:
# Get the best model from cross validation
best_model = cv.bestModel

# Look at the stages in the best model
print(best_model.stages)

# Get the parameters for the LinearRegression object in the best model
best_model.stages[3].extractParamMap()

# Generate predictions on testing data using the best model then calculate RMSE
predictions = best_model.transform(flights_test)
print("RMSE =", evaluator.evaluate(predictions))

[StringIndexerModel: uid=StringIndexer_3eba30282d0d, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_d24dbf0f5a17, dropLast=true, handleInvalid=error, numInputCols=1, numOutputCols=1, VectorAssembler_0003a1636b21, LinearRegressionModel: uid=LinearRegression_dd4c2f823978, numFeatures=8]
RMSE = 11.120031838176322


In [31]:
# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# # Create model objects and train on training data
# tree = DecisionTreeClassifier().fit(flights_train2)
# gbt = GBTClassifier().fit(flights_train2)

# # Compare AUC on testing data
# evaluator = BinaryClassificationEvaluator()
# print(evaluator.evaluate(tree.transform(flights_test)))
# print(evaluator.evaluate(gbt.transform(flights_test)))

# # Find the number of trees and the relative importance of features
# print(gbt.getNumTrees)
# print(gbt.featureImportances)

In [32]:
# Create a random forest classifier
forest = RandomForestClassifier()

# Create a parameter grid
params = ParamGridBuilder() \
            .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
            .addGrid(forest.maxDepth, [2, 5, 10]) \
            .build()

# Create a binary classification evaluator
evaluator = BinaryClassificationEvaluator()

# Create a cross-validator
cv = CrossValidator(estimator=forest, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)

In [34]:
# # Average AUC for each parameter combination in grid
# print(cv.avgMetrics)

# # Average AUC for the best model
# print(max(cv.avgMetrics))

# # What's the optimal parameter value for maxDepth?
# print(cv.bestModel.explainParam('maxDepth'))
# # What's the optimal parameter value for featureSubsetStrategy?
# print(cv.bestModel.explainParam('featureSubsetStrategy'))

# # AUC for best model on testing data
# print(evaluator.evaluate(cv.transform(flights_test)))