# PySpark Parallel

### Source
http://alumni.soe.ucsc.edu/~bweber/PySpark_Parallel.html

### Part 0: Single Threaded

In [2]:
import numpy as np
import pandas as pd

# Fetch the Boston housing data bundled with scikit-learn
# NOTE(review): load_boston is not available in recent scikit-learn releases
# (removed in 1.2) - confirm the pinned version before re-running.
from sklearn.datasets import load_boston
boston = load_boston()

# Append the label as a 14th column and name it "target"
boston_values = np.c_[boston['data'], boston['target']]
boston_columns = np.append(boston['feature_names'], "target")

# Build the pandas DataFrame and shuffle every row (frac=1 keeps all rows)
df_boston = pd.DataFrame(data=boston_values, columns=boston_columns).sample(frac=1)

In [6]:
# Display the raw object returned by load_boston() (arrays plus metadata)
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [7]:
# List the top-level entries available on the loaded data set
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [8]:
# Names of the input columns; these become the DataFrame column names
boston['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [3]:
# (rows, columns) of the combined frame: the features plus the "target" column
df_boston.shape

(506, 14)

In [5]:
# Preview the first five rows; the index is out of order because the frame was shuffled
df_boston.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
464,7.83932,0.0,18.1,0.0,0.655,6.209,65.4,2.9634,24.0,666.0,20.2,396.9,13.22,21.4
454,9.51363,0.0,18.1,0.0,0.713,6.728,94.1,2.4961,24.0,666.0,20.2,6.68,18.71,14.9
136,0.32264,0.0,21.89,0.0,0.624,5.942,93.5,1.9669,4.0,437.0,21.2,378.25,16.9,17.4
301,0.03537,34.0,6.09,0.0,0.433,6.59,40.4,5.4917,7.0,329.0,16.1,395.75,9.5,22.0
371,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53,50.0


In [9]:
from sklearn.linear_model import LinearRegression
# Import pearsonr from the public scipy.stats namespace; the private
# scipy.stats.stats module path is deprecated.
from scipy.stats import pearsonr

# Split into data and label arrays
y = df_boston['target']
X = df_boston.drop(['target'], axis=1)

# Create training (~80%) and test data sets.
# df_boston was shuffled when it was built, so a positional split is a random split.
X_train = X[:400]
X_test = X[400:]
y_train = y[:400]
y_test = y[400:]

# Train a linear regression model
lr = LinearRegression()
model = lr.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Error metrics: the value printed as "R-squared" is the squared Pearson
# correlation between predictions and labels; MAE is the mean absolute error.
r = pearsonr(y_pred, y_test)
mae = sum(abs(y_pred - y_test)) / len(y_test)
print("R-squared: " + str(r[0]**2))
print("MAE: " + str(mae))

R-squared: 0.7978543217908746
MAE: 3.0675595667467013


### Part 1: Native Spark

In [13]:
from pyspark.sql import SparkSession
# Obtain the SparkSession that every Spark cell below uses via the `spark` name
spark = SparkSession.builder.getOrCreate()

In [14]:
# Convert the full (shuffled) pandas frame to a Spark DataFrame;
# it is registered as the "boston" temp view in Part 3.
df_sp_boston = spark.createDataFrame(df_boston)
# Show the resulting schema
display(df_sp_boston)

DataFrame[CRIM: double, ZN: double, INDUS: double, CHAS: double, NOX: double, RM: double, AGE: double, DIS: double, RAD: double, TAX: double, PTRATIO: double, B: double, LSTAT: double, target: double]

In [15]:
from pyspark.ml.feature import VectorAssembler

# Same positional split as the single-threaded example: first 400 rows train, the rest test
train_pd, test_pd = df_boston[:400], df_boston[400:]
df_sp_boston_train = spark.createDataFrame(train_pd)
df_sp_boston_test = spark.createDataFrame(test_pd)

# Every column except the trailing "target" label is a model input
feature_cols = df_sp_boston_train.schema.names[:-1]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Pack the inputs into a single "features" vector column and keep only what MLlib needs
df_sp_boston_train, df_sp_boston_test = (
    assembler.transform(split).select("features", "target")
    for split in (df_sp_boston_train, df_sp_boston_test)
)

display(df_sp_boston_train)

DataFrame[features: vector, target: double]

In [16]:
# Linear regression with Spark
from pyspark.ml.regression import LinearRegression

# Configure the estimator: label column first, then the solver/regularisation settings
lr = LinearRegression(labelCol="target",
                      maxIter=10,
                      regParam=0.1,
                      elasticNetParam=0.5)

# Train on the training split, then score the held-out split
model = lr.fit(df_sp_boston_train)
boston_pred = model.transform(df_sp_boston_test)

# Report the squared correlation between predictions and labels
r = boston_pred.stat.corr("prediction", "target")
r_squared = r**2
print("R-squared: " + str(r_squared))

R-squared: 0.7911516559429618


In [17]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# The grid must be keyed on the params of the estimator *instance* that is
# cross-validated. The class-level attribute LinearRegression.elasticNetParam
# is not owned by any instance, so grid values keyed on it are not applied
# when the param map is copied into the estimator.
cv_estimator = LinearRegression(labelCol="target")

# Candidate elastic-net mixing values (0.0 = pure L2, 1.0 = pure L1)
# NOTE(review): regParam is left at its default here, while the previous cell
# uses regParam=0.1 - confirm whether a non-zero penalty was intended, since
# the mixing value only matters when a penalty is applied.
param_grid = (ParamGridBuilder()
              .addGrid(cv_estimator.elasticNetParam, [0.0, 0.5, 1.0])
              .build())

crossval = CrossValidator(estimator=cv_estimator,
                          estimatorParamMaps=param_grid,
                          evaluator=RegressionEvaluator(labelCol="target", metricName="r2"),
                          numFolds=10)

# Cross validate the model and select the best fit
cvModel = crossval.fit(df_sp_boston_train)
model = cvModel.bestModel

# Calculate results on the held-out test split
boston_pred = model.transform(df_sp_boston_test)
r = boston_pred.stat.corr("prediction", "target")
print("R-squared: " + str(r**2))

R-squared: 0.797854321790873


### Part 2: Thread Pools

In [19]:
# sklearn version
from sklearn.ensemble import RandomForestRegressor as RFR
from multiprocessing.pool import ThreadPool

# Worker pool shared by the thread-pool examples (at most 5 tasks at once)
pool = ThreadPool(5)

# Candidate values for the number of trees
parameters = [10, 20, 50]


def sklearn_random_forest(trees, X_train, X_test, y_train, y_test):
    """Fit a scikit-learn random forest with `trees` estimators and score it.

    Returns a two-element list: the tree count and the squared Pearson
    correlation between the test-set predictions and `y_test`.
    """
    forest = RFR(n_estimators=trees).fit(X_train, y_train)
    correlation = pearsonr(forest.predict(X_test), y_test)
    return [trees, correlation[0]**2]


def _score_sklearn(trees):
    """Bind the fixed train/test splits so the pool only varies the tree count."""
    return sklearn_random_forest(trees, X_train, X_test, y_train, y_test)


# Evaluate every candidate concurrently; results come back in input order
pool.map(_score_sklearn, parameters)

[[10, 0.8964116769881151], [20, 0.8986075120533545], [50, 0.9074035549290438]]

In [24]:
# Spark version
from pyspark.ml.regression import RandomForestRegressor


def mllib_random_forest(trees, boston_train, boston_test):
    """Fit a Spark ML random forest with `trees` trees and score it.

    Both inputs are Spark DataFrames with "features" and "target" columns.
    Returns a two-element list: the tree count and the squared correlation
    between the "prediction" and "target" columns on `boston_test`.
    """
    forest = RandomForestRegressor(labelCol="target", numTrees=trees)
    scored = forest.fit(boston_train).transform(boston_test)
    correlation = scored.stat.corr("prediction", "target")
    return [trees, correlation**2]


def _score_mllib(trees):
    """Bind the fixed Spark train/test frames so the pool only varies the tree count."""
    return mllib_random_forest(trees, df_sp_boston_train, df_sp_boston_test)


# Submit one Spark job per candidate from the shared thread pool
pool.map(_score_mllib, parameters)

[[10, 0.8843817460310426], [20, 0.8887530619275893], [50, 0.8896858789209846]]

### Part 3: Pandas UDF

In [43]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
# Explicit names instead of a wildcard import, so the notebook namespace stays predictable
from pyspark.sql.types import DoubleType, LongType, StructField, StructType

# Setup the spark DataFrame as a table
df_sp_boston.createOrReplaceTempView("boston")

# Add train/test label and expand the data set by 3x (each num trees parameter)
# NOTE(review): the first tree count is 11, whereas `parameters` in Part 2 uses 10 -
# confirm whether that difference is intentional.
full_df = spark.sql("""
  select *
  from (
    select *, case when rand() < 0.8 then 1 else 0 end as training 
    from boston
  ) b
  cross join (
      select 11 as trees union all select 20 as trees union all select 50 as trees)
""")

# Shape of the single-row frame returned for each group
schema = StructType([StructField('trees', LongType(), True),
                     StructField('r_squared', DoubleType(), True)])  

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def train_RF(boston_pd):
    """Train and score one random forest for a single `trees` group.

    `boston_pd` is the pandas frame for one group: the Boston columns plus the
    `training` flag (1 = train, 0 = test) and the group's `trees` value.
    Returns a one-row DataFrame with the tree count and the squared Pearson
    correlation between test predictions and test labels.
    """
    # Plain Python int so the returned column matches the declared LongType
    trees = int(boston_pd['trees'].unique()[0])
    
    # Get the train and test groups
    boston_train = boston_pd[boston_pd['training'] == 1]
    boston_test = boston_pd[boston_pd['training'] == 0] 
    
    # Create data and label groups. Besides the label, drop the bookkeeping
    # columns added by the SQL above so only real features reach the model.
    non_feature_cols = ['target', 'training', 'trees']
    y_train = boston_train['target']
    X_train = boston_train.drop(non_feature_cols, axis=1)
    y_test = boston_test['target']
    X_test = boston_test.drop(non_feature_cols, axis=1)
    
    # Train a random forest regressor
    rf = RFR(n_estimators = trees)
    model = rf.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    r = pearsonr(y_pred, y_test)
    
    # Return the number of trees, and the R value
    return pd.DataFrame({'trees': trees, 'r_squared': (r[0]**2)}, index=[0])

# Use the Pandas UDF: one invocation per distinct `trees` value
results = full_df.groupby('trees').apply(train_RF)

# print the results
print(results.take(3))

[Row(trees=20, r_squared=0.8243066106455859), Row(trees=50, r_squared=0.8437864365602046), Row(trees=11, r_squared=0.8243721518322042)]
