# Linear regression example with PySpark MLlib

In [1]:
# Locate the local Spark installation so that pyspark can be imported.
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Create the session through the builder so this cell is safe to re-run:
# getOrCreate() returns the already-running session if there is one, whereas
# constructing SparkContext("local") a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master("local").getOrCreate()

# Keep the context available under its original name for any code that uses it.
sc = spark.sparkContext

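Load the netflix-subscription dataset from the data folder. For each country, the dataset lists the size of the Netflix library (total titles, TV shows, and movies) and the monthly cost of the Basic, Standard, and Premium subscription plans.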

In [2]:
import numpy as np
from pyspark.sql import functions as F

# Relative path to the CSV. A forward slash is used as the separator because
# it is accepted on Windows as well as on Linux/macOS; the previous
# backslash-separated literal only resolved correctly on Windows.
tFile = "data/netflix-subscription.csv"

# header=True takes column names from the first row; inferSchema=True lets
# Spark derive column types from the data instead of reading everything as text.
data = spark.read.csv(tFile, header=True, inferSchema=True)

# Display the loaded table as a pandas DataFrame.
data.toPandas()

Unnamed: 0,Country_Code,Country,Total_Library_Size,Num_TV_Shows,Num_Movies,Cost_Per_Month_Basic,Cost_Per_Month_Standard,Cost_Per_Month_Premium
0,ar,Argentina,4760,3154,1606,3.74,6.30,9.26
1,au,Australia,6114,4050,2064,7.84,12.12,16.39
2,at,Austria,5640,3779,1861,9.03,14.67,20.32
3,be,Belgium,4990,3374,1616,10.16,15.24,20.32
4,bo,Bolivia,4991,3155,1836,7.99,10.99,13.99
...,...,...,...,...,...,...,...,...
60,ua,Ukraine,5336,3261,2075,5.64,8.46,11.29
61,gb,United Kingdom,6643,4551,2092,7.91,13.20,18.48
62,us,United States,5818,3826,1992,8.99,13.99,17.99
63,uy,Uruguay,4989,3154,1835,8.99,12.99,15.99


# Convert Data into features

In [3]:
from pyspark.ml.feature import VectorAssembler

# Combine the three library-size columns into a single vector column named
# "features", which is the input the regressors below are configured to read.
# NOTE(review): in the rows displayed by the loading cell, Total_Library_Size
# equals Num_TV_Shows + Num_Movies, so these inputs look linearly dependent -
# confirm whether all three are wanted.
assembler = VectorAssembler(
    inputCols=["Total_Library_Size", "Num_TV_Shows", "Num_Movies"],
    outputCol="features")

# `data` is rebound to the transformed frame, so only transform when the
# output column is not there yet. Without this guard, re-running the cell
# fails because the "features" column already exists.
if "features" not in data.columns:
    data = assembler.transform(data)

# Preview the first rows, including the new vector column.
data.toPandas().head(5)

Unnamed: 0,Country_Code,Country,Total_Library_Size,Num_TV_Shows,Num_Movies,Cost_Per_Month_Basic,Cost_Per_Month_Standard,Cost_Per_Month_Premium,features
0,ar,Argentina,4760,3154,1606,3.74,6.3,9.26,"[4760.0, 3154.0, 1606.0]"
1,au,Australia,6114,4050,2064,7.84,12.12,16.39,"[6114.0, 4050.0, 2064.0]"
2,at,Austria,5640,3779,1861,9.03,14.67,20.32,"[5640.0, 3779.0, 1861.0]"
3,be,Belgium,4990,3374,1616,10.16,15.24,20.32,"[4990.0, 3374.0, 1616.0]"
4,bo,Bolivia,4991,3155,1836,7.99,10.99,13.99,"[4991.0, 3155.0, 1836.0]"


In [4]:
# Partition the rows at random into a training set and a test set using
# weights 0.7 and 0.3; the seed is fixed so the split can be repeated.
train_data, test_data = data.randomSplit(weights=[0.7, 0.3], seed=10)

# Create Linear Regression Model

In [5]:
from pyspark.ml.regression import LinearRegression

# Configure a regularised linear regressor that predicts the Premium plan
# price from the assembled feature vector, then train it on the training split.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Cost_Per_Month_Premium',
    maxIter=10,
    regParam=0.1,
)
model = lr.fit(train_data)

# Score the held-out rows; the result carries an extra 'prediction' column.
predictions = model.transform(test_data)

# Report the fitted parameters, then preview the first scored rows.
print(f"Coefficients: {model.coefficients}")
print(f"Intercept: {model.intercept}")
predictions.toPandas().head(5)

Coefficients: [4.181061449333389e-05,0.0005929975095583083,-0.002823846451108542]
Intercept: 18.030308206811153


Unnamed: 0,Country_Code,Country,Total_Library_Size,Num_TV_Shows,Num_Movies,Cost_Per_Month_Basic,Cost_Per_Month_Standard,Cost_Per_Month_Premium,features,prediction
0,at,Austria,5640,3779,1861,9.03,14.67,20.32,"[5640.0, 3779.0, 1861.0]",15.251879
1,be,Belgium,4990,3374,1616,10.16,15.24,20.32,"[4990.0, 3374.0, 1616.0]",15.676381
2,br,Brazil,4972,3162,1810,4.61,7.11,9.96,"[4972.0, 3162.0, 1810.0]",15.002087
3,ca,Canada,6239,4311,1928,7.91,11.87,15.03,"[6239.0, 4311.0, 1928.0]",15.403201
4,ch,Switzerland,5506,3654,1852,12.88,20.46,26.96,"[5506.0, 3654.0, 1852.0]",15.197567


In [6]:
# Score the test-set predictions with two metrics: root mean squared error
# (RMSE) and the coefficient of determination (R2).
from pyspark.ml.evaluation import RegressionEvaluator

evaluator1 = RegressionEvaluator(
    labelCol='Cost_Per_Month_Premium',
    predictionCol='prediction',
    metricName='rmse',
)
evaluator2 = RegressionEvaluator(
    labelCol='Cost_Per_Month_Premium',
    predictionCol='prediction',
    metricName='r2',
)

# Each evaluator compares the label column with the prediction column.
rmse, r2 = evaluator1.evaluate(predictions), evaluator2.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) = {rmse}, R2 = {r2}")

Root Mean Squared Error (RMSE) = 4.239163817292762, R2 = 0.01479189502408551


# Use a non-linear model for regression

In [7]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees on the same features and label as the linear model,
# configured with maxIter=3 and maxDepth=2.
gbt = GBTRegressor(
    featuresCol='features',
    labelCol='Cost_Per_Month_Premium',
    maxIter=3,
    maxDepth=2,
)

# NOTE: `model` and `predictions` are rebound here, replacing the
# linear-regression objects of the same names. Any cell that reads
# `predictions` after this point scores the GBT output.
model = gbt.fit(train_data)

# Score the held-out rows with the boosted model.
predictions = model.transform(test_data)

In [8]:
# Compute RMSE and R2 for the gradient-boosted predictions so they can be
# compared with the linear-regression scores reported earlier.
evaluator1 = RegressionEvaluator(
    labelCol='Cost_Per_Month_Premium',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator1.evaluate(predictions)

evaluator2 = RegressionEvaluator(
    labelCol='Cost_Per_Month_Premium',
    predictionCol='prediction',
    metricName='r2',
)
r2 = evaluator2.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) = {rmse}, R2 = {r2}")

Root Mean Squared Error (RMSE) = 4.032765976445649, R2 = 0.10839269621987124
