In [None]:
import findspark
# Run before the pyspark imports in the following cells: findspark.init()
# is what makes the local Spark installation's `pyspark` package importable.
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session named "SalesPrediction" on four
# worker threads, with executor and driver memory both set to "1g".
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("SalesPrediction")
    .config("spark.executor.memory", "1g")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

In [None]:
import os

# Location of the advertising CSV. The original machine-specific path stays the
# default so existing runs are unchanged; set the ADVERTISING_CSV environment
# variable to run the notebook elsewhere without editing this cell.
ADVERTISING_CSV = os.environ.get(
    "ADVERTISING_CSV", "/home/alper/Spark/data/Advertising.csv"
)

# Read the comma-separated file, treating the first row as column names and
# letting Spark infer each column's type.
adv = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .load(ADVERTISING_CSV)
)

In [None]:
# Bring just five rows to the driver for a quick look; `limit(5)` already caps
# the result, so the pandas frame needs no further trimming.
adv.limit(5).toPandas()

In [None]:
# Summary statistics for the columns of `adv`, converted to pandas so the
# notebook renders them as a table.
adv.describe().toPandas()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the three advertising-spend columns into one vector column named
# "features", the column name the regressor is configured to read.
vector_assembler = VectorAssembler(
    inputCols=["TV", "Radio", "Newspaper"],
    outputCol="features",
)

In [None]:
from pyspark.ml.regression import LinearRegression

# Unfitted linear regression estimator: predicts the "Sales" column from the
# assembled "features" vector.
lr_object = LinearRegression(featuresCol="features", labelCol="Sales")

In [None]:
# Roughly 80% / 20% random split into training and test frames, with a fixed
# seed for the random sampling.
train_df, test_df = adv.randomSplit(weights=[0.8, 0.2], seed=4242)

In [None]:
from pyspark.ml import Pipeline

# Two-stage pipeline: assemble the feature vector, then run the regression.
pipe = Pipeline(stages=[vector_assembler, lr_object])

In [None]:
# Fit both pipeline stages (feature assembly, then linear regression) on the
# training split only.
model = pipe.fit(train_df)

In [None]:
# Score the same rows the model was fitted on, so these are in-sample predictions.
# NOTE(review): `test_df` from the earlier randomSplit is not used in any of the
# visible cells — confirm whether held-out scoring was intended here.
resultDF = model.transform(train_df)

In [None]:
# Show five scored rows; `limit(5)` already bounds what is collected, so the
# pandas frame is displayed as-is.
resultDF.limit(5).toPandas()

In [None]:
# The fitted pipeline's stages, in pipeline order: the assembler first, the
# regression model second.
model.stages

In [None]:
# The regression is the final stage of the two-stage pipeline, so take the
# last fitted stage to get at its coefficients and summary.
lr_model = model.stages[-1]

In [None]:
# One fitted weight per assembled input column, in the assembler's input order:
# TV, Radio, Newspaper.
lr_model.coefficients

In [None]:
# Fitted constant term of the regression.
lr_model.intercept

In [None]:
# Print the fitted model as an equation. Coefficient indices follow the
# assembler's input order (TV, Radio, Newspaper). The "+" after the intercept
# was missing, which made the printed formula read as "intercept (TV * ...)".
print("y =", lr_model.intercept, "+ (TV *", lr_model.coefficients[0],
      ") + (Radio *", lr_model.coefficients[1], ") + (Newspaper *", lr_model.coefficients[2], ")")

In [None]:
# `summary` is the model's training summary, so this R² is an in-sample figure
# (computed on the data the model was fitted on).
print("R2 Score: ", lr_model.summary.r2)

In [None]:
# Root mean squared error from the model's training summary (in-sample).
print("RMSE Score: ", lr_model.summary.rootMeanSquaredError)

In [None]:
# Mean absolute error from the model's training summary (in-sample).
print("MAE Score: ", lr_model.summary.meanAbsoluteError)

In [None]:
# Mean squared error from the model's training summary (in-sample).
print("MSE Score: ", lr_model.summary.meanSquaredError)

In [None]:
import pandas as pd

# A single hand-made observation to predict for: one value per feature column,
# using the same column names the assembler expects.
test = dict(TV=[10.0], Radio=[15.0], Newspaper=[20.0])
testDF = pd.DataFrame(data=test)
testDF.head()

In [None]:
# Convert the pandas example row into a Spark DataFrame and print it.
predictDF = spark.createDataFrame(data=testDF)
predictDF.show()

In [None]:
# Add the assembled feature vector to the example row.
# The assembler refuses to write to an output column that already exists, and
# this cell rebinds `predictDF` to its own result, so a second run used to fail.
# Dropping the output column first keeps the cell re-runnable; on the first run
# the column is absent and the drop is a no-op.
predictDF = vector_assembler.transform(
    predictDF.drop(vector_assembler.getOutputCol())
)

In [None]:
# Predict Sales for the hand-built example row. `predictDF` must already carry
# the assembled "features" column, because `lr_model` is the bare regression
# stage rather than the whole pipeline.
lr_model.transform(predictDF).toPandas().head()