In [1]:
# findspark.init() runs before the first pyspark import in this notebook;
# presumably it makes the local Spark installation importable — confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the session for this notebook: local master with 4 worker threads,
# application name "LinearRegression".
# NOTE(review): with a local[4] master the executor memory setting is probably ignored
# (everything seems to run inside the driver process) — confirm.
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .master("local[4]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Veri setini okuma

In [3]:
import os

# Location of the Advertising dataset. It can be overridden with the ADVERTISING_CSV
# environment variable so the notebook is not tied to one machine's drive layout;
# the default is the original hard-coded path.
ADVERTISING_CSV = os.environ.get("ADVERTISING_CSV", "D:\\Datasets\\Advertising.csv")

# Read the comma-separated file, using the first row as the header and letting
# Spark infer the column types.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load(ADVERTISING_CSV)
)

In [4]:
# Preview the first five rows. `limit(5)` trims the data inside Spark, so only those
# rows are converted to pandas instead of collecting the whole DataFrame to the driver.
df.limit(5).toPandas()

Unnamed: 0,_c0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [6]:
# Collapse the three per-channel budgets into one total-spend column, rename Sales to
# `label` (the name the regression cell uses as its label column), and drop the
# per-channel columns.
total_spend = df["TV"] + df["Radio"] + df["Newspaper"]
df2 = (
    df.withColumn("Advertisement", total_spend)
    .withColumnRenamed("Sales", "label")
    .drop("TV", "Radio", "Newspaper")
)

# Preview the first five rows; limiting in Spark avoids pulling the whole DataFrame
# to the driver just to display a handful of rows.
df2.limit(5).toPandas()

Unnamed: 0,_c0,label,Advertisement
0,1,22.1,337.1
1,2,10.4,128.9
2,3,9.3,132.4
3,4,18.5,251.3
4,5,12.9,250.0


# Veri hazırlığı ve keşfi

In [7]:
# Summary statistics (count, mean, stddev, min, max) for the label and the feature column.
summary_stats = df2.describe("label", "Advertisement")
summary_stats.toPandas().head()

Unnamed: 0,summary,label,Advertisement
0,count,200.0,200.0
1,mean,14.022500000000004,200.86049999999992
2,stddev,5.217456565710477,92.9851805869837
3,min,1.6,11.7
4,max,27.0,433.6


# VectorAssembler

In [8]:
from pyspark.ml.feature import VectorAssembler

# Packs the single numeric input column into the vector column `features`
# that the regression stage reads.
vector_assembler = VectorAssembler(
    inputCols=["Advertisement"],
    outputCol="features",
)

# Regresyon Modeli

In [10]:
from pyspark.ml.regression import LinearRegression

# Unfitted linear-regression estimator: learns `label` from the assembled `features` vector.
linear_reg_obj = LinearRegression(
    featuresCol="features",
    labelCol="label",
)

# Pipeline

In [11]:
from pyspark.ml import Pipeline

# Chain feature assembly and regression so both run as one unit, in this order.
pipeline_stages = [vector_assembler, linear_reg_obj]
pipeline_obj = Pipeline(stages=pipeline_stages)

# Veri setini ayırma

In [12]:
# Split weights and seed are named so the 80/20 split is easy to find and tune.
TRAIN_WEIGHT, TEST_WEIGHT, SPLIT_SEED = 0.8, 0.2, 142
train_df, test_df = df2.randomSplit([TRAIN_WEIGHT, TEST_WEIGHT], seed=SPLIT_SEED)

# Modeli Eğitme

In [13]:
# Fit the pipeline (vector assembler, then linear regression) on the training split only.
pipeline_model = pipeline_obj.fit(train_df)

# Modeli test etme

In [14]:
# Apply the fitted pipeline to the held-out split; the result gains the
# `features` and `prediction` columns.
result_df = pipeline_model.transform(test_df)

In [15]:
# Preview five scored rows; limiting in Spark avoids collecting every prediction
# to the driver just to display a handful of them.
result_df.limit(5).toPandas()

Unnamed: 0,_c0,label,Advertisement,features,prediction
0,3,9.3,132.4,[132.39999999999998],10.791216
1,6,7.2,132.6,[132.6],10.800663
2,9,4.8,11.7,[11.7],5.089785
3,10,10.6,223.6,[223.6],15.099174
4,17,12.5,218.4,[218.4],14.853545


In [16]:
# Inspect the fitted stages: index 0 is the vector assembler, index 1 the regression stage.
pipeline_model.stages

[VectorAssembler_40dbb815aefb5a485c2c, LinearRegression_4d9dad77a164ffb2970c]

In [18]:
# The fitted regression model is the last of the two pipeline stages.
lr_model = pipeline_model.stages[-1]

In [19]:
# Coefficient vector of the fitted model; one entry, for the single `Advertisement` feature.
lr_model.coefficients

DenseVector([0.0472])

In [20]:
# Intercept term of the fitted line.
lr_model.intercept

4.537119328969264

In [21]:
# R² as reported by the model's `summary`.
# NOTE(review): `summary` appears to describe the fit on the training data, not on `test_df` —
# confirm; a held-out score would need the model to be evaluated on the test split instead.
lr_model.summary.r2

0.7204575410444246

In [24]:
# p-values of the fitted terms (two values for a one-feature model).
# NOTE(review): presumably ordered as coefficient(s) first and intercept last — confirm against the Spark docs.
lr_model.summary.pValues

[0.0, 3.1086244689504383e-15]

In [25]:
# t-statistics of the fitted terms (two values for a one-feature model).
# NOTE(review): presumably ordered as coefficient(s) first and intercept last — confirm against the Spark docs.
lr_model.summary.tValues

[19.986932892719693, 8.769612805482973]

In [27]:
# Root mean squared error as reported by the model's `summary`.
# NOTE(review): this looks like a training-set error, not one measured on `test_df` — confirm.
lr_model.summary.rootMeanSquaredError

2.678303634403656

In [28]:
# y = 4.537119328969264 + 0.0472 * Advertisement

In [29]:
# Predict how many sales an advertising budget of 100 (thousand lira) would generate.
# The one-row input is built directly as a DataFrame with a single `Advertisement`
# column; no intermediate RDD and Python lambda are needed for a literal value.
df_predict = spark.createDataFrame([(100.0,)], ["Advertisement"])

In [30]:
# Print the one-row prediction input to verify its contents.
df_predict.show()

+-------------+
|Advertisement|
+-------------+
|        100.0|
+-------------+



In [32]:
# Build the `features` vector column with the same assembler that the pipeline uses.
df_pred_vec = vector_assembler.transform(df_predict)

In [34]:
# Score the assembled row with the fitted regression model and display the prediction.
lr_model.transform(df_pred_vec).toPandas().head()

Unnamed: 0,Advertisement,features,prediction
0,100.0,[100.0],9.260757
