In [2]:
import findspark
# Must run before the `pyspark` imports below: findspark locates the local
# Spark installation and makes `pyspark` importable in this kernel.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session for this notebook: local master with four
# worker threads and the driver/executor memory settings listed below.
spark = (
    SparkSession.builder
    .appName("LineerRegresyon")
    .master("local[4]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [4]:
# Load the advertising CSV: the first line is a header, fields are
# comma-separated, and column types are inferred from the data.
df = spark.read.csv(
    "/home/taha/Downloads/Advertising.csv",
    header=True,
    sep=",",
    inferSchema=True,
)

In [6]:
# Take a look at the data. The limit is applied on the Spark side so only
# five rows are collected to the driver, instead of converting the whole
# DataFrame to pandas and then discarding all but the first five rows.
df.limit(5).toPandas()

Unnamed: 0,_c0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [7]:
# Collapse the three per-channel budgets into one total-spend column, rename
# the target column to "label", and drop the now-redundant channel columns.
df2 = (
    df.withColumn("Advertisement", df["TV"] + df["Radio"] + df["Newspaper"])
    .withColumnRenamed("Sales", "label")
    .drop("TV", "Radio", "Newspaper")
)

In [8]:
# Preview the transformed frame; limit on the Spark side so only five rows
# are collected to the driver rather than the full DataFrame.
df2.limit(5).toPandas()

Unnamed: 0,_c0,label,Advertisement
0,1,22.1,337.1
1,2,10.4,128.9
2,3,9.3,132.4
3,4,18.5,251.3
4,5,12.9,250.0


# Veri Hazırlığı Ve Keşfi

In [9]:
# Summary statistics (count, mean, stddev, min, max) for the target and the
# single feature column.
df2.select("label", "Advertisement").describe().toPandas().head()

Unnamed: 0,summary,label,Advertisement
0,count,200.0,200.0
1,mean,14.022500000000004,200.86049999999992
2,stddev,5.217456565710477,92.9851805869837
3,min,1.6,11.7
4,max,27.0,433.6


# VectorAssembler

In [10]:
# StringIndexer and OneHotEncoder are not needed here because the data has no
# categorical variables; the numeric column only has to be packed into the
# vector column that the regressor reads.

from pyspark.ml.feature import VectorAssembler
vector_assembler = VectorAssembler(
    inputCols=["Advertisement"],
    outputCol="features",
)

# Regresyon Modeli

In [11]:
from pyspark.ml.regression import LinearRegression

In [12]:
# Linear regression estimator that reads the assembled "features" vector and
# the "label" target column.
linear_reg_obj = LinearRegression(labelCol="label", featuresCol="features")

# pipeline

In [14]:
from pyspark.ml import Pipeline

In [16]:
# Chain feature assembly and regression so both are fitted and applied
# together, in this order.
pipeline_obj = Pipeline(stages=[vector_assembler, linear_reg_obj])


# veri seti ayırma

In [17]:
# 80/20 random split into training and test sets, with an explicit seed.
train_df, test_df = df2.randomSplit(weights=[0.8, 0.2], seed=142)

# Modeli Eğitme

In [18]:
pipeline_model = pipeline_obj.fit(test_df)

# Model Test

In [19]:
result_df = pipeline_model.transform(train_df)

In [20]:
# Preview label vs. prediction; limit on the Spark side so only five rows
# are collected to the driver rather than the full DataFrame.
result_df.limit(5).toPandas()

Unnamed: 0,_c0,label,Advertisement,features,prediction
0,1,22.1,337.1,[337.09999999999997],21.226067
1,2,10.4,128.9,[128.9],10.144271
2,4,18.5,251.3,[251.3],16.659217
3,5,12.9,250.0,[250.00000000000003],16.590022
4,7,11.8,113.8,[113.8],9.340548


In [21]:
# bu tahminler ortalamaya göre daha iyi tahminlerdir

In [22]:
# Fitted stages in pipeline order: the assembler followed by the regression model.
pipeline_model.stages

[VectorAssembler_486aadf5945278f87ad3, LinearRegression_4931995b32adf12d5ca5]

In [23]:
# The fitted regression model is the final stage of the two-stage pipeline.
lr_model = pipeline_model.stages[-1]

In [24]:
# Fitted slope(s): one coefficient per entry of the features vector
# (here only the Advertisement column).
lr_model.coefficients

DenseVector([0.0532])

In [25]:
# Fitted intercept of the regression line.
lr_model.intercept


3.2833514963338155

In [26]:
# R^2 of the fit. `summary` is the training summary: it is computed on the
# data that was passed to `fit`, not on held-out rows.
lr_model.summary.r2

0.8548160234280924

In [27]:
# Two-sided p-values of the estimated coefficients; when an intercept is
# fitted, the last element corresponds to the intercept.
lr_model.summary.pValues

[0.0, 0.00015132518985394583]

In [28]:
# t-statistics of the estimated coefficients; when an intercept is fitted,
# the last element corresponds to the intercept.
lr_model.summary.tValues

[15.537062465678869, 4.175254812869891]

In [29]:
# Root mean squared error from the training summary (same data as `fit`).
lr_model.summary.rootMeanSquaredError

2.1613647126515607

In [30]:
# örnek
# 100 bin liralık bir reklam bütçesiyle ne kadarlık satış gerçekleşebilir?

In [31]:
# One-element RDD holding the advertising budget to score.
# NOTE(review): presumably 100.0 is in the same units as the Advertisement
# column of the training data — confirm.
df_predict_rdd = spark.sparkContext.parallelize([100.0])

In [40]:
# Wrap each value in a one-element tuple so it becomes a row, then build a
# single-column DataFrame named like the feature column used for training.
df_predict = spark.createDataFrame(
    df_predict_rdd.map(lambda budget: (budget,)),
    ["Advertisement"],
)

In [41]:
# Check that the single-row input frame looks as expected.
df_predict.toPandas().head()

Unnamed: 0,Advertisement
0,100.0


In [42]:
# Add the `features` vector column to the new row using the same assembler
# that is part of the pipeline.
df_pred_vec = vector_assembler.transform(df_predict)

In [44]:
# Apply the fitted regression model to the assembled row and show the
# resulting `prediction` column.
lr_model.transform(df_pred_vec).toPandas().head()

Unnamed: 0,Advertisement,features,prediction
0,100.0,[100.0],8.60602
