In [1]:
# findspark.init() runs before any pyspark import in this notebook; it is
# presumably what makes the local Spark installation importable — confirm
# against the findspark documentation for the installed version.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook: local mode with
# four worker threads.
# NOTE(review): with a local master the executors presumably run inside the
# driver JVM, so "spark.executor.memory" may have no effect — confirm.
spark = (
    SparkSession.builder
    .appName("MultipleLineerRegresyon")
    .master("local[4]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Veri Seti Okuma

In [3]:
# Load the advertising data set from CSV: the first row is the header,
# fields are comma-separated, and column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load("/home/taha/Downloads/Advertising.csv")
)

In [4]:
# Preview the first five rows. Limiting on the Spark side before converting
# means only those rows are collected to the driver, instead of converting
# the entire DataFrame to pandas just to display its head.
df.limit(5).toPandas()

Unnamed: 0,_c0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


# Sütun İsimlerini Değiştirelim

In [5]:
# Target column names after renaming (the sales column becomes "label").
# NOTE(review): no later cell visible in this notebook reads this list; the
# rename below spells the names out again.
yeni_nitelikler = [
    "id",
    "TV",
    "Radio",
    "Newspaper",
    "label",
]

In [6]:
# Rename the unnamed index column to "id" and the target column "Sales" to
# "label"; the three advertising-channel columns keep their names.
df2 = df.selectExpr(
    "_c0 as id",
    "TV",
    "Radio",
    "Newspaper",
    "Sales as label",
)

In [7]:
# Preview the renamed columns. Limit on the Spark side first so only five
# rows are collected to the driver rather than the whole DataFrame.
df2.limit(5).toPandas()

Unnamed: 0,id,TV,Radio,Newspaper,label
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [8]:
# Numeric feature columns fed to the model. Newspaper was included at first
# and removed after the checks done later in this notebook.
numerik_nitelikler = [
    "TV",
    "Radio",
]
# Name of the target column, kept as a one-element list.
label = [
    "label",
]

In [9]:
# Summary statistics of every column, shown as a pandas table.
(
    df2
    .describe()
    .toPandas()
    .head()
)

Unnamed: 0,summary,id,TV,Radio,Newspaper,label
0,count,200.0,200.0,200.0,200.0,200.0
1,mean,100.5,147.0425,23.264000000000024,30.553999999999995,14.022500000000004
2,stddev,57.87918451395112,85.85423631490805,14.846809176168728,21.77862083852283,5.217456565710477
3,min,1.0,0.7,0.0,0.3,1.6
4,max,200.0,296.4,49.6,114.0,27.0


# Veri Hazırlıgı

In [10]:
from pyspark.ml.feature import VectorAssembler

In [11]:
# Assemble the selected numeric columns into a single vector column named
# "features", which the regression stage reads.
vector_assembler = VectorAssembler(
    inputCols=numerik_nitelikler,
    outputCol="features",
)

# Regresyon Modeli

In [12]:
from pyspark.ml.regression import LinearRegression

In [13]:
# Linear regression estimator: reads the assembled "features" vector and is
# trained against the "label" column.
lr_obj = LinearRegression(
    featuresCol="features",
    labelCol="label",
)

# Pipeline Model

In [14]:
from pyspark.ml import Pipeline

In [15]:
# Chain feature assembly and the regression estimator so they are fitted
# and applied together, in this order.
pipeline_obj = Pipeline(
    stages=[
        vector_assembler,
        lr_obj,
    ],
)


# Veri Setini Ayırma

In [16]:
# Split into training and test sets with 0.8 / 0.2 weights; the fixed seed
# keeps the split reproducible across runs.
train_df, test_df = df2.randomSplit(weights=[0.8, 0.2], seed=142)

# Model Eğitimi

In [17]:
# Fit the pipeline stages (feature assembly, then linear regression) on the
# training split only.
pipeline_model = pipeline_obj.fit(train_df)

# Model Test

In [18]:
# Apply the fitted pipeline to the held-out test split; the result carries
# the added "features" and "prediction" columns shown in the output below.
result_df = pipeline_model.transform(test_df)

In [19]:
# Preview the first five scored rows. Limit on the Spark side first so only
# those rows are collected to the driver rather than the full result set.
result_df.limit(5).toPandas()

Unnamed: 0,id,TV,Radio,Newspaper,label,features,prediction
0,3,17.2,45.9,69.3,9.3,"[17.2, 45.9]",12.77388
1,6,8.7,48.9,75.0,7.2,"[8.7, 48.9]",12.991416
2,9,8.6,2.1,1.0,4.8,"[8.6, 2.1]",3.73113
3,10,199.8,2.6,21.2,10.6,"[199.8, 2.6]",12.283048
4,17,67.8,36.6,114.0,12.5,"[67.8, 36.6]",13.17162


# Pipeline Model Icinden Lineer Modeli Almak

In [20]:
# The fitted pipeline has two stages (assembler, regression); the fitted
# linear regression model is the final one.
lr_model = pipeline_model.stages[-1]

In [21]:
# Fitted coefficients, in the order of the assembled input columns
# (numerik_nitelikler: TV, Radio).
lr_model.coefficients

DenseVector([0.0442, 0.1978])

In [22]:
# Fitted intercept (constant term) of the regression.
lr_model.intercept

2.935593134859488

In [23]:
# R² from the model summary.
# NOTE(review): PySpark documents `summary` as describing the training set,
# so this is not a score on test_df — confirm before reporting it as such.
lr_model.summary.r2

0.8928931248714045

In [24]:
# Root mean squared error from the model summary.
# NOTE(review): like r2, this is presumably a training-set metric — confirm.
lr_model.summary.rootMeanSquaredError

1.6578475603790448

In [25]:
lr_model.summary.pValues
# Order on the first run: TV - Radio - Newspaper - intercept
# Order after dropping Newspaper: TV - Radio - intercept

[0.0, 0.0, 3.774758283725532e-15]

In [26]:
# t-statistics from the model summary; presumably in the same order as the
# p-values (TV, Radio, intercept) — confirm against the PySpark docs.
lr_model.summary.tValues

[27.918094216203865, 21.216582516976807, 8.740412243937218]

# Model Secimi

In [27]:
# Backward elimination:
# 1. Fit the model with all variables included.
# 2. Look at the p-values; among those above the threshold, remove the
#    variable with the largest p-value.
# 3. Repeat until no p-value is above the threshold.
# The threshold is usually p = 0.05.

In [28]:
# As can be seen, the p-value for Newspaper is far above the threshold,
# so Newspaper is removed from the feature list
# and the steps are repeated.

In [29]:
# After these steps all remaining p-values are significant.
# The R² value hardly changed,
# so removing the variable was the right decision.

In [30]:
# Our fitted model:
# y = 2.935593134859488 + (0.0442*TV) + (0.1978*Radio)

# Prediction

In [31]:
# Suppose we allocate a budget of 100 thousand for TV and 10 thousand for Radio.

In [32]:
import pandas as pd

In [33]:
# Single-row input for the prediction: one value per feature column, in the
# same column order the model was trained with (TV, Radio).
d = dict(TV=[100.0], Radio=[10.0])
pd_df = pd.DataFrame(d)
pd_df.head()

Unnamed: 0,TV,Radio
0,100.0,10.0


In [34]:
# Convert the pandas frame into a Spark DataFrame so the Spark ML stages can
# transform it, then print it as a check.
predict_df = spark.createDataFrame(pd_df)
predict_df.show()

+-----+-----+
|   TV|Radio|
+-----+-----+
|100.0| 10.0|
+-----+-----+



In [35]:
# Same single input row, displayed through pandas.
predict_df.toPandas().head()

Unnamed: 0,TV,Radio
0,100.0,10.0


In [36]:
# Build the "features" vector column with the same assembler used in the
# training pipeline.
predict_df_vec = vector_assembler.transform(predict_df)

In [37]:
# Score the assembled row with the fitted linear model and print the result,
# including the "prediction" column.
lr_model.transform(predict_df_vec).show()

+-----+-----+------------+-----------------+
|   TV|Radio|    features|       prediction|
+-----+-----+------------+-----------------+
|100.0| 10.0|[100.0,10.0]|9.334383277881834|
+-----+-----+------------+-----------------+

