In [1]:
import findspark
# Let findspark locate the local Spark installation so that the pyspark
# imports in the following cells resolve; this has to run before them.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the SparkSession used by the whole notebook:
# local master with 4 worker threads, 2g driver memory, 4g executor memory.
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .master("local[4]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Verisetini okuma

    // veriyi okuyarak dataframe oluşturma
    // Veri hakkında kısa bilgi: Bir ürünün satış miktarına ve bu satışlar için harcanan reklam bütçesine (TV, Radio, Newspaper) ait 200 adet kayıt
    // Veri kaynağı: https://www.kaggle.com/ishaanv/ISLR-Auto#Advertising.csv

In [4]:
# Load the Advertising CSV into a DataFrame: the first row is the header,
# fields are comma-separated and column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(header=True, sep=",", inferSchema=True)
    .load("D:\\Datasets\\Advertising.csv")
)

In [5]:
# Preview the first five rows. The limit is applied in Spark *before*
# toPandas(), so only five rows are collected to the driver instead of
# the entire DataFrame.
df.limit(5).toPandas()

Unnamed: 0,_c0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


## TV, Radio ve Newspaper sütunlarını toplayarak Advertisement adında yeni bir nitelik oluşturalım ve bu üç sütunu düşürelim

In [6]:
# Collapse the three per-channel budgets into one "Advertisement" feature,
# rename the target column "Sales" to "label" (the default label column name
# used by the regressor below), then drop the per-channel columns.
# The "_c0" column of the input is left untouched.
df2 = (
    df
    .withColumn("Advertisement", df["TV"] + df["Radio"] + df["Newspaper"])
    .withColumnRenamed("Sales", "label")
    .drop("TV", "Radio", "Newspaper")
)

In [7]:
# Preview the engineered frame. limit(5) runs in Spark first, so only five
# rows are brought to the driver rather than the whole DataFrame.
df2.limit(5).toPandas()

Unnamed: 0,_c0,label,Advertisement
0,1,22.1,337.1
1,2,10.4,128.9
2,3,9.3,132.4
3,4,18.5,251.3
4,5,12.9,250.0


# Veri Hazırlığı

In [8]:
# Summary statistics (count, mean, stddev, min, max) for the feature and the
# label; equal counts for both columns indicate there are no missing values.
(
    df2
    .describe("Advertisement", "label")
    .toPandas()
    .head()
)

Unnamed: 0,summary,Advertisement,label
0,count,200.0,200.0
1,mean,200.86049999999992,14.022500000000004
2,stddev,92.9851805869837,5.217456565710477
3,min,11.7,1.6
4,max,433.6,27.0


### count değeri her iki sütunda da 200 olduğundan boş değer olmadığı görülüyor.

In [9]:
## There are no categorical features, so StringIndexer and OneHotEncoder are skipped.

## VectorAssembler

In [10]:
from pyspark.ml.feature import VectorAssembler
# Pack the single numeric input column "Advertisement" into the vector
# column "features" that the regressor reads.
vectorAssembler = VectorAssembler(
    inputCols=["Advertisement"],
    outputCol="features",
)

# Regresyon Modeli

In [11]:
from pyspark.ml.regression import LinearRegression

In [12]:
# Unfitted linear-regression estimator: learns "label" from the "features" vector.
linear_reg_obj = LinearRegression(
    labelCol="label",
    featuresCol="features",
)

# Pipeline

In [13]:
from pyspark.ml import Pipeline

In [14]:
# Two-stage pipeline: assemble the feature vector, then fit the regressor.
pipeline_obj = Pipeline(stages=[vectorAssembler, linear_reg_obj])

# Veri Setini Ayırma

In [15]:
# 80/20 train/test split; the fixed seed keeps the split reproducible.
train_df, test_df = df2.randomSplit(
    weights=[0.8, 0.2],
    seed=142,
)

# Modeli Eğitme

In [16]:
# Fit every pipeline stage on the training split; the result is a fitted
# pipeline model that can transform new DataFrames.
pipeline_model = pipeline_obj.fit(train_df)

# Test verisiyle modeli transform etme

In [17]:
# Apply the fitted pipeline to the held-out split; this adds the "features"
# and "prediction" columns next to the existing ones.
result_df = pipeline_model.transform(test_df)

In [18]:
# Preview label vs. prediction for the first five test rows. limit(5) runs in
# Spark first, so only five rows are collected to the driver.
result_df.limit(5).toPandas()

Unnamed: 0,_c0,label,Advertisement,features,prediction
0,3,9.3,132.4,[132.39999999999998],10.791216
1,6,7.2,132.6,[132.6],10.800663
2,9,4.8,11.7,[11.7],5.089785
3,10,10.6,223.6,[223.6],15.099174
4,17,12.5,218.4,[218.4],14.853545


# LinearRegression modelini Pipeline aşamaları (stages) arasından alma

In [19]:
# The pipeline has two stages (assembler, regressor); the last one is the
# fitted linear-regression model.
lrModel = pipeline_model.stages[-1]

In [20]:
# Learned slope(s); a single entry because "Advertisement" is the only feature.
lrModel.coefficients

DenseVector([0.0472])

In [21]:
# R^2 from the model's training summary, i.e. measured on train_df rather
# than on the held-out test_df.
lrModel.summary.r2

0.7204575410444246

In [22]:
# Intercept (constant term) of the fitted line.
lrModel.intercept

4.537119328969264

In [23]:
# p-values from the training summary. Per the PySpark docs, when an intercept
# is fitted the last element belongs to the intercept, so with one feature the
# list is [coefficient, intercept].
lrModel.summary.pValues

[0.0, 3.1086244689504383e-15]

In [24]:
# t-statistics from the training summary, in the same order as pValues
# (coefficient first, intercept last).
lrModel.summary.tValues

[19.986932892719693, 8.769612805482973]

In [25]:
# Root-mean-squared error from the training summary (computed on train_df).
lrModel.summary.rootMeanSquaredError

2.678303634403656

# Tahmin

In [36]:
# Predict the sales expected for an advertising budget of 100 (stated as
# "100 thousand lira" in the original note).
# Build the one-row input frame directly with createDataFrame instead of going
# through an RDD (parallelize -> map -> toDF); the resulting schema is the same
# single double column named "Advertisement".
df_predict = spark.createDataFrame([(100.0,)], ["Advertisement"])

In [37]:
# Print the one-row input frame to confirm its column name and value.
df_predict.show()

+-------------+
|Advertisement|
+-------------+
|        100.0|
+-------------+



In [38]:
# Reuse the same assembler to add the "features" vector column that the
# regression model expects as input.
df_pred_vec = vectorAssembler.transform(df_predict)

In [39]:
# Score the assembled row with the fitted regression model and display the
# resulting "prediction" column.
(
    lrModel
    .transform(df_pred_vec)
    .toPandas()
    .head()
)

Unnamed: 0,Advertisement,features,prediction
0,100.0,[100.0],9.260757
