In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session used by every cell below, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("advertising")
    .getOrCreate()
)

In [5]:
# Load the advertising data set. The first CSV row supplies the column names
# and Spark infers each column's type from the data.
df = spark.read.csv(
    "Advertising.csv",
    header=True,
    inferSchema=True,
)

In [7]:
# Check the inferred column names and types before modelling.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- radio: double (nullable = true)
 |-- newspaper: double (nullable = true)
 |-- sales: double (nullable = true)



In [12]:
# Summary statistics (count, mean, stddev, min, max) for each column.
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+
|summary|               _c0|               TV|             radio|         newspaper|             sales|
+-------+------------------+-----------------+------------------+------------------+------------------+
|  count|               200|              200|               200|               200|               200|
|   mean|             100.5|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|57.879184513951124|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|                 1|              0.7|               0.0|               0.3|               1.6|
|    max|               200|            296.4|              49.6|             114.0|              27.0|
+-------+------------------+-----------------+------------------+------------------+------------------+



In [24]:
# Newspaper spend and sales are only weakly correlated, but we still fit a
# model on newspaper alone to see what R² it yields.
df.corr("newspaper","sales")

0.22829902637616545

In [34]:
# VectorAssembler was never imported anywhere in the notebook, so this cell
# raised NameError on a fresh kernel (Restart & Run All).
from pyspark.ml.feature import VectorAssembler

# Spark ML estimators expect the predictors packed into a single vector
# column; here that vector holds only the newspaper spend.
assembler = VectorAssembler(inputCols=["newspaper"], outputCol="features")

# Keep just the predictor and the label, assemble the feature vector, then
# drop the raw `newspaper` column so the frame is (features, sales).
newspaper_sales_df = (
    assembler
    .transform(df.select("newspaper", "sales"))
    .select("features", "sales")
)

In [35]:
# Peek at the first two rows to confirm the (features, sales) layout.
newspaper_sales_df.head(2)

[Row(features=DenseVector([69.2]), sales=22.1),
 Row(features=DenseVector([45.1]), sales=10.4)]

In [25]:
from pyspark.ml.regression import LinearRegression

In [36]:
# Ordinary linear regression predicting `sales` from the assembled
# `features` vector ("features" is the estimator's default input column,
# spelled out here for readability).
lr = LinearRegression(
    featuresCol="features",
    labelCol="sales",
)

In [38]:
# 70/30 train/test split. Without a seed the rows in each split change on
# every run, so every metric computed from them is irreproducible; a fixed
# seed makes the split repeatable.
# NOTE: outputs recorded from an earlier unseeded run will not match exactly
# after re-executing with this seed.
train_data, test_data = newspaper_sales_df.randomSplit([0.7, 0.3], seed=42)

In [39]:
# Fit the regression on the training split only.
trained = lr.fit(train_data)

In [45]:
# Score the fitted model on the held-out test split; the result exposes
# summary metrics such as r2.
tested = trained.evaluate(test_data)

In [46]:
# R² on the held-out test split. A value at or below zero means the model
# predicts no better than always guessing the mean of sales.
tested.r2

-0.006791076999167034

In [67]:
trained.summary.r2

# The training R² is low (about 7%): newspaper spend alone explains very little of the variance in sales, although the two may still be related indirectly.

0.0701587158140552

In [56]:
# Summary statistics for the training split. Only `sales` is reported in the
# output; the vector-typed `features` column is presumably skipped because
# describe() summarises numeric and string columns.
train_data.describe().show()

+-------+------------------+
|summary|             sales|
+-------+------------------+
|  count|               139|
|   mean|14.221582733812951|
| stddev| 5.245671267056159|
|    min|               3.2|
|    max|              26.2|
+-------+------------------+



In [66]:
trained.summary.coefficientStandardErrors

# Order: [newspaper coefficient, intercept] — the intercept's standard error is the last element.

[0.01966184691911921, 0.7651312597651112]

In [61]:
# t-statistics for the newspaper coefficient and then the intercept
# (same ordering as coefficientStandardErrors).
trained.summary.tValues

[3.215116297095368, 15.929486061299626]

In [62]:
# Two-sided p-values for the newspaper coefficient and then the intercept
# (same ordering as coefficientStandardErrors).
trained.summary.pValues

[0.0016265212419017683, 0.0]