# Current Notebook is to identify whether 2 Vectors have Synergy whereby:
* 1st predictor include with 2nd predictor as Interaction obtained more R2 compare with
* Separate as different vector

### This practice is specified in Introduction to Statistical Learning
---

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import (VectorAssembler,Interaction)

In [2]:
spark = SparkSession.builder.appName("interaction").getOrCreate()

df = spark.read.csv("Advertising.csv", inferSchema=True, header=True)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- radio: double (nullable = true)
 |-- newspaper: double (nullable = true)
 |-- sales: double (nullable = true)



### Using Vector Assembler to assemble tv & radio, as well as interaction

In [3]:
assembler = VectorAssembler(inputCols=["TV", "radio"], outputCol="features")

processed_df = assembler.transform(df)
processed_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- radio: double (nullable = true)
 |-- newspaper: double (nullable = true)
 |-- sales: double (nullable = true)
 |-- features: vector (nullable = true)



In [4]:
interaction = Interaction(inputCols=["TV","radio"], outputCol="features_interact")

processed_df = interaction.transform(processed_df)
processed_df.printSchema()
processed_df.show(2)

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- radio: double (nullable = true)
 |-- newspaper: double (nullable = true)
 |-- sales: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- features_interact: vector (nullable = true)

+---+-----+-----+---------+-----+------------+-------------------+
|_c0|   TV|radio|newspaper|sales|    features|  features_interact|
+---+-----+-----+---------+-----+------------+-------------------+
|  1|230.1| 37.8|     69.2| 22.1|[230.1,37.8]|[8697.779999999999]|
|  2| 44.5| 39.3|     45.1| 10.4| [44.5,39.3]|          [1748.85]|
+---+-----+-----+---------+-----+------------+-------------------+
only showing top 2 rows



### Here we required 2 df, 1 for sales vs features and another for sales vs features_interact
---

In [5]:
from pyspark.sql.functions import col

normal_df = processed_df.select("features", col("sales").alias("label"))
normal_df.show(2)

interact_df = processed_df.select(col("features_interact").alias("features"), col("sales").alias("label"))
interact_df.show(2)

+------------+-----+
|    features|label|
+------------+-----+
|[230.1,37.8]| 22.1|
| [44.5,39.3]| 10.4|
+------------+-----+
only showing top 2 rows

+-------------------+-----+
|           features|label|
+-------------------+-----+
|[8697.779999999999]| 22.1|
|          [1748.85]| 10.4|
+-------------------+-----+
only showing top 2 rows



### Create 2 linear regression and compare with r2 
---

In [6]:
lr_normal = LinearRegression()
normal_train,normal_test = normal_df.randomSplit([0.7,0.3])

lr_interact = LinearRegression()
interact_train,interact_test = interact_df.randomSplit([0.7,0.3])

In [7]:
normal_trained = lr_normal.fit(normal_train)
interact_trained = lr_interact.fit(interact_train)

### In here, we could conclude that with the combination of tv and radio advertising boosts more sales compared to solely tv and radio in separate advertising
---

In [8]:
print(f"Normal Trained Model R2 is {normal_trained.summary.r2}")
print(f"Interactive Trained Model R2 is {interact_trained.summary.r2}")

Normal Trained Model R2 is 0.8998959891837364
Interactive Trained Model R2 is 0.9253579071296237


In [9]:
normal_tested = normal_trained.evaluate(normal_test)
interact_tested = interact_trained.evaluate(interact_test)

In [10]:
print(f"Normal Evaluate Model R2 is {normal_tested.r2}")
print(f"Interactive Evaluate Model R2 is {interact_tested.r2}")

Normal Evaluate Model R2 is 0.886010553381919
Interactive Evaluate Model R2 is 0.9357380640684074


### Check on pValue

In [11]:
normal_trained.summary.pValues

[0.0, 0.0, 3.97237798210881e-13]

In [12]:
interact_trained.summary.pValues

[0.0, 0.0]

In [13]:
# rmse test < rmse trained == underfit
print(f"Normal RMSE Test is {normal_tested.rootMeanSquaredError}")
normal_test.describe().show()

print(f"Normal RMSE Trained is {normal_trained.summary.rootMeanSquaredError}")
normal_train.describe().show()

Normal RMSE Test is 1.6848141048102707
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                56|
|   mean|13.819642857142853|
| stddev|5.0353809874041975|
|    min|               5.3|
|    max|              27.0|
+-------+------------------+

Normal RMSE Trained is 1.671576175991367
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               144|
|   mean|14.101388888888899|
| stddev| 5.301681829884761|
|    min|               1.6|
|    max|              26.2|
+-------+------------------+



In [15]:
# rmse test < rmse trained == underfit
print(f"Interact RMSE Test is {interact_tested.rootMeanSquaredError}")
interact_test.describe().show()

print(f"Intearct RMSE Trained is {interact_trained.summary.rootMeanSquaredError}")
interact_train.describe().show()

Interact RMSE Test is 1.3613198384839333
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                54|
|   mean|14.892592592592594|
| stddev| 5.420535789567471|
|    min|               3.2|
|    max|              27.0|
+-------+------------------+

Intearct RMSE Trained is 1.3945350910289491
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               146|
|   mean|13.700684931506851|
| stddev| 5.121886729083775|
|    min|               1.6|
|    max|              26.2|
+-------+------------------+



In [24]:
interact_test.show(2)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[47.559999999999995]|  3.2|
|            [133.98]|  9.5|
+--------------------+-----+
only showing top 2 rows

