In [1]:
import pyspark
from pyspark.sql import SparkSession
# Start a Spark session named 'Practice', or reuse the one that is already
# running in this kernel; every later cell reads data through `spark`.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [2]:
# Load the CSV with its first row as the header and let Spark infer the
# column types, then print the rows for a quick visual check.
csv_reader = spark.read.option('header', True).option('inferSchema', True)
training = csv_reader.csv('test1.csv')
training.show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   a| 20|        10|  3000|
|   b| 30|         8|  2500|
|   c| 40|         4|  1000|
|   e| 25|         2|  3000|
|   r| 30|         2|  2000|
|   a| 20|         2|  1000|
+----+---+----------+------+



In [3]:
# List the column names of the DataFrame loaded from test1.csv.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [4]:
# VectorAssembler packs several input columns into one vector column.
# Here 'Age' and "Experience" are combined into "Independent Features",
# the single feature column that will be fed to the regression model.
from pyspark.ml.feature import VectorAssembler

featureAssembler = VectorAssembler(
    inputCols=['Age', "Experience"],
    outputCol="Independent Features",
)

In [5]:
# Run the assembler over the loaded rows; the result keeps the original
# columns and appends the assembled vector column.
output = featureAssembler.transform(dataset=training)
output.show()

+----+---+----------+------+--------------------+
|Name|Age|Experience|Salary|Independent Features|
+----+---+----------+------+--------------------+
|   a| 20|        10|  3000|         [20.0,10.0]|
|   b| 30|         8|  2500|          [30.0,8.0]|
|   c| 40|         4|  1000|          [40.0,4.0]|
|   e| 25|         2|  3000|          [25.0,2.0]|
|   r| 30|         2|  2000|          [30.0,2.0]|
|   a| 20|         2|  1000|          [20.0,2.0]|
+----+---+----------+------+--------------------+



In [6]:
# Column names after the assembler has run (includes the new vector column).
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [7]:
# Keep only what the model needs: the assembled feature vector and the label.
#
# The assembler named its output "Independent Features" (capital F), while the
# rest of the notebook refers to "Independent features". Selecting the column
# by its real name and aliasing it makes that rename explicit instead of
# depending on Spark's default case-insensitive column resolution
# (spark.sql.caseSensitive=false), under which the original spelling only
# happened to work.
features_col = output["Independent Features"].alias("Independent features")
finalized_data = output.select(features_col, "Salary")
finalized_data.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|         [20.0,10.0]|  3000|
|          [30.0,8.0]|  2500|
|          [40.0,4.0]|  1000|
|          [25.0,2.0]|  3000|
|          [30.0,2.0]|  2000|
|          [20.0,2.0]|  1000|
+--------------------+------+



#### Train Test Split

In [9]:
from pyspark.ml.regression import LinearRegression

# Seed the 75/25 split so that the split, the fitted model and every metric
# below are repeatable on a fresh kernel. With only a handful of rows an
# unseeded split changes the results noticeably from run to run.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only
# ever refers to the fitted model that the following cells inspect.
lr_estimator = LinearRegression(featuresCol="Independent features", labelCol='Salary')
regressor = lr_estimator.fit(train_data)

In [10]:
# Fitted weights of the linear model, one per assembled input feature.
regressor.coefficients

DenseVector([-86.3636, 45.4545])

In [11]:
# Fitted intercept (constant term) of the linear model.
regressor.intercept

4443.181818181816

#### Prediction

In [12]:
# Score the held-out rows with the fitted model; the returned summary carries
# both the per-row predictions and the aggregate error metrics.
pred_res = regressor.evaluate(dataset=test_data)
pred_res.predictions.show()

+--------------------+------+-----------------+
|Independent features|Salary|       prediction|
+--------------------+------+-----------------+
|          [20.0,2.0]|  1000| 2806.81818181818|
|          [25.0,2.0]|  3000|2374.999999999998|
+--------------------+------+-----------------+



In [13]:
# Mean absolute error of the model's predictions on test_data.
pred_res.meanAbsoluteError

1215.909090909091