In [21]:
from pyspark.sql import SparkSession

In [22]:
# Obtain the notebook's SparkSession via the builder's getOrCreate(),
# registered under the application name 'Agg'.
spark = (
    SparkSession.builder
    .appName('Agg')
    .getOrCreate()
)

In [23]:
# Echo the SparkSession object as this cell's output.
spark

In [24]:
# Load the CSV into a Spark DataFrame named `train`.
train = spark.read.csv(
    'groupbypyspark.csv',
    header=True,       # take column names from the first row of the file
    inferSchema=True,  # let Spark derive column types instead of reading everything as string
)

In [25]:
# Print the rows of `train` as a text table (show() with no arguments
# prints at most the first 20 rows).
train.show()

+---------+------------+------+---+----------+
|     Name| Departments|Salary|Age|Experience|
+---------+------------+------+---+----------+
|    Krish|Data Science| 10000| 23|         3|
|    Krish|         IOT|  5000| 30|         4|
|   Mahesh|    Big Data|  4000| 34|         1|
|    Krish|    Big Data|  4000| 27|         3|
|   Mahesh|Data Science|  3000| 23|         1|
|Sudhanshu|Data Science| 20000| 29|         6|
|Sudhanshu|         IOT| 10000| 22|         1|
|    sunny|Data Science| 10000| 21|         1|
|Sudhanshu|    Big Data|  5000| 36|        10|
+---------+------------+------+---+----------+



In [26]:
# Print the column names, inferred types and nullability of `train`.
train.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [27]:
# List the column names of `train` as the cell's output.
train.columns

['Name', 'Departments', 'Salary', 'Age', 'Experience']

In [28]:
from pyspark.ml.feature import VectorAssembler

# Combine the two numeric predictor columns into a single vector column.
# NOTE: the output column is spelled 'Independen Features'; the later cells
# that select it and pass it as featuresCol use this exact spelling, so the
# name must be changed everywhere at once if it is ever corrected.
featureAssembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independen Features',
)

In [29]:
# Apply the assembler to `train`; `output` keeps the original columns and
# adds the assembled vector column.
output=featureAssembler.transform(train)

In [30]:
# Print `output` to inspect the new vector column next to the source columns.
output.show()

+---------+------------+------+---+----------+-------------------+
|     Name| Departments|Salary|Age|Experience|Independen Features|
+---------+------------+------+---+----------+-------------------+
|    Krish|Data Science| 10000| 23|         3|         [23.0,3.0]|
|    Krish|         IOT|  5000| 30|         4|         [30.0,4.0]|
|   Mahesh|    Big Data|  4000| 34|         1|         [34.0,1.0]|
|    Krish|    Big Data|  4000| 27|         3|         [27.0,3.0]|
|   Mahesh|Data Science|  3000| 23|         1|         [23.0,1.0]|
|Sudhanshu|Data Science| 20000| 29|         6|         [29.0,6.0]|
|Sudhanshu|         IOT| 10000| 22|         1|         [22.0,1.0]|
|    sunny|Data Science| 10000| 21|         1|         [21.0,1.0]|
|Sudhanshu|    Big Data|  5000| 36|        10|        [36.0,10.0]|
+---------+------------+------+---+----------+-------------------+



In [31]:
# List the column names of `output`, including the assembled vector column.
output.columns

['Name', 'Departments', 'Salary', 'Age', 'Experience', 'Independen Features']

In [33]:
# Keep only what the regression needs: the assembled feature vector and the
# 'Salary' column that is used as the label.
finalized_data = output.select('Independen Features', 'Salary')

In [34]:
# Print the two-column modelling DataFrame (feature vector and Salary).
finalized_data.show()

+-------------------+------+
|Independen Features|Salary|
+-------------------+------+
|         [23.0,3.0]| 10000|
|         [30.0,4.0]|  5000|
|         [34.0,1.0]|  4000|
|         [27.0,3.0]|  4000|
|         [23.0,1.0]|  3000|
|         [29.0,6.0]| 20000|
|         [22.0,1.0]| 10000|
|         [21.0,1.0]| 10000|
|        [36.0,10.0]|  5000|
+-------------------+------+



In [38]:
from pyspark.ml.regression import LinearRegression

# Fixed seed for the train/test split so that a fresh "Restart & Run All"
# selects the same rows and therefore reproduces the same fitted model and
# evaluation metrics. Without a seed every run produces a different split.
SPLIT_SEED = 42

# Hold out roughly 25% of the rows for evaluation. randomSplit assigns rows
# at random, so the weights are only approximate proportions; with a dataset
# as small as the nine rows shown above, the test split can be very small.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Configure the estimator: predict 'Salary' from the assembled feature vector.
lr_estimator = LinearRegression(featuresCol='Independen Features', labelCol='Salary')

# Fit on the training split. `regressor` is bound to the fitted model, which
# the following cells query for coefficients, intercept and evaluate().
regressor = lr_estimator.fit(train_data)

In [39]:
# Fitted weights of the linear model, one per assembled feature
# (the assembler's inputCols order is Age, Experience).
regressor.coefficients

DenseVector([-547.1391, 967.3476])

In [40]:
# Fitted intercept (bias term) of the linear model.
regressor.intercept

19869.74654573199

In [42]:
# Evaluate the fitted model on the held-out split; the returned summary
# exposes the predictions and the error metrics read in the cells below.
pred_results=regressor.evaluate(test_data)

In [43]:
# Print the test rows with the model's 'prediction' column next to the
# actual 'Salary'.
pred_results.predictions.show()

+-------------------+------+-----------------+
|Independen Features|Salary|       prediction|
+-------------------+------+-----------------+
|         [22.0,1.0]| 10000|8800.033906925491|
|         [27.0,3.0]|  4000|7999.033652623548|
+-------------------+------+-----------------+



In [44]:
# Show (mean absolute error, mean squared error) on the test split as a tuple.
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(2599.499872849028, 8716094.389672069)