# Regression Problem using PySpark

In [1]:
#Import the required libraries:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession 

In [3]:
spark = SparkSession.builder.appName('Regression').getOrCreate()
spark

In [5]:
#To load the csv file:
df_espark = spark.read.csv('Employee.csv',header=True,inferSchema=True)
df_espark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Rahul| 27|         4| 48856|
|    Akash| 21|         4| 31376|
|    Suman| 26|         5| 47800|
|     Abhi| 28|         4| 41331|
|   Sanjay| 25|         3| 42705|
|    Akash| 26|         3| 31876|
|   Aayush| 23|         3| 40622|
|     Aman| 21|         4| 33124|
|   Preeti| 23|         5| 39764|
|  Kanchan| 28|         2| 31630|
|    Sudha| 23|         2| 30076|
|    Krish| 25|         2| 32973|
|Sudhanshu| 22|         2| 48528|
|      Sam| 26|         4| 40971|
|    Raman| 22|         3| 34320|
|    Rajan| 22|         5| 39867|
|    Rohit| 24|         2| 30006|
|   Rajesh| 22|         2| 34010|
|  Aarushi| 27|         2| 35077|
|    Arika| 23|         5| 42722|
+---------+---+----------+------+
only showing top 20 rows



In [7]:
#Print schema to check the data types of columns:
df_espark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [11]:
#To check the number of columns in dataframe:
len(df_espark.columns)

4

In [15]:
#To check the number of rows in dataframe:
df_espark.count()

36

In [19]:
#To check the five point summary:
df_espark.describe().show()

+-------+-------+------------------+-----------------+-----------------+
|summary|   Name|               Age|       Experience|           Salary|
+-------+-------+------------------+-----------------+-----------------+
|  count|     36|                36|               36|               36|
|   mean|   null| 23.97222222222222|3.111111111111111|39079.52777777778|
| stddev|   null|2.3843071603551063|1.115546702045434|6189.434968377788|
|    min|Aarushi|                20|                1|            30006|
|    max|Tamanna|                28|                5|            48856|
+-------+-------+------------------+-----------------+-----------------+



In [22]:
#To check all the columns:
df_espark.columns

['Name', 'Age', 'Experience', 'Salary']

In [29]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Age', 'Experience'],outputCol='Independent features') 


In [32]:
output = assembler.transform(df_espark)
output.show()

+---------+---+----------+------+--------------------+
|     Name|Age|Experience|Salary|Independent features|
+---------+---+----------+------+--------------------+
|    Rahul| 27|         4| 48856|          [27.0,4.0]|
|    Akash| 21|         4| 31376|          [21.0,4.0]|
|    Suman| 26|         5| 47800|          [26.0,5.0]|
|     Abhi| 28|         4| 41331|          [28.0,4.0]|
|   Sanjay| 25|         3| 42705|          [25.0,3.0]|
|    Akash| 26|         3| 31876|          [26.0,3.0]|
|   Aayush| 23|         3| 40622|          [23.0,3.0]|
|     Aman| 21|         4| 33124|          [21.0,4.0]|
|   Preeti| 23|         5| 39764|          [23.0,5.0]|
|  Kanchan| 28|         2| 31630|          [28.0,2.0]|
|    Sudha| 23|         2| 30076|          [23.0,2.0]|
|    Krish| 25|         2| 32973|          [25.0,2.0]|
|Sudhanshu| 22|         2| 48528|          [22.0,2.0]|
|      Sam| 26|         4| 40971|          [26.0,4.0]|
|    Raman| 22|         3| 34320|          [22.0,3.0]|
|    Rajan

In [33]:
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent features']

In [36]:
final_data = output.select('Independent features','Salary')
final_data.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|          [27.0,4.0]| 48856|
|          [21.0,4.0]| 31376|
|          [26.0,5.0]| 47800|
|          [28.0,4.0]| 41331|
|          [25.0,3.0]| 42705|
|          [26.0,3.0]| 31876|
|          [23.0,3.0]| 40622|
|          [21.0,4.0]| 33124|
|          [23.0,5.0]| 39764|
|          [28.0,2.0]| 31630|
|          [23.0,2.0]| 30076|
|          [25.0,2.0]| 32973|
|          [22.0,2.0]| 48528|
|          [26.0,4.0]| 40971|
|          [22.0,3.0]| 34320|
|          [22.0,5.0]| 39867|
|          [24.0,2.0]| 30006|
|          [22.0,2.0]| 34010|
|          [27.0,2.0]| 35077|
|          [23.0,5.0]| 42722|
+--------------------+------+
only showing top 20 rows



In [47]:
#Import linear regressor from PySpark
from pyspark.ml.regression import LinearRegression

#Train test split:
train_data, test_data = final_data.randomSplit([0.7,0.3])


In [46]:
#Initialize a linear regressor:
reg = LinearRegression(featuresCol="Independent features", labelCol='Salary' )
reg = reg.fit(train_data)


In [49]:
#To check the coefficients:
reg.coefficients

DenseVector([19.6232, 650.7703])

In [50]:
#To check the intercept:
reg.intercept

36427.93170161437

In [55]:
#Predict
pred = reg.evaluate(test_data)
pred.predictions.show()

+--------------------+------+------------------+
|Independent features|Salary|        prediction|
+--------------------+------+------------------+
|          [20.0,4.0]| 35601| 39423.47710816115|
|          [21.0,2.0]| 32978|38141.559699803925|
|          [22.0,2.0]| 42350| 38161.18290843267|
|          [22.0,2.0]| 48684| 38161.18290843267|
|          [22.0,5.0]| 39867| 40113.49383391161|
|          [23.0,5.0]| 39764|40133.117042540354|
|          [25.0,4.0]| 35638|39521.593151304856|
|          [26.0,3.0]| 41154| 38890.44605144062|
|          [27.0,2.0]| 35077| 38259.29895157638|
|          [27.0,4.0]| 48856| 39560.83956856234|
+--------------------+------+------------------+





In [59]:
#To check model performance
pred.meanAbsoluteError

4293.7888350429985

In [58]:
pred.meanSquaredError

28647956.109256286