## Example ML problem to predict Salary

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Missing').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/19 16:22:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load the dataset

In [3]:
df_spark = spark.read.csv('Dataset/test5.csv',header=True,inferSchema=True)

In [4]:
df_spark.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Jane| 26|         5| 87000|
|  Liza| 28|         3|110000|
|   Ram| 27|         7| 98000|
|  Abey| 21|         6| 93000|
|  Nick| 32|         9| 90000|
|Arijit| 30|         3|103000|
+------+---+----------+------+



In [5]:
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
df_spark.columns

['Name', 'Age', 'Experience', 'Salary']

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
featureassembler = VectorAssembler(inputCols = ['Age','Experience'], outputCol ='Independent_Feature')

In [9]:
df_spark = featureassembler.transform(df_spark)

In [10]:
df_spark.show()

+------+---+----------+------+-------------------+
|  Name|Age|Experience|Salary|Independent_Feature|
+------+---+----------+------+-------------------+
|  Jane| 26|         5| 87000|         [26.0,5.0]|
|  Liza| 28|         3|110000|         [28.0,3.0]|
|   Ram| 27|         7| 98000|         [27.0,7.0]|
|  Abey| 21|         6| 93000|         [21.0,6.0]|
|  Nick| 32|         9| 90000|         [32.0,9.0]|
|Arijit| 30|         3|103000|         [30.0,3.0]|
+------+---+----------+------+-------------------+



In [11]:
df_spark.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent_Feature']

In [12]:
feature_select = df_spark.select('Salary','Independent_Feature')

In [13]:
feature_select.show()

+------+-------------------+
|Salary|Independent_Feature|
+------+-------------------+
| 87000|         [26.0,5.0]|
|110000|         [28.0,3.0]|
| 98000|         [27.0,7.0]|
| 93000|         [21.0,6.0]|
| 90000|         [32.0,9.0]|
|103000|         [30.0,3.0]|
+------+-------------------+



In [14]:
from pyspark.ml.regression import LinearRegression

## Split the data

In [15]:
train_data,test_data = feature_select.randomSplit([0.75,0.25])

In [16]:
regressor = LinearRegression(featuresCol='Independent_Feature',labelCol='Salary')

In [17]:
regressor = regressor.fit(train_data)

24/08/19 16:23:01 WARN Instrumentation: [cd0637e1] regParam is zero, which might cause numerical instability and overfitting.
24/08/19 16:23:01 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/19 16:23:01 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/08/19 16:23:01 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


## Coefficients, Intercept, Predictions, metrics

In [18]:
regressor.coefficients

DenseVector([433.0651, -1551.5728])

In [19]:
regressor.intercept

91730.06583760223

In [20]:
results = regressor.evaluate(test_data)

In [21]:
results.predictions.show()

+------+-------------------+-----------------+
|Salary|Independent_Feature|       prediction|
+------+-------------------+-----------------+
|110000|         [28.0,3.0]|99201.17044623256|
+------+-------------------+-----------------+



In [22]:
results.meanAbsoluteError

10798.829553767442

In [23]:
results.meanSquaredError

116614719.73132113