In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA, StringIndexer, OneHotEncoder, ChiSqSelector
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
spark = SparkSession.builder.appName("LinearRegression").getOrCreate()

In [3]:
!ls

bitcoin.csv		  Pipeline_FeatureEngineering.ipynb
LinearRegression2.ipynb   PracticeUAS.ipynb
LinearRegression.ipynb	  student_academic_placement_performance_dataset.csv
LogisticRegression.ipynb  work


In [6]:
df = spark.read.csv("./student_academic_placement_performance_dataset.csv", header = True, inferSchema = True)
df.show()

+----------+------+--------------+--------------+-----------------+----+-------------------+---------------------+----------------+----------------+-------------+----------------------+--------------+---------------------+--------+--------------------------+----------------+------------------+
|student_id|gender|ssc_percentage|hsc_percentage|degree_percentage|cgpa|entrance_exam_score|technical_skill_score|soft_skill_score|internship_count|live_projects|work_experience_months|certifications|attendance_percentage|backlogs|extracurricular_activities|placement_status|salary_package_lpa|
+----------+------+--------------+--------------+-----------------+----+-------------------+---------------------+----------------+----------------+-------------+----------------------+--------------+---------------------+--------+--------------------------+----------------+------------------+
|         1|  Male|            53|            79|               56|8.87|                 50|                   92| 

In [7]:
df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- ssc_percentage: integer (nullable = true)
 |-- hsc_percentage: integer (nullable = true)
 |-- degree_percentage: integer (nullable = true)
 |-- cgpa: double (nullable = true)
 |-- entrance_exam_score: integer (nullable = true)
 |-- technical_skill_score: integer (nullable = true)
 |-- soft_skill_score: integer (nullable = true)
 |-- internship_count: integer (nullable = true)
 |-- live_projects: integer (nullable = true)
 |-- work_experience_months: integer (nullable = true)
 |-- certifications: integer (nullable = true)
 |-- attendance_percentage: integer (nullable = true)
 |-- backlogs: integer (nullable = true)
 |-- extracurricular_activities: string (nullable = true)
 |-- placement_status: integer (nullable = true)
 |-- salary_package_lpa: double (nullable = true)



In [8]:
df.describe().show()

+-------+------------------+------+-----------------+------------------+------------------+-----------------+-------------------+---------------------+------------------+------------------+------------------+----------------------+------------------+---------------------+------------------+--------------------------+-------------------+------------------+
|summary|        student_id|gender|   ssc_percentage|    hsc_percentage| degree_percentage|             cgpa|entrance_exam_score|technical_skill_score|  soft_skill_score|  internship_count|     live_projects|work_experience_months|    certifications|attendance_percentage|          backlogs|extracurricular_activities|   placement_status|salary_package_lpa|
+-------+------------------+------+-----------------+------------------+------------------+-----------------+-------------------+---------------------+------------------+------------------+------------------+----------------------+------------------+---------------------+------------

In [9]:
df = df.drop("student_id")

In [81]:
df.groupBy('backlogs').count().show()

+--------+-----+
|backlogs|count|
+--------+-----+
|       1|  794|
|       3|  829|
|       5|  855|
|       4|  852|
|       2|  814|
|       0|  856|
+--------+-----+



In [11]:
df = df.withColumn("extracurricular_activities", when(col('extracurricular_activities') == 'Yes', 1).otherwise(0).cast('int'))

In [12]:
df.select("extracurricular_activities").show()

+--------------------------+
|extracurricular_activities|
+--------------------------+
|                         1|
|                         0|
|                         0|
|                         0|
|                         0|
|                         0|
|                         1|
|                         0|
|                         0|
|                         0|
|                         0|
|                         1|
|                         0|
|                         1|
|                         1|
|                         1|
|                         0|
|                         1|
|                         1|
|                         0|
+--------------------------+
only showing top 20 rows



In [13]:
df.groupBy("placement_status").count().show()

+----------------+-----+
|placement_status|count|
+----------------+-----+
|               1|  866|
|               0| 4134|
+----------------+-----+



In [14]:
df.columns

['gender',
 'ssc_percentage',
 'hsc_percentage',
 'degree_percentage',
 'cgpa',
 'entrance_exam_score',
 'technical_skill_score',
 'soft_skill_score',
 'internship_count',
 'live_projects',
 'work_experience_months',
 'certifications',
 'attendance_percentage',
 'backlogs',
 'extracurricular_activities',
 'placement_status',
 'salary_package_lpa']

In [36]:
indexer = StringIndexer(inputCol = 'gender', outputCol = 'gender_idx')
ohe = OneHotEncoder(inputCol = 'gender_idx', outputCol = 'gender_enc')

assembler_num = VectorAssembler(inputCols = ['ssc_percentage',
 'hsc_percentage',
 'degree_percentage',
 'cgpa',
 'entrance_exam_score',
 'technical_skill_score',
 'soft_skill_score',
 'internship_count',
 'live_projects',
 'work_experience_months',
 'certifications',
 'attendance_percentage',
 'backlogs'], outputCol = "num_features")
scaler = StandardScaler(inputCol = "num_features", outputCol = "scaled_features", withMean = True, withStd = True)
assembler_bool = VectorAssembler(inputCols = ['placement_status', 'extracurricular_activities'], outputCol = 'bool_features')

features_cols = ['gender_enc', 'scaled_features', 'bool_features']
assembler_all = VectorAssembler(inputCols = features_cols, outputCol = 'x')

lm = LinearRegression(featuresCol = 'x', labelCol = 'salary_package_lpa')

In [37]:
pipeline = Pipeline(stages = [indexer, ohe, assembler_num, scaler, assembler_bool, assembler_all, lm])

In [38]:
train, test = df.randomSplit([0.7, 0.3], seed = 123)

In [39]:
model = pipeline.fit(train)

In [40]:
pred = model.transform(test)

In [59]:
pred.select('salary_package_lpa', 'x', 'prediction').show(truncate = False)

+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|salary_package_lpa|x                                                                                                                                                                                                                                                                               |prediction            |
+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|0.0               |[0.0,-1.6975189521231755,-1.5

In [43]:
eval = RegressionEvaluator(labelCol = "salary_package_lpa", predictionCol = 'prediction', metricName = 'rmse')
rmse = eval.evaluate(pred)
print('RMSE = {:.4f}'.format(rmse))
eval = RegressionEvaluator(labelCol = "salary_package_lpa", predictionCol = 'prediction', metricName = 'mse')
mse = eval.evaluate(pred)
print('MSE = {:.4f}'.format(mse))
eval = RegressionEvaluator(labelCol = "salary_package_lpa", predictionCol = 'prediction', metricName = 'mae')
mae = eval.evaluate(pred)
print('MAE = {:.4f}'.format(mae))
eval = RegressionEvaluator(labelCol = "salary_package_lpa", predictionCol = 'prediction', metricName = 'r2')
r2 = eval.evaluate(pred)
print('R2 = {:.2f}%'.format(r2*100))

RMSE = 1.4314
MSE = 2.0490
MAE = 0.5584
R2 = 84.51%


In [79]:
paramGrid = (ParamGridBuilder().addGrid(lm.regParam, [0.05, 0.5, 0.1]).addGrid(lm.elasticNetParam, [0.0, 0.5, 1.0]).build())
eval = RegressionEvaluator(labelCol = 'salary_package_lpa', predictionCol = 'prediction', metricName = 'rmse')

In [80]:
cv = CrossValidator(estimator = pipeline, estimatorParamMaps = paramGrid, evaluator = eval, numFolds = 3)

In [81]:
tuning_model = cv.fit(train)

In [82]:
pred_tuning = tuning_model.transform(test)

In [87]:
pred_tuning.select('x', 'prediction').show()

+--------------------+--------------------+
|                   x|          prediction|
+--------------------+--------------------+
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|   9.158744080071292|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.697518952...|0.023285709332822896|
|[0.0,-1.621534478...|0.023285709332822896|
|[0.0,-1.621534478...|0.023285709332822896|
|[0.0,-1.621534478...|0.023285709332822896|
|[0.0,-1.621534478...|0.023285709332822896|
|[0.0,-1.621534478...|0.02328570

In [88]:
bestModel = tuning_model.bestModel

In [89]:
best_lm_stage = bestModel.stages[-1]

In [93]:
print('Best regParam = {:}'.format(best_lm_stage.getRegParam()))
print('Best elasticNetParam = {:}'.format(best_lm_stage.getElasticNetParam()))

Best regParam = 0.05
Best elasticNetParam = 1.0


In [99]:
eval = RegressionEvaluator(labelCol = 'salary_package_lpa', predictionCol = 'prediction', metricName = 'rmse')
rmse = eval.evaluate(pred_tuning)
print('RMSE = {:.4f}'.format(rmse))
eval = RegressionEvaluator(labelCol = 'salary_package_lpa', predictionCol = 'prediction', metricName = 'mse')
mse = eval.evaluate(pred_tuning)
print('MSE = {:.4f}'.format(mse))
eval = RegressionEvaluator(labelCol = 'salary_package_lpa', predictionCol = 'prediction', metricName = 'mae')
mae = eval.evaluate(pred_tuning)
print('MAE = {:.4f}'.format(mae))
eval = RegressionEvaluator(labelCol = 'salary_package_lpa', predictionCol = 'prediction', metricName = 'r2')
r2 = eval.evaluate(pred_tuning)
print('R2 = {:.2f}%'.format(r2*100))

RMSE = 1.4289
MSE = 2.0419
MAE = 0.5192
R2 = 84.57%


In [98]:
eval = RegressionEvaluator(labelCol = "salary_package_lpa", predictionCol = 'prediction', metricName = 'rmse')
rmse = eval.evaluate(pred)
print('RMSE = {:.4f}'.format(rmse))
eval = RegressionEvaluator(labelCol = "salary_package_lpa", predictionCol = 'prediction', metricName = 'mse')
mse = eval.evaluate(pred)
print('MSE = {:.4f}'.format(mse))
eval = RegressionEvaluator(labelCol = "salary_package_lpa", predictionCol = 'prediction', metricName = 'mae')
mae = eval.evaluate(pred)
print('MAE = {:.4f}'.format(mae))
eval = RegressionEvaluator(labelCol = "salary_package_lpa", predictionCol = 'prediction', metricName = 'r2')
r2 = eval.evaluate(pred)
print('R2 = {:.2f}%'.format(r2*100))

RMSE = 1.4314
MSE = 2.0490
MAE = 0.5584
R2 = 84.51%
