# Admission Prediction with PySpark

![](https://miro.medium.com/max/500/1*5C4UQznqEiN3D6Xutlgwlg.png)

In [None]:
! pip install pyspark



In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse one that already exists),
# registered under the application name "ml_project".
spark = (
    SparkSession.builder
    .appName("ml_project")
    .getOrCreate()
)

In [None]:
# Display the session object as the cell output to confirm it was created.
spark

In [None]:
# Fetch the dataset repository; it is cloned into ./admission_dataset, which is
# where the CSV is read from below.
! git clone https://github.com/education454/admission_dataset

Cloning into 'admission_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (3/3), 5.60 KiB | 2.80 MiB/s, done.


In [None]:
# Load the admissions CSV: the first line supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    'admission_dataset/Admission_Predict_Ver1.1.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print a preview of the loaded rows.
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
shape = df.count(), len(df.columns)
print(shape)

(500, 9)


In [None]:
# Show the column names and the types that schema inference assigned to them.
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [None]:
# Print summary statistics (count, mean, stddev, ...) for every column.
df.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

In [None]:
# Remove the 'Serial No' column so it is not carried into the analysis below.
df = df.drop('Serial No')

In [None]:
# Show the first two rows to confirm the column was dropped.
df.show(2)

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
+---------+-----------+-----------------+---+---+----+--------+---------------+
only showing top 2 rows



In [None]:
# Number of rows whose 'GRE Score' value is missing.
missing_gre = df.filter(df['GRE Score'].isNull()).count()
print(missing_gre)

0


In [None]:
from pyspark.sql import functions as F

# Count the missing values of every column in ONE aggregation (a single Spark job)
# instead of launching a separate filter + count job per column.
# `when(...)` yields NULL for non-matching rows and `count` ignores NULLs, so each
# aggregate is exactly the number of rows where that column is null.
null_counts = df.select(
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first()

# Same report format as before: "<column>: <number of nulls>".
for name in df.columns:
    print(name + ':' , null_counts[name])

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


### Correlation Analysis & Feature Selection

In [None]:
# Correlation between the GRE score and the admission chance.
gre_target_corr = df.stat.corr('GRE Score', 'Chance of Admit')
print(gre_target_corr)

0.8103506354632601


In [None]:
# Report every column's correlation with the target, rounded to 3 decimals.
# The target column itself is part of df.columns, so its own line reads 1.0.
target = 'Chance of Admit'
for name in df.columns:
    corr_value = round(df.stat.corr(name, target), 3)
    print(f"{name} is {corr_value} correlated with the target variable Chance of Admit")

GRE Score is 0.81 correlated with the target variable Chance of Admit
TOEFL Score is 0.792 correlated with the target variable Chance of Admit
University Rating is 0.69 correlated with the target variable Chance of Admit
SOP is 0.684 correlated with the target variable Chance of Admit
LOR is 0.645 correlated with the target variable Chance of Admit
CGPA is 0.882 correlated with the target variable Chance of Admit
Research is 0.546 correlated with the target variable Chance of Admit
Chance of Admit is 1.0 correlated with the target variable Chance of Admit


In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the three selected predictor columns into a single vector column
# named 'features', the input format the regressor is configured to read.
feature_cols = ['GRE Score', 'TOEFL Score', 'CGPA']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [None]:
# Apply the assembler: the result keeps the original columns and appends 'features'.
output_data = assembler.transform(df)
# Preview the assembled rows.
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

###  Linear Regression Model

In [None]:
from pyspark.ml.regression import LinearRegression

# Keep only what the regressor needs: the feature vector and the label column.
final_data = output_data.select(
    'features',
    'Chance of Admit',
)

In [None]:
# Confirm the modelling frame holds just the feature vector and the label.
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [None]:
# Split the data roughly 70% / 30% into training and hold-out sets. The fixed seed
# makes the split, and therefore every metric computed from it, reproducible
# across kernel restarts instead of changing on each run.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)


In [None]:
# Configure the linear-regression estimator: read inputs from 'features' and use
# 'Chance of Admit' as the label. No regularisation is set (regParam stays at zero).
models = LinearRegression(
    featuresCol='features',
    labelCol='Chance of Admit',
)
# Fit the estimator on the training split to obtain the trained model.
model = models.fit(train)


22/01/03 20:32:05 WARN Instrumentation: [2d14172f] regParam is zero, which might cause numerical instability and overfitting.


In [None]:
# Print the fitted parameters: one coefficient per assembled feature, then the intercept.
fitted_params = (
    ('Coefficients:', model.coefficients),
    ('Intercept:', model.intercept),
)
for label, value in fitted_params:
    print(label, value)

Coefficients: [0.001757969801528378,0.003691646271629612,0.14267404326029562]
Intercept: -1.4532069561978138


In [None]:
# Summary object attached to the fitted model; the metrics printed next are read from it.
summary = model.summary

In [None]:
# Print the metrics held by the model's summary (the summary of the fit itself,
# not of the hold-out split).
summary_metrics = (
    ('RMSE :', summary.rootMeanSquaredError),
    ('r2 :', summary.r2),
)
for label, value in summary_metrics:
    print(label, value)

RMSE : 0.06320934079653275
r2 : 0.7818053312668196


###  Evaluate & Save the Model

In [None]:
# Score the hold-out split; the result adds a 'prediction' column to the test rows.
predictions= model.transform(test)

In [None]:
# Preview actual labels next to the model's predictions.
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|[290.0,100.0,7.56]|           0.47| 0.5043846804562118|
|  [293.0,97.0,7.8]|           0.64| 0.5328254214283792|
| [294.0,93.0,7.36]|           0.46|0.45704022710885894|
| [295.0,99.0,7.57]|           0.37| 0.5109096236248272|
| [296.0,95.0,7.54]|           0.44|0.49362078704202816|
| [297.0,96.0,7.89]|           0.43| 0.5490063182562894|
| [298.0,92.0,7.88]|           0.51| 0.5345709625386965|
| [298.0,97.0,7.21]|           0.45|0.45743758491244657|
| [298.0,99.0,7.46]|           0.53| 0.5004893882707797|
|[298.0,101.0,7.69]|           0.53|  0.540687710763907|
|[298.0,105.0,8.54]|           0.69| 0.6767272326216764|
| [299.0,94.0,7.34]|           0.42|0.46666824152292463|
| [299.0,96.0,7.86]|           0.54| 0.5482420365615377|
|[299.0,102.0,8.62]|           0.56| 0.6788241870691396|
| [300.0,95.0,8.22]|           

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the hold-out predictions with R^2, comparing the 'prediction' column
# against the true 'Chance of Admit' label.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='Chance of Admit',
    metricName='r2',
)
test_r2 = evaluator.evaluate(predictions)
print('r2 score:', test_r2)

r2 score: 0.8401373849399525


In [None]:
# Persist the fitted model to the 'model' directory. overwrite() replaces an existing
# directory, so re-running the notebook does not fail because the path already exists
# (a plain model.save('model') raises in that case).
model.write().overwrite().save('model')

In [None]:
from pyspark.ml.regression import LinearRegressionModel

# Reload the persisted model from the 'model' directory to check that the save round-trips.
model_new= LinearRegressionModel.load('model')

In [None]:
# Display the reloaded model as the cell output.
model_new