In [4]:
from __future__ import print_function
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [None]:
# Create (or reuse) the Spark session that every later cell relies on.
# No `if __name__ == "__main__":` guard: in a notebook it is always true, and
# it would leave `spark` undefined for the following cells if it ever were not.
spark = (
    SparkSession.builder
    .appName("LinearRegWithSpark22")
    .getOrCreate()
)

In [None]:
# Load the admissions CSV, taking the column names from its first line.
dataset = spark.read.option("header", True).csv("Admission_Prediction.csv")

In [None]:
# Print a preview of the rows that were just loaded.
dataset.show()

In [None]:
# Print the column names and types of the raw dataset.
dataset.printSchema()

In [None]:
from pyspark.sql.functions import col

# Convert every column to float, keeping each column's original name.
new_data = dataset.select(
    [col(name).cast("float").alias(name) for name in dataset.columns]
)

In [None]:
# Print the schema again to confirm the result of casting every column to float.
new_data.printSchema()

In [None]:
from pyspark.sql.functions import col, count, isnan, when

In [None]:
# Next, check the data for missing values.

In [None]:
# Count missing entries per column. The columns are floats after the cast, so
# a value can be missing either as SQL NULL or as a floating-point NaN; both
# are counted here (isNull() alone would not count NaN).
new_data.select(
    [
        count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
        for c in new_data.columns
    ]
).show()

In [None]:
from pyspark.ml.feature import Imputer

In [None]:
# Fill the gaps in the three score/rating columns in place (output names equal
# input names). No strategy is passed, so Imputer's default strategy is used.
imputer = Imputer(
    inputCols=["GRE Score", "TOEFL Score", "University Rating"],
    outputCols=["GRE Score", "TOEFL Score", "University Rating"],
)

# Learn the fill values from the data, then apply them to the same data.
model = imputer.fit(new_data)
imputed_data = model.transform(new_data)

In [None]:
# Re-count missing entries per column after imputation. Both SQL NULL and
# floating-point NaN count as missing (isNull() alone would not count NaN).
imputed_data.select(
    [
        count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
        for c in imputed_data.columns
    ]
).show()

In [None]:
# Everything except the label column; used below for its list of column names.
features = imputed_data.drop('Chance of Admit')

In [None]:
# Combine all non-label columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=features.columns, outputCol="features")

In [None]:
# Run the assembler over the imputed data; its output column is "features".
output = assembler.transform(imputed_data)

In [None]:
# Keep only the two columns the regression needs: feature vector and label.
output = output.select(["features", "Chance of Admit"])

In [None]:
# 70/30 train/test split. The fixed seed keeps the split, and therefore every
# metric reported below, the same across reruns of the notebook.
train_df, test_df = output.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Preview both partitions of the split: training rows first, then test rows.
train_df.show()
test_df.show()

In [None]:
# Linear regression of the label on the assembled feature vector; all other
# hyper-parameters are left at their defaults.
lin_reg = LinearRegression(labelCol='Chance of Admit', featuresCol='features')

# Fit on the training partition only.
linear_model = lin_reg.fit(train_df)


In [None]:
# Report the fitted parameters; sep="" because the labels already end in a space.
print("Coefficients: ", linear_model.coefficients, sep="")
print("Intercept: ", linear_model.intercept, sep="")

In [None]:
# Fit quality on the training data, read from the fitted model's summary.
trainSummary = linear_model.summary
print("RMSE: %f" % (trainSummary.rootMeanSquaredError,))
print("r2: %f" % (trainSummary.r2,))

In [None]:
# Score the held-out test rows with the fitted model.
predictions = linear_model.transform(test_df)

# Show predicted vs. actual admission chance next to the input features.
predictions.select(["prediction", "Chance of Admit", "features"]).show()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# R^2 of the model's predictions against the true label on the test partition.
pred_evaluator = RegressionEvaluator(
    labelCol="Chance of Admit",
    predictionCol="prediction",
    metricName="r2",
)
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()