#Installing Dependencies & Initiating a New Spark Session



In [None]:
#install pyspark (uncomment the next line if pyspark is not already installed)
#! pip install pyspark

In [None]:
# Start a Spark session named "spark", or reuse one that is already running.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("spark")
    .getOrCreate()
)

#Clone & Explore dataset

In [None]:
# Load diabetes.csv into a Spark DataFrame: the first line is the header and
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("diabetes.csv")
)

In [None]:
# Print the leading rows of the freshly loaded DataFrame as a text table.
df.show()


In [None]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

In [None]:
# Print the dataset shape as (rows, columns), then the number of rows per
# Outcome value (the diabetic / non-diabetic class label).
print((df.count(), len(df.columns)))
class_counts = df.groupBy('Outcome').count()
class_counts.show()

In [None]:
# Summary statistics for the columns of df, printed as a table.
df.describe().show()

#Data Preparation

In [None]:
# For every column, print how many rows hold a null in that column.
for column_name in df.columns:
    null_rows = df.filter(df[column_name].isNull())
    print(f"{column_name}:", null_rows.count())

In [None]:
# Zero is used as a stand-in for a missing reading in these columns.
def count_zeros():
    """Print, for each listed measurement column, how many rows of ``df`` equal 0."""
    for name in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
        zero_rows = df.filter(df[name] == 0)
        print(f"{name}:", zero_rows.count())


In [None]:
# Report the zero counts per measurement column.
count_zeros()

In [None]:
# Replace the placeholder zeros in each measurement column by that column's
# mean, truncated to an integer.
# NOTE: the wildcard import supplies `when`; it also rebinds names such as
# `col`, `sum`, `min` and `max` to the Spark column functions.
from pyspark.sql.functions import *

# df.columns[1:6] is presumably Glucose .. BMI (the columns count_zeros
# inspects) -- confirm against the CSV column order.
for i in df.columns[1:6]:
    # Mean of the column as it currently stands, i.e. with its zeros included.
    data = df.agg({i: 'mean'}).first()[0]
    print(f'mean value for {i} is {int(data)}')
    # The replacement has to run inside the loop: placed after it, only the
    # last column would be patched, using the last computed mean.
    df = df.withColumn(i, when(df[i] == 0, int(data)).otherwise(df[i]))

In [None]:
# Print the leading rows again to inspect the DataFrame after the zero replacement.
df.show()

#Performing Correlation Analysis & Feature Selection

In [None]:
# Print the correlation of every column with the Outcome label
# (Outcome itself is included in the loop).
for column_name in df.columns:
    corr_value = df.corr("Outcome", column_name)
    print(f'Correlation to outcome for {column_name} is {corr_value}')

In [None]:
# Pack the eight input columns into a single vector column named 'features'.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ],
)
output_data = assembler.transform(df)

In [None]:
# Print the schema of the assembled data, which now includes the 'features' column.
output_data.printSchema()

In [None]:
# Print the leading rows of the assembled data.
output_data.show()

#Split Dataset & Build the Model

In [None]:
# Keep only the feature vector and the label for modelling.
from pyspark.ml.classification import LogisticRegression

final_data = output_data.select(['features', 'Outcome'])

In [None]:
# Print the schema of the modelling data ('features' and 'Outcome').
final_data.printSchema()

In [None]:
# Split the dataset and build the model.
# 70/30 train/test split; the fixed seed keeps the split, and therefore the
# metrics computed from it, repeatable between runs.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)
# `models` is the unfitted estimator, `model` the result of fitting it on `train`.
models = LogisticRegression(labelCol='Outcome')
model = models.fit(train)

In [None]:
# Training summary of the fitted model; print summary statistics of its
# predictions DataFrame.
summary = model.summary
summary.predictions.describe().show()

#Evaluate and Save the Model

In [None]:
# Evaluate the fitted model on the held-out test split.
# Despite its name, `predictions` is the evaluation result object; the
# per-row predictions are read from its `.predictions` attribute.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions = model.evaluate(test)

In [None]:
# Print the first 10 rows of the test-set predictions.
predictions.predictions.show(10)

In [None]:
# Score the model's raw predictions on the test split against the Outcome label.
evaluator = BinaryClassificationEvaluator(
    labelCol='Outcome',
    rawPredictionCol='rawPrediction',
)
scored_test = model.transform(test)
evaluator.evaluate(scored_test)

In [None]:
# save model
# write().overwrite() replaces an existing "model" directory, so re-running
# this cell does not fail because the path already exists (a plain save()
# refuses to overwrite).
model.write().overwrite().save("model")

In [None]:
# Report the installed Hadoop version.
# A bare `hadoop --version` line is parsed by Python as the arithmetic
# expression hadoop - (-version) and raises NameError, so the command-line
# tool is invoked explicitly instead.
import shutil
import subprocess

hadoop_path = shutil.which("hadoop")
if hadoop_path is None:
    print("hadoop executable not found on PATH")
else:
    # `version` is the Hadoop CLI subcommand that prints the version banner.
    hadoop_version = subprocess.run(
        [hadoop_path, "version"], capture_output=True, text=True
    )
    print(hadoop_version.stdout or hadoop_version.stderr)

In [None]:
# Read the persisted logistic-regression model back from the "model" path.
from pyspark.ml.classification import LogisticRegressionModel

model = LogisticRegressionModel.read().load('model')

#Prediction on New Data with the saved model

In [None]:
# Load the unseen records to score: the first line is the header and column
# types are inferred from the data.
test_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/diabetes_dataset/new_test.csv")
)

In [None]:
# Print the schema of the new data; it has to provide the columns the
# assembler was configured with.
test_df.printSchema()

In [None]:
# Reuse the assembler built for training to add the 'features' vector column
# to the new data.
# NOTE(review): the zero-to-mean replacement applied to df is not applied to
# test_df here -- confirm whether that is intended.
test_data = assembler.transform(test_df)

In [None]:
# Print the schema of the new data after the 'features' column was added.
test_data.printSchema()

In [None]:
# Score the new data with the reloaded model and print the schema of the
# result, which carries the columns added by the model.
results = model.transform(test_data)
results.printSchema()

In [None]:
# Print the feature vector next to the predicted class for the leading rows.
results.select('features', 'prediction').show()