In [1]:
# Optional one-time setup: uncomment the next line if PySpark is not yet installed.
# !pip install pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [3]:
# Start a Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("DiabetesPrediction")
    .getOrCreate()
)

# Restrict Spark's logging to warnings and above
spark.sparkContext.setLogLevel("WARN")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/17 13:08:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the CSV into a DataFrame: the first row supplies the column names and
# the column types are inferred from the values
data = spark.read.options(header=True, inferSchema=True).csv("medical_info.csv")

In [5]:
# Preview the first rows of the loaded DataFrame (show()'s defaults, spelled out)
data.show(n=20, truncate=True)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [6]:
# Preprocessing the data
# Select the input features by name rather than by position, so the label
# column ("Outcome") is never assembled into the feature vector, wherever it
# sits in the schema. "features" is excluded as well because this cell
# overwrites `data`: on a re-run the column already exists.
feature_cols = [name for name in data.columns if name not in ("Outcome", "features")]

# Combine the input features into a single vector column called "features"
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# DataFrame.drop is a no-op when the column is absent, so the first run is
# unchanged; on a re-run it removes the stale "features" column that would
# otherwise make transform() fail because the output column already exists.
data = assembler.transform(data.drop("features"))

In [7]:
# Randomly partition the rows into a 70% training set and a 30% test set,
# seeded so the same seed is used on every run
train_data, test_data = data.randomSplit(weights=[0.7, 0.3], seed=42)

In [8]:
# Configure a logistic regression estimator: it reads the assembled
# "features" vector and learns to predict the "Outcome" label
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Outcome",
)

In [9]:
# Fit the estimator on the training split to obtain the trained model
model = lr.fit(dataset=train_data)

23/05/17 13:09:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/17 13:09:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [10]:
# Apply the trained model to the held-out test split
predictions = model.transform(dataset=test_data)

In [11]:
# Evaluating the model
# BinaryClassificationEvaluator does not compute accuracy: its default metric
# is the area under the ROC curve. The metric is named explicitly here and the
# result is labelled accordingly, so the reported number is not mistaken for
# classification accuracy.
evaluator = BinaryClassificationEvaluator(labelCol="Outcome", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
print("Area under ROC:", area_under_roc)

Accuracy: 0.8509259259259259


In [12]:
# Closing the SparkSession now that the analysis is finished.
# Cells that use `spark` cannot run after this until the session cell is re-run.
spark.stop()