In [1]:
!pip install findspark



In [2]:
import findspark
findspark.init("C:\spark")

In [3]:
import pyspark


In [4]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles


In [5]:
spark=SparkSession.builder.appName('HeartDisease').getOrCreate()


In [6]:
df=spark.read.csv(r"C:\Users\Admin\Downloads\heart_disease_data.csv",inferSchema=True,header=True)

In [7]:
df.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|
| 57|  1|  0|     140| 192|  0|      1|    148|    0|    0.4|    1|  0|   1|     1|
| 56|  0|  1|     140| 294|  0|      0|    153|    0|    1.3|    1|  0|   2|     1|
| 44|  1|  1|     120| 263|  0|      1|    173|    0|    0.0|    2|  0|   3|     1|
| 52|  1|  2|     172| 199|  1|      1|    162|    0|    0.5|    2|  0|   3|

In [8]:
# Selecting the columns which are required
# to train and test the model.
rm_columns = df.select(['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target'])

In [9]:
result = rm_columns.na.drop()


In [10]:
# Again showing the data
result.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|
| 57|  1|  0|     140| 192|  0|      1|    148|    0|    0.4|    1|  0|   1|     1|
| 56|  0|  1|     140| 294|  0|      0|    153|    0|    1.3|    1|  0|   2|     1|
| 44|  1|  1|     120| 263|  0|      1|    173|    0|    0.0|    2|  0|   3|     1|
| 52|  1|  2|     172| 199|  1|      1|    162|    0|    0.5|    2|  0|   3|

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# Vectorizing the data into a new column "features"
# which will be our input/features class
assembler = VectorAssembler(inputCols=['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal'],
 outputCol='features')

In [13]:
# Importing Pipeline and Model
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
log_reg = LogisticRegression(featuresCol='features',
 labelCol='target')

In [14]:
# Creating the pipeline
pipe = Pipeline(stages=[assembler, log_reg])

In [15]:
# Splitting the data into train and test
train_data, test_data = result.randomSplit([0.7, .3])
# Fitting the model on training data
fit_model = pipe.fit(train_data)
# Storing the results on test data
results = fit_model.transform(test_data)

In [19]:
from pyspark.sql.functions import col
results.select(col('probability'),col('prediction')).show()

+--------------------+----------+
|         probability|prediction|
+--------------------+----------+
|[0.00228104981057...|       1.0|
|[0.56421652270649...|       0.0|
|[0.33601695287007...|       1.0|
|[8.76250034892035...|       1.0|
|[0.98897385641755...|       0.0|
|[0.29211777942451...|       1.0|
|[0.18692946445206...|       1.0|
|[0.01568373211256...|       1.0|
|[0.17911388903636...|       1.0|
|[0.99333106609496...|       0.0|
|[0.22817800499759...|       1.0|
|[0.03848147643741...|       1.0|
|[0.09519030945194...|       1.0|
|[0.05723336318922...|       1.0|
|[0.96948910945343...|       0.0|
|[0.12613486651055...|       1.0|
|[0.99497921541299...|       0.0|
|[0.36646062840297...|       1.0|
|[0.00336782557021...|       1.0|
|[0.27441869243902...|       1.0|
+--------------------+----------+
only showing top 20 rows



In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
res=BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='target')

In [21]:
ROC_AUC=res.evaluate(results)

In [22]:
ROC_AUC

0.7680275715800636

In [23]:
from __future__ import print_function
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

In [24]:
spark=SparkSession.builder.appName('DecisionTree').getOrCreate()

In [25]:
fullPredictions=model.transform(testDF).cache()


NameError: name 'model' is not defined