In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 65.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=25433bf7f1a014cad8e040dea7101a9846a690ed36c54e31181de89224d038e5
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
# Loading Data

In [3]:
!wget https://raw.githubusercontent.com/techpiyushjoshi/BigData_Assignment/main/Steroid_Data.csv

--2022-05-06 21:37:41--  https://raw.githubusercontent.com/techpiyushjoshi/BigData_Assignment/main/Steroid_Data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8307 (8.1K) [text/plain]
Saving to: ‘Steroid_Data.csv’


2022-05-06 21:37:41 (37.1 MB/s) - ‘Steroid_Data.csv’ saved [8307/8307]



In [4]:
from pyspark.sql import SparkSession

In [5]:
my_assignment = SparkSession.builder.appName("steroid").getOrCreate()

In [6]:
data = my_assignment.read.csv("Steroid_Data.csv",header = True, inferSchema = True)

In [7]:
data.printSchema()

root
 |-- Steroid_Type: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Candidate_Age: integer (nullable = true)
 |-- Pulse_Level: string (nullable = true)
 |-- Body_Fat_Level: string (nullable = true)
 |-- Adrenaline_Level: double (nullable = true)



In [8]:
data.describe().select(['Summary','Candidate_Age','Gender','Pulse_Level','Body_Fat_Level','Adrenaline_Level']).show()

+-------+------------------+------+-----------+--------------+------------------+
|Summary|     Candidate_Age|Gender|Pulse_Level|Body_Fat_Level|  Adrenaline_Level|
+-------+------------------+------+-----------+--------------+------------------+
|  count|               274|   274|        274|           274|               274|
|   mean| 44.25912408759124|  null|       null|          null|16.309408759124082|
| stddev|16.604051117594206|  null|       null|          null| 7.128744241613704|
|    min|                15|Female|       HIGH|          HIGH|             6.269|
|    max|                74|  Male|     NORMAL|        NORMAL|            38.247|
+-------+------------------+------+-----------+--------------+------------------+



In [9]:
# Data Preprocessing

In [10]:
# String Indexing and OneHotEncoding

In [11]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [12]:
Gender_Indexer = StringIndexer(inputCol='Gender',outputCol='Gender_Indexed')
Gender_Encoder = OneHotEncoder(inputCol = 'Gender_Indexed',outputCol = 'Gender_Encoder')

In [13]:
Pulse_Level_Indexer = StringIndexer(inputCol='Pulse_Level',outputCol='Pulse_Level_Indexed')
Pulse_Level_Encoder = OneHotEncoder(inputCol = 'Pulse_Level_Indexed',outputCol = 'Pulse_Level_Encoder')

In [14]:
Body_Fat_Level_Indexer = StringIndexer(inputCol='Body_Fat_Level',outputCol='Body_Fat_Level_Indexed')
Body_Fat_Level_Encoder = OneHotEncoder(inputCol = 'Body_Fat_Level_Indexed',outputCol = 'Body_Fat_Level_Encoder')

In [15]:
Steroid_Indexer = StringIndexer(inputCol='Steroid_Type',outputCol='Steroid_Indexed')
Steroid_Encoder = OneHotEncoder(inputCol = 'Steroid_Indexed',outputCol = 'Steroid_Encoder')

In [16]:
indexed = Steroid_Indexer.fit(data).transform(data)
indexed.show()

+------------+------+-------------+-----------+--------------+----------------+---------------+
|Steroid_Type|Gender|Candidate_Age|Pulse_Level|Body_Fat_Level|Adrenaline_Level|Steroid_Indexed|
+------------+------+-------------+-----------+--------------+----------------+---------------+
|           E|Female|           23|       HIGH|          HIGH|          25.355|            0.0|
|           C|  Male|           47|        LOW|          HIGH|          13.093|            4.0|
|           C|  Male|           47|        LOW|          HIGH|          10.114|            4.0|
|           D|Female|           28|     NORMAL|          HIGH|           7.798|            1.0|
|           E|Female|           61|        LOW|          HIGH|          18.043|            0.0|
|           D|Female|           22|     NORMAL|          HIGH|           8.607|            1.0|
|           E|Female|           49|     NORMAL|          HIGH|          16.275|            0.0|
|           C|  Male|           41|     

In [17]:
final_data = indexed.drop('Steroid_Type')
final_data.show()

+------+-------------+-----------+--------------+----------------+---------------+
|Gender|Candidate_Age|Pulse_Level|Body_Fat_Level|Adrenaline_Level|Steroid_Indexed|
+------+-------------+-----------+--------------+----------------+---------------+
|Female|           23|       HIGH|          HIGH|          25.355|            0.0|
|  Male|           47|        LOW|          HIGH|          13.093|            4.0|
|  Male|           47|        LOW|          HIGH|          10.114|            4.0|
|Female|           28|     NORMAL|          HIGH|           7.798|            1.0|
|Female|           61|        LOW|          HIGH|          18.043|            0.0|
|Female|           22|     NORMAL|          HIGH|           8.607|            1.0|
|Female|           49|     NORMAL|          HIGH|          16.275|            0.0|
|  Male|           41|        LOW|          HIGH|          11.037|            4.0|
|  Male|           60|     NORMAL|          HIGH|          15.171|            0.0|
|  M

In [18]:
final_data.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Candidate_Age: integer (nullable = true)
 |-- Pulse_Level: string (nullable = true)
 |-- Body_Fat_Level: string (nullable = true)
 |-- Adrenaline_Level: double (nullable = true)
 |-- Steroid_Indexed: double (nullable = false)



In [21]:
# Now combining all variables except target variable into a single feature using Vector Assembler

In [22]:
from pyspark.ml.feature import VectorAssembler

In [23]:
Assembler = VectorAssembler(inputCols = ['Gender_Encoder','Candidate_Age','Pulse_Level_Encoder','Body_Fat_Level_Encoder','Adrenaline_Level'],outputCol = 'allfeatures')

In [24]:
# Standardizing values by Standard Scaling

In [25]:
from pyspark.ml.feature import StandardScaler

In [26]:
sc = StandardScaler(inputCol = 'allfeatures',outputCol = 'Standard_Features')

In [27]:
# Building the Model

In [29]:
# Creating the Pipeline

In [30]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [31]:
lr = LogisticRegression(featuresCol = 'Standard_Features',labelCol ='Steroid_Indexed')

In [32]:
my_pipeline = Pipeline(stages = [Gender_Indexer,Gender_Encoder,Pulse_Level_Indexer,Pulse_Level_Encoder,Body_Fat_Level_Indexer,Body_Fat_Level_Encoder,Assembler,sc,lr])

In [33]:
# Splitting the dataset into training and test

In [34]:
train_dataset, test_dataset = final_data.randomSplit([0.7,0.3])

In [35]:
lr_model = my_pipeline.fit(train_dataset)

In [36]:
lr_result = lr_model.transform(test_dataset)

In [37]:
# Evaluating the Model

In [38]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [39]:
evaluation = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'Steroid_Indexed')

In [40]:
predictionAndTarget_lr = lr_model.transform(final_data).select('Steroid_Indexed', 'prediction')

In [41]:
print("Accuracy score using LogisticRegression: "+str(evaluation.evaluate(predictionAndTarget_lr,{evaluation.metricName: "accuracy"})))

Accuracy score using LogisticRegression: 0.9963503649635036


In [42]:
print("Accuracy in Percentage : "+ str(evaluation.evaluate(predictionAndTarget_lr,{evaluation.metricName: "accuracy"})*100) + str(' %'))

Accuracy in Percentage : 99.63503649635037 %


In [None]:
# The Accuracy of our model is about 99.6 % which is really good