# **1. Install and import necessary libraries** #

In [1]:
# Install pyspark
!pip install pyspark



In [2]:
# Import Spark Session
from pyspark.sql import SparkSession

# Import libraries for Logistic Regression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import  LogisticRegression

# Import libraries for evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Start (or reuse) the Spark session for this notebook, then display its version
session_builder = SparkSession.builder.appName("LogisticRegessionwithSpark")
spark = session_builder.getOrCreate()
spark.version

'3.5.0'

# **2. Read dataset into dataframe and Explore** #

In [4]:
# Connect to Google Drive by mounting it at /content/drive.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load the dry-beans CSV from the mounted Drive folder; the first row supplies
# the column names and Spark infers each column's type from the data.
drybeans_csv_path = "/content/drive/MyDrive/Machine_Learning_wtih_Spark/dataset/drybeans.csv"
df_drybeans = spark.read.csv(drybeans_csv_path, header=True, inferSchema=True)

In [6]:
# Print each column's name, inferred type and nullability
df_drybeans.printSchema()

root
 |-- Area: integer (nullable = true)
 |-- Perimeter: double (nullable = true)
 |-- MajorAxisLength: double (nullable = true)
 |-- MinorAxisLength: double (nullable = true)
 |-- AspectRation: double (nullable = true)
 |-- Eccentricity: double (nullable = true)
 |-- ConvexArea: integer (nullable = true)
 |-- EquivDiameter: double (nullable = true)
 |-- Extent: double (nullable = true)
 |-- Solidity: double (nullable = true)
 |-- roundness: double (nullable = true)
 |-- Compactness: double (nullable = true)
 |-- ShapeFactor1: double (nullable = true)
 |-- ShapeFactor2: double (nullable = true)
 |-- ShapeFactor3: double (nullable = true)
 |-- ShapeFactor4: double (nullable = true)
 |-- Class: string (nullable = true)



In [7]:
# Fetch the first 5 rows to the driver as a list of Row objects
df_drybeans.take(5)

[Row(Area=28395, Perimeter=610.291, MajorAxisLength=208.1781167, MinorAxisLength=173.888747, AspectRation=1.197191424, Eccentricity=0.549812187, ConvexArea=28715, EquivDiameter=190.1410973, Extent=0.763922518, Solidity=0.988855999, roundness=0.958027126, Compactness=0.913357755, ShapeFactor1=0.007331506, ShapeFactor2=0.003147289, ShapeFactor3=0.834222388, ShapeFactor4=0.998723889, Class='SEKER'),
 Row(Area=28734, Perimeter=638.018, MajorAxisLength=200.5247957, MinorAxisLength=182.7344194, AspectRation=1.097356461, Eccentricity=0.411785251, ConvexArea=29172, EquivDiameter=191.2727505, Extent=0.783968133, Solidity=0.984985603, roundness=0.887033637, Compactness=0.953860842, ShapeFactor1=0.006978659, ShapeFactor2=0.003563624, ShapeFactor3=0.909850506, ShapeFactor4=0.998430331, Class='SEKER'),
 Row(Area=29380, Perimeter=624.11, MajorAxisLength=212.8261299, MinorAxisLength=175.9311426, AspectRation=1.209712656, Eccentricity=0.562727317, ConvexArea=29690, EquivDiameter=193.4109041, Extent=0.

In [8]:
# Count the rows in each bean class and list the classes from rarest to most common
class_counts = df_drybeans.groupBy('Class').count()
class_counts.orderBy('count').show()

+--------+-----+
|   Class|count|
+--------+-----+
|  BOMBAY|  522|
|BARBUNYA| 1322|
|    CALI| 1630|
|   HOROZ| 1928|
|   SEKER| 2027|
|    SIRA| 2636|
|DERMASON| 3546|
+--------+-----+



In [9]:
# Encode the string "Class" column as a numeric "label" column.
# StringIndexer refuses to write to an output column that already exists, and
# this cell stores its result back into df_drybeans. Dropping any "label" left
# over from an earlier run keeps the cell safe to re-run; on a fresh kernel the
# drop is a no-op because the column is not there yet.
indexer = StringIndexer(inputCol="Class", outputCol="label")
df_unlabelled = df_drybeans.drop("label")
df_drybeans = indexer.fit(df_unlabelled).transform(df_unlabelled)

In [10]:
# Row counts per numeric label, rarest first. Label -> class mapping:
# 0: Dermason, 1: Sira, 2: Seker, 3: Horoz, 4: Cali, 5: Barbunya, 6: Bombay
label_counts = df_drybeans.groupby("label").count()
label_counts.orderBy("count").show()

+-----+-----+
|label|count|
+-----+-----+
|  6.0|  522|
|  5.0| 1322|
|  4.0| 1630|
|  3.0| 1928|
|  2.0| 2027|
|  1.0| 2636|
|  0.0| 3546|
+-----+-----+



# **3. Identify label column and feature columns** #

In [11]:
# Combine the selected numeric columns into a single "features" vector column
feature_columns = ["Area", "Perimeter", "Solidity", "roundness", "Compactness"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
beans_transformed_data = assembler.transform(df_drybeans)

In [12]:
# Preview the assembled feature vectors next to their numeric labels
features_and_label = beans_transformed_data.select("features", "label")
features_and_label.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[28395.0,610.291,...|  2.0|
|[28734.0,638.018,...|  2.0|
|[29380.0,624.11,0...|  2.0|
|[30008.0,645.884,...|  2.0|
|[30140.0,620.134,...|  2.0|
|[30279.0,634.927,...|  2.0|
|[30477.0,670.033,...|  2.0|
|[30519.0,629.727,...|  2.0|
|[30685.0,635.681,...|  2.0|
|[30834.0,631.934,...|  2.0|
|[30917.0,640.765,...|  2.0|
|[31091.0,638.558,...|  2.0|
|[31107.0,640.594,...|  2.0|
|[31158.0,642.626,...|  2.0|
|[31158.0,641.105,...|  2.0|
|[31178.0,636.888,...|  2.0|
|[31202.0,644.454,...|  2.0|
|[31203.0,639.782,...|  2.0|
|[31272.0,638.666,...|  2.0|
|[31335.0,635.011,...|  2.0|
+--------------------+-----+
only showing top 20 rows



# **4. Split data into Train and Test data** #

In [13]:
# Randomly split the rows roughly 70 % / 30 % into training and testing sets,
# using a fixed seed for the random sampling
split_weights = [0.7, 0.3]
training_data, testing_data = beans_transformed_data.randomSplit(split_weights, seed=42)

In [14]:
# Preview the first rows of the training split
training_data.show()

+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+
| Area|Perimeter|MajorAxisLength|MinorAxisLength|AspectRation|Eccentricity|ConvexArea|EquivDiameter|     Extent|   Solidity|  roundness|Compactness|ShapeFactor1|ShapeFactor2|ShapeFactor3|ShapeFactor4|   Class|label|            features|
+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+
|20420|  524.932|     183.601165|    141.8862155| 1.294002834| 0.634654708|     20684|  161.2437642|0.790186518|0.987236511|0.931235461|0.878228437| 0.008991242| 0.003299358| 0.771285188|  0.99804522|DERMASON|  0.0|[20420.0,524.932,...|
|20464|  528.408|     191.249312|    136.3684624| 1.

In [15]:
# Preview the first rows of the testing split
testing_data.show()

+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+
| Area|Perimeter|MajorAxisLength|MinorAxisLength|AspectRation|Eccentricity|ConvexArea|EquivDiameter|     Extent|   Solidity|  roundness|Compactness|ShapeFactor1|ShapeFactor2|ShapeFactor3|ShapeFactor4|   Class|label|            features|
+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+
|20548|  524.736|    183.9652515|    142.6723878| 1.289424354| 0.631298586|     20825|  161.7483421|0.759686483|0.986698679|0.937772948| 0.87923312| 0.008952952| 0.003300366|  0.77305088| 0.996790631|DERMASON|  0.0|[20548.0,524.736,...|
|21101|  533.701|    185.3819214|    146.0322578| 1.

# **5. Build and Train Logistic Regression Model** #

In [16]:
# Configure a logistic regression classifier that reads the "features" vector
# and learns to predict the numeric "label"
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Fit the classifier on the training split
model = lr.fit(training_data)

# **6. Evaluate model** #

In [17]:
# Score the test split with the fitted model; the result carries the extra
# rawPrediction, probability and prediction columns
predictions = model.transform(testing_data)

# Preview the scored rows
predictions.show()

+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+--------------------+--------------------+----------+
| Area|Perimeter|MajorAxisLength|MinorAxisLength|AspectRation|Eccentricity|ConvexArea|EquivDiameter|     Extent|   Solidity|  roundness|Compactness|ShapeFactor1|ShapeFactor2|ShapeFactor3|ShapeFactor4|   Class|label|            features|       rawPrediction|         probability|prediction|
+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+--------------------+--------------------+----------+
|20548|  524.736|    183.9652515|    142.6723878| 1.289424354| 0.631298586|     20825|  161.7483421|0.759686483|0.986698679|0.9377

In [18]:
# Accuracy of the predicted labels on the test split
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = ", accuracy)

Accuracy =  0.9140055318078953


In [19]:
# Weighted precision of the predicted labels on the test split
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    labelCol="label",
    predictionCol="prediction",
)
precision = evaluator.evaluate(predictions)
print("Precision = ", precision)

Precision =  0.914542510879638


In [20]:
# Weighted recall of the predicted labels on the test split
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol="label",
    predictionCol="prediction",
)
recall = evaluator.evaluate(predictions)
print("Recall =", recall)

Recall = 0.9140055318078953


In [21]:
# F1 score of the predicted labels on the test split
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1_score = evaluator.evaluate(predictions)
print("F1 score = ", f1_score)

F1 score =  0.9141223602932477


# **7. Stop Spark Session** #

In [22]:
# Stop the Spark session created at the start of the notebook
spark.stop()