In [23]:
import os

import findspark

# findspark.init() makes the Spark installation's pyspark importable, so it
# has to run before the pyspark import below.
# Prefer the SPARK_HOME environment variable so the notebook is not tied to one
# machine; fall back to the original local install path when it is unset/empty.
findspark.init(os.environ.get("SPARK_HOME") or "D:/spark")

from pyspark.sql import SparkSession

In [24]:
# Build (or reuse, via getOrCreate) the notebook's SparkSession on a
# local[4] master with explicit driver/executor memory settings.
# NOTE(review): with a local[...] master the work presumably runs inside the
# driver JVM, so "spark.executor.memory" may have no practical effect — confirm.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Classification with iris")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [25]:
# Load the iris CSV into a Spark DataFrame: the first row is treated as the
# header, column types are inferred, and fields are comma-separated.
# NOTE(review): the path is an absolute, machine-specific location.
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True, sep=",")
    .load("D:/Datasets/iris.csv")
)

In [26]:
# Preview the first five rows as a pandas frame for rich notebook rendering.
# limit(5) already caps the result at five rows, so no extra head() is needed.
df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [27]:
# Summary statistics for every column, rendered through pandas.
# describe() yields five summary rows (count, mean, stddev, min, max), so the
# whole result is shown without a trailing head().
df.describe().toPandas()

Unnamed: 0,summary,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,count,150.0,150.0,150.0,150.0,150
1,mean,5.843333333333335,3.0540000000000007,3.758666666666669,1.1986666666666672,
2,stddev,0.8280661279778637,0.4335943113621737,1.764420419952262,0.7631607417008414,
3,min,4.3,2.0,1.0,0.1,Iris-setosa
4,max,7.9,4.4,6.9,2.5,Iris-virginica


In [28]:
import pyspark.sql.functions as f

# Number of rows per species; the count column is named "sayi"
# (Turkish for "count").
species_counts = df.groupBy("Species").agg(f.count("*").alias("sayi"))
species_counts.show()

+---------------+----+
|        Species|sayi|
+---------------+----+
| Iris-virginica|  50|
|    Iris-setosa|  50|
|Iris-versicolor|  50|
+---------------+----+



# labelIndexer Aşaması

In [29]:
from pyspark.ml.feature import StringIndexer

# Encode the string "Species" column as a numeric "label" column.
# handleInvalid="skip" asks the indexer to skip invalid rows instead of failing.
output_indexer = StringIndexer(
    inputCol="Species",
    outputCol="label",
    handleInvalid="skip",
)

# Fit on the loaded frame and transform that same frame; `df` itself is not
# reassigned, the indexed result lives in `label_df`.
label_df = output_indexer.fit(df).transform(df)

In [30]:
# Preview the indexed frame; limit(5) already bounds the result to five rows.
label_df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label
0,5.1,3.5,1.4,0.2,Iris-setosa,0.0
1,4.9,3.0,1.4,0.2,Iris-setosa,0.0
2,4.7,3.2,1.3,0.2,Iris-setosa,0.0
3,4.6,3.1,1.5,0.2,Iris-setosa,0.0
4,5.0,3.6,1.4,0.2,Iris-setosa,0.0


# VectorAssembler Aşaması

In [31]:
# List the column names of the originally loaded frame (`df`, which does not
# carry the "label" column added to `label_df`).
df.columns

['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

In [32]:
from pyspark.ml.feature import VectorAssembler

# The four numeric measurement columns that are packed into one vector column.
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Apply the assembler to the label-indexed frame; the result keeps all existing
# columns and gains "features".
assembler_df = assembler.transform(label_df)

In [33]:
# Preview the assembled frame (including the new "features" column);
# limit(5) already bounds the result to five rows.
assembler_df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label,features
0,5.1,3.5,1.4,0.2,Iris-setosa,0.0,"[5.1, 3.5, 1.4, 0.2]"
1,4.9,3.0,1.4,0.2,Iris-setosa,0.0,"[4.9, 3.0, 1.4, 0.2]"
2,4.7,3.2,1.3,0.2,Iris-setosa,0.0,"[4.7, 3.2, 1.3, 0.2]"
3,4.6,3.1,1.5,0.2,Iris-setosa,0.0,"[4.6, 3.1, 1.5, 0.2]"
4,5.0,3.6,1.4,0.2,Iris-setosa,0.0,"[5.0, 3.6, 1.4, 0.2]"


# Veriyi eğitim ve test olarak bölme

In [34]:
# Random 80/20 split into training and test frames; the fixed seed keeps the
# split repeatable across runs.
train_df, test_df = assembler_df.randomSplit(weights=[0.8, 0.2], seed=142)

# Model Oluşturma

In [35]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression estimator reading the assembled "features" vector and the
# indexed "label" column; all other hyperparameters are left at their defaults.
logreg_obj = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)

# Fit on the training split only.
logreg_model = logreg_obj.fit(train_df)

In [36]:
# Score the held-out test split with the fitted model; the resulting frame is
# the one later cells read "prediction" and "probability" from.
test_result_df = logreg_model.transform(test_df)

In [37]:
# Show true label, predicted label and class probabilities for five test rows.
# limit(5) already bounds the result, so the pandas frame is displayed as-is.
(
    test_result_df
    .select("label", "prediction", "probability")
    .limit(5)
    .toPandas()
)

Unnamed: 0,label,prediction,probability
0,0.0,0.0,"[1.0, 0.0, 0.0]"
1,0.0,0.0,"[1.0, 0.0, 0.0]"
2,0.0,0.0,"[1.0, 0.0, 0.0]"
3,0.0,0.0,"[1.0, 0.0, 0.0]"
4,1.0,1.0,"[0.0, 1.0, 0.0]"


# Model Değerlendirme

In [41]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy on the held-out test predictions. The label/prediction column names
# are spelled out here; they are the same names the evaluator uses by default.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(test_result_df)
print(accuracy)

0.9393939393939394
