In [2]:
import findspark
# findspark.init() puts the local Spark installation's pyspark package on
# sys.path; keep this cell ahead of the pyspark imports in the cells below.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Build (or reuse) a SparkSession that runs locally on 4 worker threads.
# The executor memory key must be spelled "spark.executor.memory": Spark accepts
# unknown "spark.*" keys without raising, so a misspelled key is silently ignored
# and the 4g value would never be applied.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("ClassificationWihthIris")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Veri Setini Okuma

In [5]:
# Load the Iris CSV into a Spark DataFrame: the first row is the header,
# fields are comma-separated, and column types are inferred from the data.
iris_csv_path = "/home/taha/Downloads/iris.csv"
csv_options = {"header": True, "sep": ",", "inferSchema": True}
df = spark.read.format("csv").options(**csv_options).load(iris_csv_path)

In [7]:
# Preview the first five rows. limit(5) is applied on the Spark side so only
# those rows are converted to pandas, rather than collecting the entire
# DataFrame to the driver just to display its head.
df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Veri Kesfi

In [9]:
# Summary statistics of every column, converted to pandas for a readable table.
summary_pdf = df.describe().toPandas()
summary_pdf.head()

Unnamed: 0,summary,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,count,150.0,150.0,150.0,150.0,150
1,mean,5.843333333333335,3.0540000000000007,3.758666666666669,1.1986666666666672,
2,stddev,0.8280661279778637,0.4335943113621737,1.764420419952262,0.7631607417008414,
3,min,4.3,2.0,1.0,0.1,Iris-setosa
4,max,7.9,4.4,6.9,2.5,Iris-virginica


In [17]:
import pyspark.sql.functions as f

In [18]:
# Count the rows of each species; the count column is named "sayi".
species_counts = df.groupBy("Species").agg(f.count("*").alias("sayi"))
species_counts.show()

+---------------+----+
|        Species|sayi|
+---------------+----+
| Iris-virginica|  50|
|    Iris-setosa|  50|
|Iris-versicolor|  50|
+---------------+----+



In [19]:
# Print the column names, their types (inferred while reading the CSV) and nullability.
df.printSchema()

root
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



# Veri Temizligi Ve On Hazirligi

In [27]:
# The dataset has no categorical feature columns, so StringIndexer and
# OneHotEncoder are not needed for the features.
# StringIndexer is still used below, but only for the target variable.

# StringIndexer

In [22]:
from pyspark.ml.feature import StringIndexer

In [23]:
# Encode the string target column "Species" as a numeric "label" column.
# handleInvalid="skip" makes the fitted indexer drop rows it cannot encode.
indexer = StringIndexer(
    inputCol="Species",
    outputCol="label",
    handleInvalid="skip",
)

In [24]:
# Fit the indexer on df, then use the fitted model to add the "label" column.
indexer_model = indexer.fit(df)
indexerDF = indexer_model.transform(df)

In [25]:
# Preview the first five indexed rows. limit(5) runs in Spark so only those
# rows are converted to pandas instead of the whole DataFrame.
indexerDF.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label
0,5.1,3.5,1.4,0.2,Iris-setosa,0.0
1,4.9,3.0,1.4,0.2,Iris-setosa,0.0
2,4.7,3.2,1.3,0.2,Iris-setosa,0.0
3,4.6,3.1,1.5,0.2,Iris-setosa,0.0
4,5.0,3.6,1.4,0.2,Iris-setosa,0.0


# VectorAssembler

In [28]:
from pyspark.ml.feature import VectorAssembler

In [29]:
# Combine the four numeric measurement columns into a single "features" vector.
feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [30]:
# Add the "features" vector column to the label-indexed data.
assembler_df = assembler.transform(indexerDF)

In [31]:
# Preview the first five assembled rows. limit(5) runs in Spark so only those
# rows are converted to pandas instead of the whole DataFrame.
assembler_df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label,features
0,5.1,3.5,1.4,0.2,Iris-setosa,0.0,"[5.1, 3.5, 1.4, 0.2]"
1,4.9,3.0,1.4,0.2,Iris-setosa,0.0,"[4.9, 3.0, 1.4, 0.2]"
2,4.7,3.2,1.3,0.2,Iris-setosa,0.0,"[4.7, 3.2, 1.3, 0.2]"
3,4.6,3.1,1.5,0.2,Iris-setosa,0.0,"[4.6, 3.1, 1.5, 0.2]"
4,5.0,3.6,1.4,0.2,Iris-setosa,0.0,"[5.0, 3.6, 1.4, 0.2]"


In [32]:
# The features are all on the same scale, so standardization is not needed.

# Veri Setini Bolme

In [33]:
# Randomly split the rows into roughly 80% training / 20% test data;
# a fixed seed (142) is passed so the split is intended to be repeatable.
train_df , test_df = assembler_df.randomSplit([0.8,0.2],seed=142)

# Model Olusturma

In [34]:
from pyspark.ml.classification import LogisticRegression

In [36]:
# Create the classifier object: a logistic regression that reads the
# "features" vector column and the "label" target column.
# NOTE(review): the rawPrediction values shown in the transformed output are
# very large (~1e4), which suggests unregularized coefficients on separable
# data; consider setting regParam. TODO confirm.
logreg_obj = LogisticRegression(labelCol="label", featuresCol="features")

In [37]:
# Train the model on the training split.
logreg_model = logreg_obj.fit(train_df)

In [38]:
# Score the held-out test split with the trained model; the output shown below
# has rawPrediction, probability and prediction columns added.
transformed_df = logreg_model.transform(test_df)

In [39]:
# Preview the first five scored rows. limit(5) runs in Spark so only those
# rows are converted to pandas instead of the whole DataFrame.
transformed_df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label,features,rawPrediction,probability,prediction
0,4.4,3.0,1.3,0.2,Iris-setosa,0.0,"[4.4, 3.0, 1.3, 0.2]","[14604.377893842138, -2761.3173438072095, -118...","[1.0, 0.0, 0.0]",0.0
1,4.6,3.1,1.5,0.2,Iris-setosa,0.0,"[4.6, 3.1, 1.5, 0.2]","[14632.728901273445, -2839.341524260714, -1179...","[1.0, 0.0, 0.0]",0.0
2,4.6,3.6,1.0,0.2,Iris-setosa,0.0,"[4.6, 3.6, 1.0, 0.2]","[20348.465372786737, -5208.036002792378, -1514...","[1.0, 0.0, 0.0]",0.0
3,4.7,3.2,1.3,0.2,Iris-setosa,0.0,"[4.7, 3.2, 1.3, 0.2]","[15819.80674976183, -3254.5791800061547, -1256...","[1.0, 0.0, 0.0]",0.0
4,4.9,2.4,3.3,1.0,Iris-versicolor,1.0,"[4.9, 2.4, 3.3, 1.0]","[-1046.3786273186568, 2375.2529407912994, -132...","[0.0, 1.0, 0.0]",1.0


# MODEL DEGERLENDİRME

In [40]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [46]:
# Evaluator that compares the "prediction" column against "label" and
# reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [47]:
# Accuracy of the model on the scored test split.
# NOTE(review): the variable name is a misspelling of "accuracy"; it is kept
# because the following cell refers to it by this name.
accuary = evaluator.evaluate(transformed_df)

In [48]:
# Display the test-set accuracy computed in the previous cell.
accuary

0.9393939393939394

In [49]:
# About 94% of the rows in test_df were predicted correctly (accuracy = 0.939).