In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook: a local master
# with 4 threads, named "PipelineOps".
# NOTE(review): with a local master there are no separate executor
# processes, so "spark.executor.memory" is presumably ignored here -- confirm.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("PipelineOps")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [2]:
# Read the raw CSV into a Spark DataFrame: header row enabled, column
# types inferred, comma as the field separator.
# NOTE(review): the path is an absolute, machine-specific Windows path;
# the notebook only runs where this file exists.
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("sep", ",")
    .csv("D:\\Datasets\\simple_data.csv")
)

In [3]:
from pyspark.sql.functions import *

In [4]:
# Add the two-class target column "ekonomik_durum": rows whose
# "aylik_gelir" is above 7000 get "iyi", every other row gets "kötü".
df1 = df.withColumn(
    "ekonomik_durum",
    when(col("aylik_gelir") > 7000, "iyi").otherwise("kötü"),
)

# Preview the first 5 rows. The limit is applied on the Spark side so that
# only those rows are collected to the driver, instead of converting the
# whole DataFrame to pandas and then discarding everything but the head.
df1.limit(5).toPandas()

Unnamed: 0,sirano,isim,yas,meslek,sehir,aylik_gelir,ekonomik_durum
0,1,Cemal,35,Isci,Ankara,3500,kötü
1,2,Ceyda,42,Memur,Kayseri,4200,kötü
2,3,Timur,30,Müzisyen,Istanbul,9000,iyi
3,4,Burcu,29,Pazarlamaci,Ankara,4200,kötü
4,5,Yasemin,23,Pazarlamaci,Bursa,4800,kötü


In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler, StandardScaler

In [19]:
# Index the categorical "meslek" column into the numeric "meslek_index".
# handleInvalid="skip": rows with a value not seen during fit are filtered
# out at transform time rather than raising an error.
meslek_indexer = StringIndexer(
    inputCol="meslek",
    outputCol="meslek_index",
    handleInvalid="skip",
)

In [20]:
# Index the categorical "sehir" column into the numeric "sehir_index".
# handleInvalid="skip": rows with a value not seen during fit are filtered
# out at transform time rather than raising an error.
sehir_indexer = StringIndexer(
    inputCol="sehir",
    outputCol="sehir_index",
    handleInvalid="skip",
)

In [8]:
# One-hot encode both index columns in a single estimator; input and output
# column lists are matched by position.
# NOTE(review): OneHotEncoderEstimator is the Spark 2.3/2.4 class name; in
# Spark 3.x the same estimator is exposed as OneHotEncoder -- confirm the
# Spark version this notebook targets.
encoder = OneHotEncoderEstimator(
    inputCols=["meslek_index", "sehir_index"],
    outputCols=["meslek_encoded", "sehir_encoded"],
)

In [9]:
# Combine the numeric columns and the one-hot vectors into a single
# feature vector column, "vectorized_features".
# NOTE(review): "aylik_gelir" is used as a feature, but the label column
# "ekonomik_durum" is computed directly from "aylik_gelir" > 7000, so the
# model can recover the label from this one input. Fine for a pipeline demo;
# reconsider before drawing conclusions from the predictions.
assembler = VectorAssembler(
    inputCols=["yas", "aylik_gelir", "meslek_encoded", "sehir_encoded"],
    outputCol="vectorized_features",
)

In [10]:
# Turn the string target "ekonomik_durum" into the numeric "label" column
# that the classifier is configured to read.
label_indexer = StringIndexer(
    inputCol="ekonomik_durum",
    outputCol="label",
)

In [11]:
# Scale the assembled vector; the result is written to "features", the
# column the classifier below is configured to read.
scaler = StandardScaler(
    inputCol="vectorized_features",
    outputCol="features",
)

In [13]:
from pyspark.ml.classification import LogisticRegression

In [14]:
# Logistic regression classifier wired to the columns produced by the
# earlier stages: reads "features" and "label", writes "prediction".
lr_object = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
)

In [12]:
# Random 80/20 train/test split of the labelled data; the fixed seed keeps
# the split repeatable across runs.
train_df, test_df = df1.randomSplit(
    weights=[0.8, 0.2],
    seed=142,
)

In [15]:
from pyspark.ml import Pipeline

In [21]:
# Chain every stage into one Pipeline. Order matters: each stage consumes
# columns produced by the stages listed before it.
pipeline_nesnesi = Pipeline(
    stages=[
        meslek_indexer,   # meslek -> meslek_index
        sehir_indexer,    # sehir -> sehir_index
        encoder,          # *_index -> *_encoded
        assembler,        # numeric + encoded columns -> vectorized_features
        label_indexer,    # ekonomik_durum -> label
        scaler,           # vectorized_features -> features
        lr_object,        # features + label -> prediction
    ]
)

In [22]:
# Fit all pipeline stages on the training split only; the resulting
# PipelineModel is what gets applied to the test split afterwards.
pipeline_model = pipeline_nesnesi.fit(dataset=train_df)

In [23]:
# Score the held-out split and preview true vs. predicted labels. The limit
# is applied on the Spark side so that at most 5 rows are collected to the
# driver, instead of converting every scored row to pandas and then taking
# the head.
(
    pipeline_model
    .transform(test_df)
    .select("label", "prediction")
    .limit(5)
    .toPandas()
)

Unnamed: 0,label,prediction
0,1.0,1.0
1,0.0,0.0
2,1.0,1.0
