In [None]:
import pandas as pd

In [None]:
!pip install pyspark
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import  StringIndexer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pyspark.sql import SparkSession

In [None]:
# Read the credit dataset from CSV into a pandas DataFrame, then preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="kredit.csv")
df.head(n=5)

Unnamed: 0,OCCUPATION,SALARY,INSTALLMENT,TENOR,USIA,MERK,STATUS
0,O3,1790400,440000,35,46,KAWASAKI,LUNAS
1,O3,1650000,506000,35,51,HONDA,TARIKAN
2,O3,1942000,435000,36,44,HONDA,LUNAS
3,O2,1104000,364000,36,32,HONDA,LUNAS
4,O2,773452,175000,12,31,HONDA,LUNAS


In [None]:
# Get (or create) a SparkSession whose master URL is "local[*]",
# i.e. Spark runs locally instead of on a cluster.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Convert the pandas DataFrame into a Spark DataFrame and print its first
# five rows.
kredit_df = spark.createDataFrame(df)
kredit_df.show(n=5)

+----------+-------+------------+------+-----+--------+-------+
|OCCUPATION| SALARY| INSTALLMENT| TENOR| USIA|    MERK| STATUS|
+----------+-------+------------+------+-----+--------+-------+
|        O3|1790400|      440000|    35|   46|KAWASAKI|  LUNAS|
|        O3|1650000|      506000|    35|   51|   HONDA|TARIKAN|
|        O3|1942000|      435000|    36|   44|   HONDA|  LUNAS|
|        O2|1104000|      364000|    36|   32|   HONDA|  LUNAS|
|        O2| 773452|      175000|    12|   31|   HONDA|  LUNAS|
+----------+-------+------------+------+-----+--------+-------+
only showing top 5 rows



In [None]:
# (input column, output column) pairs, one per StringIndexer stage.
# The leading spaces in the input names are deliberate: they match the column
# headers exactly as printed by kredit_df.show().
# NOTE(review): SALARY, INSTALLMENT, TENOR and USIA look numeric in the preview;
# StringIndexer treats every distinct value as its own category — confirm that
# this is intended.
index_columns = [
    ("OCCUPATION", "OCCUPATION_index"),
    (" SALARY", "SALARY_index"),
    (" INSTALLMENT", "INSTALLMENT_index"),
    (" TENOR", "TENOR_index"),
    (" USIA", "USIA_index"),
    (" MERK", "MERK_index"),
    (" STATUS", "label"),  # the indexed STATUS column is named "label"
]
indexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in index_columns
]

In [None]:
# Chain every indexer into a single Pipeline, fit it on the full dataset,
# then apply the fitted pipeline to append the index columns.
pipeline = Pipeline(stages=indexers)
pipeline_model = pipeline.fit(kredit_df)
indexed_kredit_df = pipeline_model.transform(kredit_df)

In [None]:
# Print five rows of the indexed data without truncating cell contents.
indexed_kredit_df.show(n=5, truncate=False)

+----------+-------+------------+------+-----+--------+-------+----------------+------------+-----------------+-----------+----------+----------+-----+
|OCCUPATION| SALARY| INSTALLMENT| TENOR| USIA| MERK   | STATUS|OCCUPATION_index|SALARY_index|INSTALLMENT_index|TENOR_index|USIA_index|MERK_index|label|
+----------+-------+------------+------+-----+--------+-------+----------------+------------+-----------------+-----------+----------+----------+-----+
|O3        |1790400|440000      |35    |46   |KAWASAKI|LUNAS  |0.0             |1090.0      |10.0             |1.0        |16.0      |1.0       |0.0  |
|O3        |1650000|506000      |35    |51   |HONDA   |TARIKAN|0.0             |79.0        |127.0            |1.0        |27.0      |2.0       |1.0  |
|O3        |1942000|435000      |36    |44   |HONDA   |LUNAS  |0.0             |1172.0      |35.0             |0.0        |13.0      |2.0       |0.0  |
|O2        |1104000|364000      |36    |32   |HONDA   |LUNAS  |1.0             |94.0    

In [None]:
# Pack the two indexed categorical columns into a single "features" vector
# column, which is the input the classifier consumes.
feature_columns = ["OCCUPATION_index", "MERK_index"]
vectorAssembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)
vindexed_kredit_df = vectorAssembler.transform(indexed_kredit_df)

In [None]:
# Print five rows, including the assembled "features" column, without
# truncating cell contents.
vindexed_kredit_df.show(n=5, truncate=False)

+----------+-------+------------+------+-----+--------+-------+----------------+------------+-----------------+-----------+----------+----------+-----+---------+
|OCCUPATION| SALARY| INSTALLMENT| TENOR| USIA| MERK   | STATUS|OCCUPATION_index|SALARY_index|INSTALLMENT_index|TENOR_index|USIA_index|MERK_index|label|features |
+----------+-------+------------+------+-----+--------+-------+----------------+------------+-----------------+-----------+----------+----------+-----+---------+
|O3        |1790400|440000      |35    |46   |KAWASAKI|LUNAS  |0.0             |1090.0      |10.0             |1.0        |16.0      |1.0       |0.0  |[0.0,1.0]|
|O3        |1650000|506000      |35    |51   |HONDA   |TARIKAN|0.0             |79.0        |127.0            |1.0        |27.0      |2.0       |1.0  |[0.0,2.0]|
|O3        |1942000|435000      |36    |44   |HONDA   |LUNAS  |0.0             |1172.0      |35.0             |0.0        |13.0      |2.0       |0.0  |[0.0,2.0]|
|O2        |1104000|364000  

# Naive Bayes Classification


In [None]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Randomly split the rows with weights 0.6 (train) and 0.4 (test);
# 42 is the seed used for sampling.
splits = vindexed_kredit_df.randomSplit([0.6, 0.4], seed=42)
train_df, test_df = splits

In [None]:
# Configure a multinomial Naive Bayes estimator. The feature and label column
# names are spelled out to match the columns built earlier in the notebook.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
)

In [None]:
# Fit the Naive Bayes estimator on the training split to obtain the model.
nbmodel = nb.fit(dataset=train_df)

In [None]:
# Apply the trained model to the test split and print five of the
# resulting prediction rows.
predictions_df = nbmodel.transform(test_df)
predictions_df.show(n=5)

+----------+-------+------------+------+-----+--------+-------+----------------+------------+-----------------+-----------+----------+----------+-----+---------+--------------------+--------------------+----------+
|OCCUPATION| SALARY| INSTALLMENT| TENOR| USIA|    MERK| STATUS|OCCUPATION_index|SALARY_index|INSTALLMENT_index|TENOR_index|USIA_index|MERK_index|label| features|       rawPrediction|         probability|prediction|
+----------+-------+------------+------+-----+--------+-------+----------------+------------+-----------------+-----------+----------+----------+-----+---------+--------------------+--------------------+----------+
|        O1| 774700|      459000|    35|   41|KAWASAKI|  LUNAS|             2.0|      1486.0|             47.0|        1.0|       6.0|       1.0|  0.0|[2.0,1.0]|[-2.0072585577028...|[0.98087875099963...|       0.0|
|        O1| 840400|      502000|    23|   58|  SUZUKI|TARIKAN|             2.0|      1501.0|             26.0|        4.0|      30.0|      

In [None]:
# Score the test-set predictions: accuracy compares the "prediction" column
# against the "label" column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
nbaccuracy = evaluator.evaluate(predictions_df)
print(f"Test set accuracy = {nbaccuracy}")

Test set accuracy = 0.9405684754521964
