## Import Spark SQL, library machine learning, membuat session

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.regression import LinearRegression 
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover 

# membuat session 
appName = "Automated Scoring System menggunakan PySpark"
spark = SparkSession\
.builder\
.appName(appName)\
.config("spark.some.config.option","some-value").getOrCreate()

## Memuat data file 

In [7]:
# membaca data dari file ke DataFrame dengan skema diinfer
data_skor = spark.read.option ("delimiter", ";").csv ('training_data_essay.csv', header = True)
data_skor.show (truncate = True)

+----------+------------+--------------------+----+-------------+
|       npm|nama_peserta|             jawaban|soal|skor_per_soal|
+----------+------------+--------------------+----+-------------+
|         0|       Admin|Tidak, Hanya memb...|   1|          100|
|         0|       Admin|Biaya dihitung be...|   2|          100|
|         0|       Admin|Hak cipta adalah ...|   3|          100|
|         0|       Admin|Dijelaskan kepada...|   4|          100|
|         0|       Admin|1. Melindungi dan...|   5|          100|
|         0|       Admin|Ruang Komputer, P...|   6|          100|
|         0|       Admin|Aturlah posisi pe...|   7|          100|
|         0|       Admin|Posisi Kepala dan...|   8|          100|
|         0|       Admin|1. Kecocokan soft...|   9|          100|
|         0|       Admin|1. Fokus dan expo...|  10|          100|
|         0|       Admin|1. Peralatan yang...|  11|          100|
|         0|       Admin|1. Dibuat grafik ...|  12|          100|
|112102003

## Menyiapkan data 

In [5]:
data_types = data_skor.dtypes

# Print the data types
for col_name, data_type in data_types:
  print(f"Column: {col_name}, Data Type: {data_type}")

Column: npm, Data Type: string
Column: nama_peserta, Data Type: string
Column: jawaban, Data Type: string
Column: soal, Data Type: string
Column: skor_per_soal, Data Type: string


In [10]:
from pyspark.sql.functions import col, regexp_replace

# Assuming "data_skor" is your DataFrame and "skor_per_soal" is the column
data = data_skor.select(
    "soal",
    "jawaban",
    regexp_replace(col("skor_per_soal"), r",", ".").alias("label")
)

In [11]:
data.show ()

+----+--------------------+-----+
|soal|             jawaban|label|
+----+--------------------+-----+
|   1|Tidak, Hanya memb...|  100|
|   2|Biaya dihitung be...|  100|
|   3|Hak cipta adalah ...|  100|
|   4|Dijelaskan kepada...|  100|
|   5|1. Melindungi dan...|  100|
|   6|Ruang Komputer, P...|  100|
|   7|Aturlah posisi pe...|  100|
|   8|Posisi Kepala dan...|  100|
|   9|1. Kecocokan soft...|  100|
|  10|1. Fokus dan expo...|  100|
|  11|1. Peralatan yang...|  100|
|  12|1. Dibuat grafik ...|  100|
|   1|tidak, cuma mengi...| 52.7|
|   2|biaya dihitung be...|42.86|
|   3|hak membuat merup...|42.16|
|   4|dipaparkan pada k...|27.19|
|   5|1. mencegah serta...|44.14|
|   6|ruang komputer, p...|  100|
|   7|aturlah posisi fi...|57.68|
|   8|posisi kepala ser...|45.71|
+----+--------------------+-----+
only showing top 20 rows



In [15]:
data_types1 = data.dtypes

# Print the data types
for col_name, data_type1 in data_types1:
    print(f"Column: {col_name}, Data Type: {data_type1}")

Column: soal, Data Type: string
Column: jawaban, Data Type: string
Column: label, Data Type: string


In [56]:
# memilih data dari kolom "jawaban" dan kolom "skor_per_soal"
# kemudian meng-casting nilai di kolom "skor_per_soal" ke tipe integer dan mengganti nama menjadi "label"
data1 = data.select( col('soal').cast("int"), "jawaban",col("label").cast("float").alias ("label"))
data1.show (truncate= True)

+----+--------------------+-----+
|soal|             jawaban|label|
+----+--------------------+-----+
|   1|Tidak, Hanya memb...|100.0|
|   2|Biaya dihitung be...|100.0|
|   3|Hak cipta adalah ...|100.0|
|   4|Dijelaskan kepada...|100.0|
|   5|1. Melindungi dan...|100.0|
|   6|Ruang Komputer, P...|100.0|
|   7|Aturlah posisi pe...|100.0|
|   8|Posisi Kepala dan...|100.0|
|   9|1. Kecocokan soft...|100.0|
|  10|1. Fokus dan expo...|100.0|
|  11|1. Peralatan yang...|100.0|
|  12|1. Dibuat grafik ...|100.0|
|   1|tidak, cuma mengi...| 52.7|
|   2|biaya dihitung be...|42.86|
|   3|hak membuat merup...|42.16|
|   4|dipaparkan pada k...|27.19|
|   5|1. mencegah serta...|44.14|
|   6|ruang komputer, p...|100.0|
|   7|aturlah posisi fi...|57.68|
|   8|posisi kepala ser...|45.71|
+----+--------------------+-----+
only showing top 20 rows



## Memisahkan data training dan testing

In [57]:
dataTerpisah = data1.randomSplit ([0.7, 0.3])
train = dataTerpisah [0]

# pada data testing, rename label dari "Label" ke "trueLabel"
test = dataTerpisah [1]. withColumnRenamed ("label", "trueLabel")
train_rows = train.count()
test_rows = test.count()
print (f"Jumlah baris data training :{train_rows} \n Jumlah baris data testing : {test_rows}")

Jumlah baris data training :81 
 Jumlah baris data testing : 39


## Menyiapkan data testing

In [58]:
tokenizer = Tokenizer (inputCol = "jawaban", outputCol = "jawabanWord")
tokenizerTrain = tokenizer.transform (train)
tokenizerTrain.show (truncate = True, n= 5)

+----+--------------------+-----+--------------------+
|soal|             jawaban|label|         jawabanWord|
+----+--------------------+-----+--------------------+
|   1|Tidak, Hanya memb...|100.0|[tidak,, hanya, m...|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|
+----+--------------------+-----+--------------------+
only showing top 5 rows



In [59]:
import nltk 
nltk.download ('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vannesa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [60]:
from pyspark import SparkContext 
from nltk.corpus import stopwords

sc = SparkContext.getOrCreate()

Stopwords = stopwords.words ('indonesian')

swr = StopWordsRemover(inputCol = tokenizer.getOutputCol(), outputCol = "MeaningfulWords", stopWords = Stopwords)

SwRemovedTrain = swr.transform (tokenizerTrain)
SwRemovedTrain.show (truncate = True, n=5)

+----+--------------------+-----+--------------------+--------------------+
|soal|             jawaban|label|         jawabanWord|     MeaningfulWords|
+----+--------------------+-----+--------------------+--------------------+
|   1|Tidak, Hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...|
+----+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [69]:
# ALS
hashedData= SwRemovedTrain.withColumn ("hashedValue", hash ("MeaningfulWords"))
hashedData.show (n = 5)
# LinearRegression
hashTF = HashingTF (inputCol = swr.getInputCol(), outputCol = "features")
numericTrain = hashTF.transform (SwRemovedTrain).select ('soal','label', 'MeaningfulWords','features')
numericTrain.show (truncate = True, n= 3)

+----+--------------------+-----+--------------------+--------------------+-----------+
|soal|             jawaban|label|         jawabanWord|     MeaningfulWords|hashedValue|
+----+--------------------+-----+--------------------+--------------------+-----------+
|   1|Tidak, Hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238|
|   1|tidak, hanya memb...|100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238|
+----+--------------------+-----+--------------------+--------------------+-----------+
only showing top 5 rows

+----+-----+--------------------+--------------------+
|soal|label|     MeaningfulWords|            features|
+----+-----+--------------------+--------------------+
|   1|100.0|[tidak

In [70]:
numericTrain.dtypes

[('soal', 'int'),
 ('label', 'float'),
 ('MeaningfulWords', 'array<string>'),
 ('features', 'vector')]

In [71]:
numericTrain.dropna()

DataFrame[soal: int, label: float, MeaningfulWords: array<string>, features: vector]

In [72]:
numericTrain.show()

+----+-----+--------------------+--------------------+
|soal|label|     MeaningfulWords|            features|
+----+-----+--------------------+--------------------+
|   1|100.0|[tidak,, membutuh...|(262144,[22138,79...|
|   1|100.0|[tidak,, membutuh...|(262144,[22138,79...|
|   1|100.0|[tidak,, membutuh...|(262144,[22138,79...|
|   1|100.0|[tidak,, membutuh...|(262144,[22138,79...|
|   1|100.0|[tidak,, membutuh...|(262144,[22138,79...|
|   1|100.0|[tidak,, membutuh...|(262144,[22138,79...|
|   1|100.0|[tidak,, membutuh...|(262144,[22138,79...|
|   2|84.52|[biaya, dihitung,...|(262144,[18111,56...|
|   2|84.52|[biaya, dihitung,...|(262144,[18111,56...|
|   2|100.0|[biaya, dihitung,...|(262144,[18111,56...|
|   2|100.0|[biaya, dihitung,...|(262144,[18111,56...|
|   2|100.0|[biaya, dihitung,...|(262144,[18111,56...|
|   2|100.0|[biaya, dihitung,...|(262144,[18111,56...|
|   2|42.86|[biaya, dihitung,...|(262144,[23999,26...|
|   2|26.73|[perhitungan, biaya]|(262144,[155651,2...|
|   3|100.

## Mentraining model dengan data training

In [65]:
# linear regression 
from pyspark.ml.regression import LinearRegression 

lr = LinearRegression (labelCol = "label", featuresCol = "features", maxIter = 10, regParam = 0.01)
model = lr.fit (numericTrain)
print ('Training selesai')

Training selesai


In [73]:
# ALS 
als = ALS (maxIter = 10, regParam = 0.01, userCol = "soal", itemCol = "hashedValue", ratingCol = "label")
model_ALS = als.fit (hashedData)
print ("Training selesai ")

Training selesai 


## Menyiapkan data testing

In [107]:
# Linear Regression 
tokenizedTest = tokenizer.transform (test)
SwRemovedTest = swr.transform (tokenizedTest)
numericTest = hashTF.transform (SwRemovedTest).select ('trueLabel', 'MeaningfulWords','features')
numericTest.show (n= 2)

# ALS 
hashedDataTest= SwRemovedTest.withColumn ("hashedValue", hash ("MeaningfulWords"))
hashedDataTest.show (n = 5)

+---------+--------------------+--------------------+
|trueLabel|     MeaningfulWords|            features|
+---------+--------------------+--------------------+
|     52.7|[tidak,, aplikasi...|(262144,[110797,1...|
|    100.0|[tidak,, membutuh...|(262144,[22138,79...|
+---------+--------------------+--------------------+
only showing top 2 rows

+----+--------------------+---------+--------------------+--------------------+-----------+
|soal|             jawaban|trueLabel|         jawabanWord|     MeaningfulWords|hashedValue|
+----+--------------------+---------+--------------------+--------------------+-----------+
|   1|tidak, cuma mengi...|     52.7|[tidak,, cuma, me...|[tidak,, aplikasi...|-1004957661|
|   1|tidak, hanya memb...|    100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238|
|   1|tidak, hanya memb...|    100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238|
|   2|Biaya dihitung be...|    100.0|[biaya, dihitung,...|[biaya, dihitung,...|  868079823|
|   2|b

## Memprediksi dan menghitung akurasi model menggunakan algoritma Linear Regression dan ALS

In [108]:
# Linear Regression
prediksiMentah = model.transform (numericTest)
prediksiFinal = prediksiMentah.select ('MeaningfulWords', 'prediction', 'truelabel')
prediksiFinal.show (truncate = True, n=40)
prediksiBenar = prediksiFinal.filter (prediksiFinal ['prediction'] == prediksiFinal ['trueLabel']).count ()
totalData = prediksiFinal.count()


+--------------------+------------------+---------+
|     MeaningfulWords|        prediction|truelabel|
+--------------------+------------------+---------+
|[tidak,, aplikasi...|  79.8792188552439|     52.7|
|[tidak,, membutuh...| 98.57262375514925|    100.0|
|[tidak,, membutuh...| 98.57262375514925|    100.0|
|[biaya, dihitung,...|100.13572564312118|    100.0|
|[biaya, dihitung,...|100.13572564312118|    100.0|
|[emperbanyak, cip...| 82.99596695988859|    47.67|
|[hak, cipta, hak,...| 90.66035046080486|    91.71|
|[hak, cipta, hak,...| 90.66035046080486|    91.71|
|[hak, cipta, hak,...| 90.66035046080486|    91.71|
|[hak, cipta, hak,...| 90.66035046080486|    91.71|
|[klien, huruf, me...| 73.07465920532428|    57.94|
|[melindungi, menj...|  74.0765300988127|    74.73|
|[melindungi, menj...|  74.0765300988127|    74.73|
|[melindungi, menj...|  74.0765300988127|    74.73|
|[melindungi, menj...| 74.48630575113266|    74.73|
|[melindungi, menj...| 84.66502828443492|    90.27|
|[ruang, kom

In [78]:
from pyspark.ml.evaluation import RegressionEvaluator 

evaluator = RegressionEvaluator (labelCol = "truelabel", predictionCol = "prediction", metricName = 'rmse')
rmse = evaluator.evaluate (prediksiFinal)
print (f"Nilai RMSE (Root Mean Square Error) adalah {rmse}")

Nilai RMSE (Root Mean Square Error) adalah 11.593759948657606


In [79]:
# ALS 
test_predict_ALS = model_ALS.transform(hashedDataTest)
test_predict_ALS.show ()

+----+--------------------+---------+--------------------+--------------------+-----------+----------+
|soal|             jawaban|trueLabel|         jawabanWord|     MeaningfulWords|hashedValue|prediction|
+----+--------------------+---------+--------------------+--------------------+-----------+----------+
|   1|tidak, cuma mengi...|     52.7|[tidak,, cuma, me...|[tidak,, aplikasi...|-1004957661|       NaN|
|   1|tidak, hanya memb...|    100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238| 99.999886|
|   1|tidak, hanya memb...|    100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238| 99.999886|
|   6|Ruang Komputer, P...|    100.0|[ruang, komputer,...|[ruang, komputer,...| 2137504343|  99.99986|
|   6|posisi tubuh, pos...|    90.37|[posisi, tubuh,, ...|[posisi, tubuh,, ...|   69233312|       NaN|
|   6|ruang komputer, p...|    100.0|[ruang, komputer,...|[ruang, komputer,...| 2137504343|  99.99986|
|   6|ruang komputer, p...|    100.0|[ruang, komputer,...|[ruang, kompute

In [82]:
evaluator_ALS = RegressionEvaluator (labelCol = "trueLabel", predictionCol = "prediction", metricName = 'rmse')
rmse = evaluator_ALS.evaluate (test_predict_ALS)
print (f"Nilai RMSE (Root Mean Square Error) adalah {rmse}")

Nilai RMSE (Root Mean Square Error) adalah nan


In [83]:
testNa = test_predict_ALS.count ()
hapusTestNa = test_predict_ALS.dropna (subset = ["prediction"], how = "any")
testNaClean = hapusTestNa.count ()
print (f"Sebelum missingvalue dihapus {testNa} \n Setelah missing value dihapus {testNaClean} \n Nilai prediction yang kosong sebanyak {testNa - testNaClean}" )

Sebelum missingvalue dihapus 39 
 Setelah missing value dihapus 18 
 Nilai prediction yang kosong sebanyak 21


In [85]:
hapusTestNa.show()

+----+--------------------+---------+--------------------+--------------------+-----------+----------+
|soal|             jawaban|trueLabel|         jawabanWord|     MeaningfulWords|hashedValue|prediction|
+----+--------------------+---------+--------------------+--------------------+-----------+----------+
|  12|dibuat grafik yan...|    86.53|[dibuat, grafik, ...|[grafik, prinsip,...|-1017500748|  86.52983|
|  12|dibuat grafik yan...|    86.53|[dibuat, grafik, ...|[grafik, prinsip,...|-1017500748|  86.52983|
|   1|tidak, hanya memb...|    100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238| 99.999886|
|   1|tidak, hanya memb...|    100.0|[tidak,, hanya, m...|[tidak,, membutuh...| 1529302238| 99.999886|
|   6|Ruang Komputer, P...|    100.0|[ruang, komputer,...|[ruang, komputer,...| 2137504343|  99.99986|
|   6|ruang komputer, p...|    100.0|[ruang, komputer,...|[ruang, komputer,...| 2137504343|  99.99986|
|   6|ruang komputer, p...|    100.0|[ruang, komputer,...|[ruang, kompute

# Praktik Model dengan Data Baru 

In [99]:
data_baru = spark.read.option ("delimiter", ",").csv ('data_baru.csv', header = True)
data_baru = data_baru.select (col ('soal').cast ('int'), 'jawaban')
data_baru.show (n=3)

+----+--------------------+
|soal|             jawaban|
+----+--------------------+
|   1|Software atau per...|
|   2|biaya hitung dill...|
|   3|hak cipta telah d...|
+----+--------------------+
only showing top 3 rows



## ALS 

In [100]:
hashedData2 = data_baru.withColumn ("hashedValue", hash ("jawaban"))
hashedData2.select (col('soal').cast ('int'), 'jawaban', 'hashedValue').show()

+----+--------------------+-----------+
|soal|             jawaban|hashedValue|
+----+--------------------+-----------+
|   1|Software atau per...|  614573843|
|   2|biaya hitung dill...|-1737552876|
|   3|hak cipta telah d...| -522118620|
|   4|pada ui ux kompos...| 1287655219|
|   5|dalam menciptakan...|  885020834|
|   6|komputer, softwar...|   30075393|
|   7|suhu udara sangat...| 1921198688|
|   8|dalam bahasa php ...|   61102604|
|   9|janera dab ekspos...| 1072695114|
|  10|operasi kamera di...|-1783918903|
|  11|transfer image bi...| 1164721385|
|  12|Visualisasi data ...| 2136308032|
+----+--------------------+-----------+



In [101]:
hashedData2.dtypes

[('soal', 'int'), ('jawaban', 'string'), ('hashedValue', 'int')]

In [102]:
predict_dataBaru = model_ALS.transform (hashedData2)
predict_dataBaru.show()

+----+--------------------+-----------+----------+
|soal|             jawaban|hashedValue|prediction|
+----+--------------------+-----------+----------+
|  12|Visualisasi data ...| 2136308032|       NaN|
|   1|Software atau per...|  614573843|       NaN|
|   6|komputer, softwar...|   30075393|       NaN|
|   3|hak cipta telah d...| -522118620|       NaN|
|   5|dalam menciptakan...|  885020834|       NaN|
|   9|janera dab ekspos...| 1072695114|       NaN|
|   4|pada ui ux kompos...| 1287655219|       NaN|
|   8|dalam bahasa php ...|   61102604|       NaN|
|   7|suhu udara sangat...| 1921198688|       NaN|
|  10|operasi kamera di...|-1783918903|       NaN|
|  11|transfer image bi...| 1164721385|       NaN|
|   2|biaya hitung dill...|-1737552876|       NaN|
+----+--------------------+-----------+----------+



## Linear Regression

In [120]:
tokenizer_db = Tokenizer (inputCol = "jawaban", outputCol = "jawabanWord")
tokenizer_dataBaru = tokenizer.transform (data_baru)
# tokenizer_dataBaru.show (truncate = True, n= 5)
swr_db = StopWordsRemover(inputCol = tokenizer_db.getOutputCol(), outputCol = "MeaningfulWords", stopWords = Stopwords)
SwRemoved_dataBaru = swr_db.transform (tokenizer_dataBaru)
# SwRemoved_dataBaru.show (truncate = True, n=5)
hashTF_db = HashingTF (inputCol = swr_db.getInputCol(), outputCol = "features")
numeric_db = hashTF_db.transform (SwRemoved_dataBaru).select ('MeaningfulWords','features')
# numeric_db.show (n= 2)
prediksi_dataBaru = model.transform (numeric_db)
prediksi_dataBaru.select ('MeaningfulWords', 'prediction').show()

+--------------------+-----------------+
|     MeaningfulWords|       prediction|
+--------------------+-----------------+
|[software, perang...|76.30296307543964|
|[biaya, hitung, d...|54.43178123581619|
|[hak, cipta, diat...|75.38342909173426|
|[ui, ux, komposis...|70.48831635693952|
|[menciptakan, keb...|77.94684631859252|
|[komputer,, softw...| 75.9182691179372|
|[suhu, udara, ber...| 68.8193460068955|
|[bahasa, php, ket...|70.06438087688855|
|[janera, dab, eks...| 72.3529435441346|
|[operasi, kamera,...|69.76544920634207|
|[transfer, image,...|69.79784940146264|
|[visualisasi, dat...| 70.9032578664313|
+--------------------+-----------------+



In [116]:
tokenizer_dataBaru.show()

+----+--------------------+--------------------+
|soal|             jawaban|         jawabanWord|
+----+--------------------+--------------------+
|   1|Software atau per...|[software, atau, ...|
|   2|biaya hitung dill...|[biaya, hitung, d...|
|   3|hak cipta telah d...|[hak, cipta, tela...|
|   4|pada ui ux kompos...|[pada, ui, ux, ko...|
|   5|dalam menciptakan...|[dalam, menciptak...|
|   6|komputer, softwar...|[komputer,, softw...|
|   7|suhu udara sangat...|[suhu, udara, san...|
|   8|dalam bahasa php ...|[dalam, bahasa, p...|
|   9|janera dab ekspos...|[janera, dab, eks...|
|  10|operasi kamera di...|[operasi, kamera,...|
|  11|transfer image bi...|[transfer, image,...|
|  12|Visualisasi data ...|[visualisasi, dat...|
+----+--------------------+--------------------+



In [117]:
SwRemoved_dataBaru.show (truncate = True, n=5)

+----+--------------------+--------------------+--------------------+
|soal|             jawaban|         jawabanWord|     MeaningfulWords|
+----+--------------------+--------------------+--------------------+
|   1|Software atau per...|[software, atau, ...|[software, perang...|
|   2|biaya hitung dill...|[biaya, hitung, d...|[biaya, hitung, d...|
|   3|hak cipta telah d...|[hak, cipta, tela...|[hak, cipta, diat...|
|   4|pada ui ux kompos...|[pada, ui, ux, ko...|[ui, ux, komposis...|
|   5|dalam menciptakan...|[dalam, menciptak...|[menciptakan, keb...|
+----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [118]:
numeric_db.show (n= 2)

+--------------------+--------------------+
|     MeaningfulWords|            features|
+--------------------+--------------------+
|[software, perang...|(262144,[13851,21...|
|[biaya, hitung, d...|(262144,[44454,45...|
+--------------------+--------------------+
only showing top 2 rows

