In [1]:
from pyspark.sql import SparkSession
# Start a Spark session for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .master('local[*]')   # run locally instead of against a cluster
    .appName('test')
    .getOrCreate()
)

In [2]:
from random import randint
from random import Random

# Private, seeded generator: the synthetic dataset is identical on every
# Restart & Run All, and the global `random` state is left untouched.
rng = Random(13)

# Vocabulary for the two sentence templates (class 0: food, class 1: places).
makanan = ['sate','bakso','mie ayam','gado gado','ayam geprek','tahu tek']
tujuan = ['lapangan sepak bola','mall','pantai','gunung','hutan','isekai']

# Write 10,000 rows of the form `"<sentence>",<label>` to text.csv.
# The `with` block guarantees the handle is flushed and closed even if a
# write fails part-way through; `filetext` stays bound (closed) afterwards.
with open('text.csv', 'w') as filetext:
    for _row in range(10000):
        if rng.randint(0, 1) == 0:
            # Label 0: "saya suka makan <food>"
            filetext.write('"saya suka makan ' + rng.choice(makanan) + '",0\n')
        else:
            # Label 1: "saya sering pergi ke <place>"
            filetext.write('"saya sering pergi ke ' + rng.choice(tujuan) + '",1\n')

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Column layout for text.csv: the sentence first, then its integer class label.
schema = StructType().add("text", StringType()).add("label", IntegerType())

# Read the CSV with the explicit schema above; header=False means the first
# line is treated as data, so the column names come from the schema.
txt = spark.read.schema(schema).csv('text.csv', header=False)

In [5]:
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer
# Clean the `text` column in three passes, each replacing matches with a
# single space: punctuation, then digits, then runs of spaces (which also
# collapses the gaps left by the first two passes).
wrangled = txt
for pattern in ('[_():;,.!?\\-]', '[0-9]', ' +'):
    wrangled = wrangled.withColumn(
        'text', regexp_replace(wrangled['text'], pattern, ' ')
    )

# Split each cleaned sentence into a `words` array column.
tokenizer = Tokenizer(inputCol='text', outputCol='words')
wrangled = tokenizer.transform(wrangled)
wrangled.show(4, truncate=False)

+---------------------------+-----+---------------------------------+
|text                       |label|words                            |
+---------------------------+-----+---------------------------------+
|saya suka makan tahu tek   |0    |[saya, suka, makan, tahu, tek]   |
|saya sering pergi ke hutan |1    |[saya, sering, pergi, ke, hutan] |
|saya sering pergi ke hutan |1    |[saya, sering, pergi, ke, hutan] |
|saya sering pergi ke pantai|1    |[saya, sering, pergi, ke, pantai]|
+---------------------------+-----+---------------------------------+
only showing top 4 rows



In [7]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF
# Keep only the token arrays and labels for feature extraction.
txt = wrangled.select('words', 'label')

# Stage 1: stop-word filtering (words -> terms).
# NOTE(review): no stopWords list is passed, so the library default applies.
# In the captured output every token passes through unchanged; confirm
# whether an Indonesian stop-word list was intended here.
remover = StopWordsRemover(inputCol='words', outputCol='terms')

# Stage 2: hash each term into one of 1024 buckets to get term counts.
hasher = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)
wrangled = hasher.transform(remover.transform(txt))

# Stage 3: fit IDF weights on the hashed counts and rescale them.
# NOTE(review): the IDF is fitted on all rows, before the train/test split
# that happens later in the notebook; confirm this is acceptable.
idf_model = IDF(inputCol='hash', outputCol='features').fit(wrangled)
tf_idf = idf_model.transform(wrangled)
tf_idf.select('terms', 'features').show(4, truncate=False)

+---------------------------------+--------------------------------------------------------------------------------------------------------------+
|terms                            |features                                                                                                      |
+---------------------------------+--------------------------------------------------------------------------------------------------------------+
|[saya, suka, makan, tahu, tek]   |(1024,[47,398,487,505,902],[0.7014809804873822,0.7014809804873822,0.0,2.5183566244698876,2.5183566244698876]) |
|[saya, sering, pergi, ke, hutan] |(1024,[145,411,487,844,965],[0.6846839448997907,0.6846839448997907,0.0,2.4180959395057067,0.6846839448997907])|
|[saya, sering, pergi, ke, hutan] |(1024,[145,411,487,844,965],[0.6846839448997907,0.6846839448997907,0.0,2.4180959395057067,0.6846839448997907])|
|[saya, sering, pergi, ke, pantai]|(1024,[37,145,411,487,965],[2.532098252322184,0.6846839448997907,0.6846839448997907

In [8]:
from pyspark.ml.classification import LogisticRegression
# Model input: just the label and the TF-IDF feature vector.
txt = tf_idf.select('label', 'features')

# 80/20 train/test split with a fixed seed.
splits = txt.randomSplit([0.8, 0.2], seed=13)
txt_train, txt_test = splits

# Fit a regularised logistic regression on the training rows.
estimator = LogisticRegression(regParam=0.2)
logistic = estimator.fit(txt_train)

# Score the held-out rows and count each (label, prediction) pair.
prediction = logistic.transform(txt_test)
confusion = prediction.groupBy('label', 'prediction').count()
confusion.show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|  961|
|    1|       1.0|  978|
+-----+----------+-----+

