In [3]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session: master URL 'local[*]', application
# name 'test'. getOrCreate() hands back an already-running session if one
# exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName('test')
    .master('local[*]')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/22 11:29:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Explicit two-column layout for the CSV: a string `text` column followed by
# an integer `label` column.
schema = (
    StructType()
    .add("text", StringType())
    .add("label", IntegerType())
)
# header=False: the first line of the file is read as data, not column names.
txt = spark.read.csv(path='text.csv', schema=schema, header=False)

In [11]:
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer
# Clean the `text` column in three passes, each overwriting the column:
#   1. listed punctuation characters -> a space
#   2. digits                         -> a space
#   3. runs of spaces                 -> a single space
wrangled = (
    txt
    .withColumn('text', regexp_replace('text', '[_():;,.!?\\-]', ' '))
    .withColumn('text', regexp_replace('text', '[0-9]', ' '))
    .withColumn('text', regexp_replace('text', ' +', ' '))
)
# Tokenise the cleaned text into a new `words` column.
tokenizer = Tokenizer(outputCol='words', inputCol='text')
wrangled = tokenizer.transform(wrangled)
# Preview the first four rows without truncating long cell values.
wrangled.show(4, truncate=False)

+------------------------+-----+-----------------------------+
|text                    |label|words                        |
+------------------------+-----+-----------------------------+
|hari ini cerah          |0    |[hari, ini, cerah]           |
|besok makan terang bulan|1    |[besok, makan, terang, bulan]|
|kemarin makan nasi      |1    |[kemarin, makan, nasi]       |
|hari esok cerah         |0    |[hari, esok, cerah]          |
+------------------------+-----+-----------------------------+



                                                                                

In [13]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF
# Feature extraction: tokens -> stop-word-filtered terms -> hashed term
# frequencies -> TF-IDF weighted `features`.

# Keep only the token list and the target column.
txt = wrangled.select('words', 'label')

# Fix: this step previously transformed `sms`, a name that is only assigned
# in the later classification cell, so the cell raised NameError on a fresh
# kernel (Restart & Run All). The frame selected just above is the input.
# NOTE(review): no stop-word list is passed, so the remover's default list is
# used; in the preview below `terms` matches the tokenised input for the rows
# shown -- confirm the default list suits this corpus.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms').transform(txt)

# Hash each term list into a 1024-slot term-frequency vector.
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024).transform(wrangled)

# Fit IDF weights on the hashed frame and apply them to that same frame.
tf_idf = IDF(inputCol='hash', outputCol='features').fit(wrangled).transform(wrangled)

# Preview the terms next to their TF-IDF vectors.
tf_idf.select('terms', 'features').show(4, truncate=False)

22/08/22 11:36:21 WARN StopWordsRemover: Default locale set was [en_ID]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


                                                                                

+-----------------------------+----------------------------------------------------------------------------------------------------+
|terms                        |features                                                                                            |
+-----------------------------+----------------------------------------------------------------------------------------------------+
|[hari, ini, cerah]           |(1024,[687,817,969],[0.9162907318741551,0.5108256237659907,0.5108256237659907])                     |
|[besok, makan, terang, bulan]|(1024,[47,53,253,327],[0.5108256237659907,0.9162907318741551,0.9162907318741551,0.9162907318741551])|
|[kemarin, makan, nasi]       |(1024,[47,145,598],[0.5108256237659907,0.9162907318741551,0.9162907318741551])                      |
|[hari, esok, cerah]          |(1024,[510,817,969],[0.9162907318741551,0.5108256237659907,0.5108256237659907])                     |
+-----------------------------+----------------------------------------------------------------------------------------------------+

In [14]:
from pyspark.ml.classification import LogisticRegression
# Keep just the target and the TF-IDF vectors for modelling.
sms = tf_idf.select('label', 'features')

# Random train/test split with weights 0.8 / 0.2; the fixed seed makes the
# split repeatable.
sms_train, sms_test = sms.randomSplit(weights=[0.8, 0.2], seed=13)

# Fit a logistic regression with regParam=0.2 on the training portion.
classifier = LogisticRegression(regParam=0.2)
logistic = classifier.fit(sms_train)

# Score the held-out rows and count rows per (label, prediction) pair.
prediction = logistic.transform(sms_test)
confusion = prediction.groupBy('label', 'prediction').count()
confusion.show()

22/08/22 11:41:03 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/08/22 11:41:03 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/08/22 11:41:03 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/08/22 11:41:03 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|    1|
+-----+----------+-----+

