In [1]:
# Install pyspark and findspark
!pip install --ignore-install -q pyspark
# Install findspark library
!pip install --ignore-install -q findspark

In [2]:
# Import findspark
import findspark
findspark.init()

In [3]:
import sys
sys.version_info
print(sys.version)

3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]


In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### 1. Set up spark context and SparkSession

In [5]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql.functions import col, udf,regexp_replace,isnull
from pyspark.sql.types import StringType,IntegerType
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, LogisticRegression, DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import rand


In [6]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("PySpark-TextClassifier") \
    .getOrCreate()

### 2. Load dataset

In [7]:
data = spark.read.csv("/content/drive/MyDrive/Colab Notebooks/rSingapore_sample_1100comment_with_submission.csv", header=True, inferSchema=True)

In [8]:
# randomize labels
data = data.dropna()
data = data.withColumn("label", (rand() <= 0.2).cast("int"))
data.show(5)

+-------+-------------------+--------------------+-------+------------+--------------------+--------------------+-----+
|     id|                 dt|          submission|upvotes|upvote_ratio|              author|                body|label|
+-------+-------------------+--------------------+-------+------------+--------------------+--------------------+-----+
|kq2e83o|2024-02-12 10:42:02|Medicine in Malay...|    162|        0.87|           jespep831|Brunei is like Ju...|    0|
|kq2e83o|2024-02-12 10:42:02|Medicine in Malay...|    162|        0.87|           jespep831|Brunei is like Ju...|    0|
|kq2e8y7|2024-02-12 10:42:20|'Disgusting and u...|     31|        0.83|       TheLastHarlow|😭 it doesn’t hel...|    0|
|kq2e8zs|2024-02-12 10:42:21|Commentary: Low i...|     30|         0.9|Budget-Juggernaut-68|It's a tax for an...|    0|
|kq2e9m6|2024-02-12 10:42:35|Last kampung hous...|     86|        0.94|             0bxcura|"The government "...|    0|
+-------+-------------------+------------

### 3. Text Processing and Modelling



In [9]:
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="body", outputCol="words", pattern="\\W")
# stop words
add_stopwords = ["is","like","and","the"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features")

In [10]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

In [11]:
# pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover,countVectors])
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+-------+-------------------+--------------------+-------+------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|     id|                 dt|          submission|upvotes|upvote_ratio|              author|                body|label|               words|            filtered|         rawFeatures|            features|
+-------+-------------------+--------------------+-------+------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|kq2e83o|2024-02-12 10:42:02|Medicine in Malay...|    162|        0.87|           jespep831|Brunei is like Ju...|    0|[brunei, is, like...|[brunei, jurassic...|(10000,[341,820,3...|(10000,[341,820,3...|
|kq2e83o|2024-02-12 10:42:02|Medicine in Malay...|    162|        0.87|           jespep831|Brunei is like Ju...|    0|[brunei, is, like...|[brunei, jurassic...|(10000,[341,820,3...|(1

In [12]:
# Train test split
(training_data, testing_data) = dataset.randomSplit([0.8, 0.2], seed=123)

In [13]:
# Logistic Regression
lr = LogisticRegression(maxIter=5, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(training_data)
predictions = lrModel.transform(testing_data)
predictions.filter(predictions['prediction'] == 1) \
    .select("body","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+------------------------------+-----+----------+
|                          body|label|prediction|
+------------------------------+-----+----------+
|You should write in to SBS ...|    1|       1.0|
|Regardless of one's view of...|    0|       1.0|
|From what I hear, I'm sligh...|    0|       1.0|
|Pretty sure this is already...|    1|       1.0|
+------------------------------+-----+----------+



In [14]:
# DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=5, minInfoGain=0.001, impurity="entropy")
dtModel = dt.fit(training_data)
predictions = dtModel.transform(testing_data)


predictions.filter(predictions['prediction'] == 1) \
    .select("body","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+------------------------------+-----+----------+
|                          body|label|prediction|
+------------------------------+-----+----------+
|>Our gov already turning to...|    0|       1.0|
|How about you don't do high...|    1|       1.0|
|After all the scams going o...|    0|       1.0|
|Japan - went recently. I fo...|    0|       1.0|
+------------------------------+-----+----------+



In [15]:
# RFClassifier

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
rfModel = rf.fit(training_data)
predictions = rfModel.transform(testing_data)

predictions.filter(predictions['prediction'] == 1) \
    .select("body","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+----+-----+----------+
|body|label|prediction|
+----+-----+----------+
+----+-----+----------+



In [16]:
# GBT Classifier
gbt = GBTClassifier(labelCol="label", \
                            featuresCol="features", \
                            maxIter = 5)
gbtModel = gbt.fit(training_data)
predictions = gbtModel.transform(testing_data)

predictions.filter(predictions['prediction'] == 1) \
    .select("body","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+------------------------------+-----+----------+
|                          body|label|prediction|
+------------------------------+-----+----------+
|Some people are absolute cu...|    0|       1.0|
|After all the scams going o...|    0|       1.0|
|Max grant 80k TODAY. I'm ta...|    0|       1.0|
|How about you don't do high...|    1|       1.0|
|They do have dedicated gate...|    1|       1.0|
|I think main issue with nuc...|    0|       1.0|
|>Our gov already turning to...|    0|       1.0|
|I see the future in small, ...|    0|       1.0|
|From the dimension of natio...|    0|       1.0|
+------------------------------+-----+----------+



### 4. Evaluation

In [17]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")


In [18]:

# ParamGrid
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())
# n-fold CrossValidator
cv = CrossValidator(estimator=rf, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=4)
cvModel = cv.fit(training_data)

predictions = cvModel.transform(testing_data)

In [19]:
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.79


5. Visualize the Decision Tree

In [20]:
print(rfModel.toDebugString)

RandomForestClassificationModel: uid=RandomForestClassifier_98ad0ea3ebe1, numTrees=100, numClasses=2, numFeatures=10000
  Tree 0 (weight 1.0):
    If (feature 3996 <= 2.3093572055479243)
     If (feature 594 <= 1.9192779267731368)
      If (feature 6501 <= 2.4096925532789997)
       If (feature 8501 <= 2.612425107333082)
        Predict: 0.0
       Else (feature 8501 > 2.612425107333082)
        Predict: 1.0
      Else (feature 6501 > 2.4096925532789997)
       Predict: 1.0
     Else (feature 594 > 1.9192779267731368)
      If (feature 3921 <= 1.376964903426452)
       If (feature 5741 <= 1.3172915246101689)
        Predict: 0.0
       Else (feature 5741 > 1.3172915246101689)
        Predict: 1.0
      Else (feature 3921 > 1.376964903426452)
       Predict: 1.0
    Else (feature 3996 > 2.3093572055479243)
     Predict: 0.0
  Tree 1 (weight 1.0):
    If (feature 5169 <= 2.122010480827219)
     If (feature 9147 <= 2.225830163216341)
      If (feature 9797 <= 2.535349767419453)
       If 

In [21]:
# Stop the Spark session
spark.stop()