In [1]:
#instal pyspark
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
#import necessary libraries
import re

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline

from pyspark.ml.feature import *

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, RegexTokenizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import time

In [3]:
#start the spark session
spark = SparkSession.builder \
     .master('local[10]') \
     .appName('Spark NLP') \
     .config("spark.driver.memory", "8g") \
     .config("spark.sql.autoBroadcastJoinThreshold", -1) \
     .getOrCreate()

In [4]:
#load train and test data
train_df = spark.read.csv('train.csv', header=True)
test_df = spark.read.csv('test.csv', header=True)

In [5]:
test_df.count()

1380083

In [6]:
# create a train DataFrame containing the sample data
train = train_df.where("label = 1").limit(20000).union(
            train_df.where("label = 2").limit(20000)).union(
            train_df.where("label = 3").limit(20000)).union(
            train_df.where("label = 4").limit(20000)).union(
            train_df.where("label = 5").limit(20000))

In [7]:
# create a test DataFrame containing the sample data
test = test_df.where("label = 1").limit(2000).union(
            test_df.where("label = 2").limit(2000)).union(
            test_df.where("label = 3").limit(2000)).union(
            test_df.where("label = 4").limit(2000)).union(
            test_df.where("label = 5").limit(2000))

In [8]:
df = train.union(test)
df.show(5)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Great selection o...|    1|
|A Disappointing R...|    1|
|Just Listen to Io...|    1|
|Craapshow     p  ...|    1|
|Waste of time and...|    1|
+--------------------+-----+
only showing top 5 rows



### Word 2 Vec Embedding

In [10]:
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

# Define Word2Vec model with input and output columns
word2vec = Word2Vec(vectorSize=100, inputCol="words", outputCol="embeddings")

label_stringIdx = StringIndexer(inputCol = "label", outputCol = "label_1")

#create a pipeline
pipeline = Pipeline(stages=[regexTokenizer, word2vec, label_stringIdx])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

dataset = dataset.drop('label')

In [11]:
dataset.show(5)

+--------------------+--------------------+--------------------+-------+
|                text|               words|          embeddings|label_1|
+--------------------+--------------------+--------------------+-------+
|Great selection o...|[great, selection...|[0.03813548842647...|    0.0|
|A Disappointing R...|[a, disappointing...|[0.03239694607651...|    0.0|
|Just Listen to Io...|[just, listen, to...|[0.03977035559745...|    0.0|
|Craapshow     p  ...|[craapshow, p, i,...|[0.08215042860364...|    0.0|
|Waste of time and...|[waste, of, time,...|[0.05910389709540...|    0.0|
+--------------------+--------------------+--------------------+-------+
only showing top 5 rows



In [12]:
# create a train DataFrame containing the sample data
train_df = dataset.where("label = 1").limit(20000).union(
            dataset.where("label = 2").limit(20000)).union(
            dataset.where("label = 3").limit(20000)).union(
            dataset.where("label = 4").limit(20000)).union(
            dataset.where("label = 5").limit(20000))

# create the test data
test_df = dataset.subtract(train_df)

In [13]:
train_df.show(10)

+--------------------+--------------------+--------------------+-------+
|                text|               words|          embeddings|label_1|
+--------------------+--------------------+--------------------+-------+
|Great selection o...|[great, selection...|[0.03813548842647...|    0.0|
|A Disappointing R...|[a, disappointing...|[0.03239694607651...|    0.0|
|Just Listen to Io...|[just, listen, to...|[0.03977035559745...|    0.0|
|Craapshow     p  ...|[craapshow, p, i,...|[0.08215042860364...|    0.0|
|Waste of time and...|[waste, of, time,...|[0.05910389709540...|    0.0|
|Long and boring  ...|[long, and, borin...|[0.02130732339962...|    0.0|
|too bad  This tur...|[too, bad, this, ...|[-0.0653338500026...|    0.0|
|House of Leaves  ...|[house, of, leave...|[0.03834786598662...|    0.0|
|Disappointing in ...|[disappointing, i...|[0.01091536244273...|    0.0|
|Amazons stock rev...|[amazons, stock, ...|[0.02829726426690...|    0.0|
+--------------------+--------------------+--------

In [14]:
test_df.show(10)

+--------------------+--------------------+--------------------+-------+
|                text|               words|          embeddings|label_1|
+--------------------+--------------------+--------------------+-------+
|Fast cars big sta...|[fast, cars, big,...|[0.04036077261571...|    0.0|
|Great Series But ...|[great, series, b...|[0.01737717171966...|    2.0|
|great app  i love...|[great, app, i, l...|[0.05581083594851...|    3.0|
|slow and steady  ...|[slow, and, stead...|[0.08293303362020...|    2.0|
|Mediocre or less ...|[mediocre, or, le...|[0.04449997604467...|    0.0|
|NOT BAD  The game...|[not, bad, the, g...|[0.04350141303673...|    2.0|
|Good fun  Exercis...|[good, fun, exerc...|[0.01396827468441...|    1.0|
|Its OK  Not bad o...|[its, ok, not, ba...|[0.03238478127673...|    1.0|
|Neil keeps on goi...|[neil, keeps, on,...|[0.07560510362188...|    2.0|
|Worthy Holmes Com...|[worthy, holmes, ...|[0.07943817782683...|    1.0|
+--------------------+--------------------+--------

### Random Forest Classifier

In [15]:
start_time = time.time()

# Define embeddings column as the input feature
input_col = "embeddings"

# Create RandomForestClassifier model
rf = RandomForestClassifier(featuresCol = input_col, labelCol = 'label_1')

# Define the evaluator for classification accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label_1", predictionCol="prediction", metricName="accuracy")

# Create ParamGrid for hyperparameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [4, 10, 15]) \
    .addGrid(rf.numTrees, [50, 100, 150]) \
    .addGrid(rf.maxBins, [20, 32, 64]) \
    .build()

# Create a 5-fold CrossValidator
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator_accuracy,
                          numFolds=5,
                          parallelism=5)

# Train the machine learning model with k-fold cross-validation
cvModel = crossval.fit(train_df)

# Record end time
end_time = time.time()

# Compute running time in seconds
running_time = end_time - start_time

# Convert running time to hours, minutes, and seconds
hours = int(running_time // 3600)
minutes = int((running_time % 3600) // 60)
seconds = int((running_time % 3600) % 60)

# Print start time, end time, and running time in hours, minutes, and seconds
print("Start Time: ", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)))
print("End Time: ", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)))
print("Running Time: {} hours, {} minutes, {} seconds".format(hours, minutes, seconds))

Start Time:  2023-04-27 12:37:36
End Time:  2023-04-27 14:30:29
Running Time: 1 hours, 52 minutes, 53 seconds


In [16]:
# Get the validation accuracy for each fold
cv_results = cvModel.avgMetrics
for i, acc in enumerate(cv_results):
    print("Combination {}: Accuracy = {:.4f}".format(i+1, acc))

Combination 1: Accuracy = 0.4394
Combination 2: Accuracy = 0.4408
Combination 3: Accuracy = 0.4408
Combination 4: Accuracy = 0.4438
Combination 5: Accuracy = 0.4431
Combination 6: Accuracy = 0.4422
Combination 7: Accuracy = 0.4454
Combination 8: Accuracy = 0.4466
Combination 9: Accuracy = 0.4470
Combination 10: Accuracy = 0.4986
Combination 11: Accuracy = 0.4982
Combination 12: Accuracy = 0.4954
Combination 13: Accuracy = 0.5033
Combination 14: Accuracy = 0.5022
Combination 15: Accuracy = 0.5027
Combination 16: Accuracy = 0.5033
Combination 17: Accuracy = 0.5041
Combination 18: Accuracy = 0.5025
Combination 19: Accuracy = 0.5024
Combination 20: Accuracy = 0.5035
Combination 21: Accuracy = 0.5034
Combination 22: Accuracy = 0.5118
Combination 23: Accuracy = 0.5118
Combination 24: Accuracy = 0.5139
Combination 25: Accuracy = 0.5148
Combination 26: Accuracy = 0.5155
Combination 27: Accuracy = 0.5170


In [17]:
# Get the best model from the cross-validation
best_model = cvModel.bestModel

# Extract the parameter values from the best model
param_map = best_model.extractParamMap()

# Print the parameter values
for k, v in param_map.items():
    print("{} = {}".format(k.name, v))

bootstrap = True
cacheNodeIds = False
checkpointInterval = 10
featureSubsetStrategy = auto
featuresCol = embeddings
impurity = gini
labelCol = label_1
leafCol = 
maxBins = 64
maxDepth = 15
maxMemoryInMB = 256
minInfoGain = 0.0
minInstancesPerNode = 1
minWeightFractionPerNode = 0.0
numTrees = 150
predictionCol = prediction
probabilityCol = probability
rawPredictionCol = rawPrediction
seed = -1675970062277907489
subsamplingRate = 1.0


In [18]:
start_time = time.time()
# Make predictions on the test set
predictions = cvModel.transform(test_df)

# Record end time
end_time = time.time()

# Compute running time in seconds
running_time = end_time - start_time

# Convert running time to hours, minutes, and seconds
hours = int(running_time // 3600)
minutes = int((running_time % 3600) // 60)
seconds = int((running_time % 3600) % 60)

# Print start time, end time, and running time in hours, minutes, and seconds
print("Start Time: ", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)))
print("End Time: ", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)))
print("Running Time: {} hours, {} minutes, {} seconds".format(hours, minutes, seconds))

Start Time:  2023-04-27 14:30:29
End Time:  2023-04-27 14:30:30
Running Time: 0 hours, 0 minutes, 0 seconds


In [19]:
#calculate classification accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label_1", predictionCol="prediction", metricName="accuracy")
evaluator.evaluate(predictions)

0.5097686375321336

In [20]:
# Instantiate the RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="label_1", predictionCol="prediction", metricName="mae")

# Calculate the MAE
evaluator.evaluate(predictions)

0.6835475578406169