In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext("local[*]")
spark = SparkSession.builder.getOrCreate()

In [3]:
import numpy as np
from pyspark.sql import functions as F

# Data preprocessing

In [4]:
tFile="data\IMDB Dataset.csv.bz2"
df0 = spark.read.csv(tFile,header=True)
df0.show(10)

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|A wonderful littl...| positive|
|I thought this wa...| positive|
|Basically there's...| negative|
|Petter Mattei's L...| positive|
|Probably my all-t...| positive|
|I sure would like...| positive|
|This show was an ...| negative|
|Encouraged by the...| negative|
|If you like origi...| positive|
+--------------------+---------+
only showing top 10 rows



In [5]:
#Sample the data for faster model training (use the full dataset in reality)
df0 = df0.sample(0.25, seed=200)

In [6]:
# Convert sentiment to numbers positive =1, negative =0
df0 = df0.withColumn("label", F.when(F.col("sentiment")=="positive",1).otherwise(0)).cache()

In [27]:
# Remove html tags from text
df0 = df0.withColumn("text_c", F.regexp_replace(F.col("text"), r'<[^>]+>', ""));
# Remove non-letters
df0 = df0.withColumn("text_c", F.regexp_replace("text_c", r"[^a-zA-Z ]", ""))
# Remove words 1, 2 char
df0 = df0.withColumn("text_c", F.regexp_replace("text_c", r"\b\w{1,2}\b", ""))
df0.toPandas().tail(5)

Unnamed: 0,text,sentiment,label,text_c
12643,"To be hones, I used to like this show and watc...",negative,0,hones used like this show and watch regul...
12644,This movie is a disgrace to the Major League F...,negative,0,This movie disgrace the Major League Franch...
12645,John Garfield plays a Marine who is blinded by...,positive,1,John Garfield plays Marine who blinded gre...
12646,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,Bad plot bad dialogue bad acting idiotic direc...
12647,I'm going to have to disagree with the previou...,negative,0,going have disagree with the previous comme...


### Lemmatization (optional)
Lemmatization is the process of reducing a word to its base or root form, which is also known as a lemma. The purpose of lemmatization is to simplify text and make it easier to analyze by grouping together different forms of the same word. For example, the words "running," "ran," and "runs" can all be reduced to the base form "run" through lemmatization. 

However, lemmatization can be a time-consuming operation, especially when dealing with large amounts of text data. This is because the process involves analyzing each word in a text and identifying its base form. It also requires a comprehensive understanding of the grammatical rules of a language to accurately identify the correct lemma for each word.

Despite its time-consuming nature, lemmatization can be a powerful tool in natural language processing and text analysis. It can help with tasks such as sentiment analysis, topic modeling, and text classification. When using lemmatization, it's important to use it carefully and correctly to ensure that the text is properly processed and analyzed.

In [58]:
import spacy
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Load the spaCy model
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

# Define a function to apply the lemmatizer to a text
def lemmatize_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)

# Define a UDF to apply the lemmatizer to a column
lemmatize_udf = udf(lemmatize_text, StringType())

# Apply the UDF to a DataFrame column
df0 = df0.withColumn("text_c", lemmatize_udf(df0["text_c"]))

# Caching must be used !!!!!!
df0 = df0.cache()
df0.toPandas().tail(5)

Unnamed: 0,text,sentiment,label,text_c
12643,"To be hones, I used to like this show and watc...",negative,0,hones used like this show and watch r...
12644,This movie is a disgrace to the Major League F...,negative,0,this movie disgrace the major league fran...
12645,John Garfield plays a Marine who is blinded by...,positive,1,john garfield plays marine who blinded ...
12646,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,bad plot bad dialogue bad acting idiotic direc...
12647,I'm going to have to disagree with the previou...,negative,0,going have disagree with the previous co...


### Split the text to training and testing dataset

In [101]:
# Split the data in train and test (80%-20%)
df, test = df0.randomSplit(weights=[0.8,0.2], seed=200)
df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 5016|
|    0| 5138|
+-----+-----+



### Assign weights to classes

In [61]:
# Create a weight of each class
from pyspark.sql import functions as F
p_weight = df.filter('label == 1').count()/ df.count()
n_weight = df.filter('label == 0').count()/ df.count()
print(n_weight, p_weight)

0.5060074847350797 0.4939925152649202


In [62]:
df = df.withColumn("weight", F.when(F.col("label")==1,n_weight).otherwise(p_weight))
df.show(5)

+--------------------+---------+-----+--------------------+------------------+
|                text|sentiment|label|              text_c|            weight|
+--------------------+---------+-----+--------------------+------------------+
| Så som i himmele...| positive|    1|   som   himmelen...|0.5060074847350797|
| While sporadical...| negative|    0|  while sporadica...|0.4939925152649202|
|'Blue Desert' may...| negative|    0|blue desert may h...|0.4939925152649202|
|'Checking Out' is...| positive|    1|checking out    e...|0.5060074847350797|
|'Presque Rien' ('...| positive|    1|presque rien come...|0.5060074847350797|
+--------------------+---------+-----+--------------------+------------------+
only showing top 5 rows



In [63]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import PCA
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml import Pipeline

# Data transformation

### Preprocessing pipeline
Pipelines in Spark are a powerful tool for data processing and analysis, as they enable the creation of complex data workflows that can be executed efficiently on distributed computing systems. They also simplify the data processing and analysis tasks, as they enable the chaining of multiple stages into a single workflow.

In [64]:
# Tokenize the review text
tokenizer = Tokenizer(inputCol="text_c", outputCol="words",)
# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
# Create a count vectoriser
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="rawFeatures", vocabSize=1000)
# Calculate the TF-IDF
idf = IDF(inputCol=countVectorizer.getOutputCol(), outputCol="featuresIDF")
# Crate a preprocessing pipeline wiht 4 stages
pipeline_p = Pipeline(stages=[tokenizer,remover, countVectorizer, idf])

In [65]:
# Learn the data preprocessing model
data_model = pipeline_p.fit(df)

In [66]:
# Transform
transformed_data = data_model.transform(df)
transformed_data.toPandas().head(5)

Unnamed: 0,text,sentiment,label,text_c,weight,words,filtered,rawFeatures,featuresIDF
0,Så som i himmelen .. as above so below.. tha...,positive,1,som himmelen above below that very ...,0.506007,"[, , , som, , , himmelen, , , , , above, , , b...","[, , , som, , , himmelen, , , , , , , special,...","(77.0, 0.0, 4.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0.015166436921203202, 0.0, 2.4572384294370635..."
1,While sporadically engrossing (including a fe...,negative,0,while sporadically engrossing including fe...,0.493993,"[, , while, sporadically, engrossing, includin...","[, , sporadically, engrossing, including, , , ...","(72.0, 0.0, 3.0, 3.0, 0.0, 3.0, 0.0, 0.0, 0.0,...","(0.014181603354891307, 0.0, 1.8429288220777975..."
2,'Blue Desert' may have had the potential to be...,negative,0,blue desert may have had the potential even...,0.493993,"[blue, desert, may, have, had, the, potential,...","[blue, desert, may, potential, , , , even, , ,...","(93.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0,...","(0.01831790433340127, 1.0085643367770025, 0.61..."
3,'Checking Out' is an extraordinary film that t...,positive,1,checking out extraordinary film that towers...,0.506007,"[checking, out, , , , extraordinary, film, tha...","[checking, , , , extraordinary, film, towers, ...","(40.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0,...","(0.00787866853049517, 0.0, 1.8429288220777975,..."
4,'Presque Rien' ('Come Undone') is an earlier w...,positive,1,presque rien come undone earlier work the...,0.506007,"[presque, rien, come, undone, , , , earlier, w...","[presque, rien, come, undone, , , , earlier, w...","(232.0, 0.0, 6.0, 0.0, 3.0, 0.0, 0.0, 0.0, 8.0...","(0.04569627747687199, 0.0, 3.685857644155595, ..."


In [67]:
# Transfomr the test data
transformed_test = data_model.transform(test)
transformed_test.toPandas().head(5)

Unnamed: 0,text,sentiment,label,text_c,words,filtered,rawFeatures,featuresIDF
0,'Don't Look In the Basement' is so easy to kno...,positive,1,do nt look the basement easy knock but ...,"[do, nt, look, , , the, basement, , , , easy, ...","[nt, look, , , basement, , , , easy, , , knock...","(63.0, 0.0, 0.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0,...","(0.012408902935529893, 0.0, 0.0, 2.05579531363..."
1,"*Flat SPOILERS* <br /><br />Five med students,...",positive,1,flat spoilers five med students nelson kiefer ...,"[flat, spoilers, five, med, students, nelson, ...","[flat, spoilers, five, med, students, nelson, ...","(67.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 2.0, 1.0,...","(0.01319676978857941, 0.5042821683885013, 0.0,..."
2,.... may seem far fetched.... but there really...,negative,0,may seem far fetched but there really was ...,"[, , may, seem, far, fetched, but, there, real...","[, , may, seem, far, fetched, really, , , real...","(69.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0,...","(0.013590703215104168, 0.0, 0.0, 0.0, 0.0, 0.7..."
3,...Our the grandpa's hour.<br /><br />More tha...,positive,1,our the grandpas hourmore than the gangsters i...,"[our, the, grandpas, hourmore, than, the, gang...","[grandpas, hourmore, gangsters, , , detailed, ...","(36.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0,...","(0.007090801677445653, 0.0, 0.6143096073592659..."
4,...but I regret having seen it. Since the rati...,negative,0,but regret having seen since the ratings ...,"[but, , , regret, having, seen, , , since, the...","[, , regret, seen, , , since, ratings, , , imd...","(75.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...","(0.014772503494678443, 0.0, 0.6143096073592659..."


In [68]:
# Print the sages of the pipeline
data_model.stages

[Tokenizer_c52fb5369e00,
 StopWordsRemover_e308724e010f,
 CountVectorizerModel: uid=CountVectorizer_6d30a193627e, vocabularySize=1000,
 IDFModel: uid=IDF_6e0660033640, numDocs=10154, numFeatures=1000]

In [69]:
# Get the vocabulary of the CountVectroizer
data_model.stages[2].vocabulary[:20]

['',
 'movie',
 'film',
 'nt',
 'one',
 'like',
 'good',
 'even',
 'time',
 'really',
 'see',
 'story',
 'much',
 'well',
 'get',
 'bad',
 'great',
 'also',
 'people',
 'first']

# Metics for the model

In [92]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MultilabelMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

def m_metrics(ml_model,test_data):
    predictions = ml_model.transform(test_data).cache()
    predictionAndLabels = predictions.select("label","prediction").rdd.map(lambda x: (float(x[0]), float(x[1]))).cache()
    
    # Print some predictions vs labels
    print(predictionAndLabels.take(10))

    metrics = MulticlassMetrics(predictionAndLabels)
    
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print(f"Precision = {precision:.3f} Recall = {recall:.3f} F1 Score = {f1Score:.3f}")

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    print("Confusion matrix \n", metrics.confusionMatrix().toArray().astype(int))

def m_metrics_l(ml_model,test_data):
    predictions = ml_model.transform(test_data).cache()
    predictionAndLabels = predictions.select("label","prediction").rdd.map(lambda x: (float(x[0]), float(x[1]))).cache()
    
    # Print some predictions vs labels
    # print(predictionAndLabels.take(10))
    metrics = MulticlassMetrics(predictionAndLabels)
    
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print(f"Precision = {precision:.4f} Recall = {recall:.4f} F1 Score = {f1Score:.4f}")
    print("Confusion matrix \n", metrics.confusionMatrix().toArray().astype(int))

# ML Model

In [107]:
import time
cassifier = LogisticRegression(maxIter=10, regParam=0.1, featuresCol = "featuresIDF", weightCol="weight")
start = time.time()
pipeline = Pipeline(stages=[cassifier])
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 0.92s.
Precision = 0.8800 Recall = 0.8299 F1 Score = 0.8542
Confusion matrix 
 [[ 989  153]
 [ 230 1122]]
Total time 6.50s.


### GBTClassifier

In [119]:
cassifier = GBTClassifier(maxIter=10, featuresCol = "featuresIDF", weightCol="weight", maxDepth=5)
pipeline = Pipeline(stages=[cassifier])
start = time.time()
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 3.65s.
Precision = 0.8729 Recall = 0.6837 F1 Score = 0.7668
Confusion matrix 
 [[ 704  162]
 [ 515 1113]]
Total time 9.24s.


### LinearSVC

In [122]:
cassifier = LinearSVC(maxIter=10, regParam=0.1, featuresCol = "featuresIDF", weightCol="weight")
pipeline = Pipeline(stages=[cassifier])
start = time.time()
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 1.22s.
Precision = 0.8745 Recall = 0.8284 F1 Score = 0.8508
Confusion matrix 
 [[ 988  160]
 [ 231 1115]]
Total time 7.24s.


### MultilayerPerceptronClassifier

In [125]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
# The input layer must mathc the dimensionality of the input data currently = 1000
layers = [1000, 30, 2]

# create the trainer and set its parameters
cassifier = MultilayerPerceptronClassifier(maxIter=10, layers=layers,featuresCol = "featuresIDF", blockSize=128, seed=1234)
pipeline = Pipeline(stages=[cassifier])
start = time.time()
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 1.52s.
Precision = 0.8722 Recall = 0.8305 F1 Score = 0.8508
Confusion matrix 
 [[ 992  163]
 [ 227 1112]]
Total time 7.29s.


# Features Selections
In Spark the following Feature Selectors are available
- VectorSlicer
- RFormula
- ChiSqSelector
- UnivariateFeatureSelector
- VarianceThresholdSelector

In [93]:
tokenizer = Tokenizer(inputCol="text_c", outputCol="words",)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="rawFeatures", vocabSize=1000)
idf = IDF(inputCol=countVectorizer.getOutputCol(), outputCol="featuresIDF")
selector = ChiSqSelector(numTopFeatures=200, featuresCol=idf.getOutputCol(), outputCol="features", labelCol="label")
# Crate a preprocessing pipeline wiht 5 stages
pipeline_p = Pipeline(stages=[tokenizer,remover, countVectorizer, idf,selector])
# Learn the data preprocessing model
data_model = pipeline_p.fit(df)

In [94]:
# Transform
transformed_data = data_model.transform(df)
transformed_data.toPandas().head(2)

Unnamed: 0,text,sentiment,label,text_c,weight,words,filtered,rawFeatures,featuresIDF,features
0,Så som i himmelen .. as above so below.. tha...,positive,1,som himmelen above below that very ...,0.506007,"[, , , som, , , himmelen, , , , , above, , , b...","[, , , som, , , himmelen, , , , , , , special,...","(77.0, 0.0, 4.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0.015166436921203202, 0.0, 2.4572384294370635...","(0.0, 0.513948828408791, 0.0, 0.0, 3.741203234..."
1,While sporadically engrossing (including a fe...,negative,0,while sporadically engrossing including fe...,0.493993,"[, , while, sporadically, engrossing, includin...","[, , sporadically, engrossing, including, , , ...","(72.0, 0.0, 3.0, 3.0, 0.0, 3.0, 0.0, 0.0, 0.0,...","(0.014181603354891307, 0.0, 1.8429288220777975...","(0.0, 1.541846485226373, 2.3927342982952755, 0..."


In [96]:
# Transfomr the test data
transformed_test = data_model.transform(test)
transformed_test.toPandas().head(2)

Unnamed: 0,text,sentiment,label,text_c,words,filtered,rawFeatures,featuresIDF,features
0,'Don't Look In the Basement' is so easy to kno...,positive,1,do nt look the basement easy knock but ...,"[do, nt, look, , , the, basement, , , , easy, ...","[nt, look, , , basement, , , , easy, , , knock...","(63.0, 0.0, 0.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0,...","(0.012408902935529893, 0.0, 0.0, 2.05579531363...","(0.0, 2.055795313635164, 0.7975780994317585, 0..."
1,"*Flat SPOILERS* <br /><br />Five med students,...",positive,1,flat spoilers five med students nelson kiefer ...,"[flat, spoilers, five, med, students, nelson, ...","[flat, spoilers, five, med, students, nelson, ...","(67.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 2.0, 1.0,...","(0.01319676978857941, 0.5042821683885013, 0.0,...","(0.5042821683885013, 0.0, 0.7975780994317585, ..."


In [98]:
cassifier = LogisticRegression(maxIter=5, featuresCol = "features")
start = time.time()
pipeline = Pipeline(stages=[cassifier])
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 0.85s.
Precision = 0.8620 Recall = 0.8105 F1 Score = 0.8354
Confusion matrix 
 [[ 962  176]
 [ 257 1099]]
Total time 6.41s.
