In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, lower, trim, col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, ClusteringEvaluator
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA

# **1. Initialize Spark Session**

In [2]:
# 1. Initialize Spark Session
spark = SparkSession.builder \
    .appName("SentimentAnalysis") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .getOrCreate()


# **2. Load and Filter Dataset**

In [24]:
# 2. Load dataset (replace 'your_dataset.csv' with actual file path)
data = spark.read.csv('/content/IMDB Dataset.csv', header=True, inferSchema=True)

In [25]:
data.show()

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
|"Probably my all-...| but that only ma...|
|I sure would like...|            positive|
|This show was an ...|            negative|
|Encouraged by the...|            negative|
|If you like origi...|            positive|
|"Phil the Alien i...|            negative|
|I saw this movie ...|            negative|
|"So im not a big ...| meaning most of ...|
|The cast played S...|            negative|
|This a fantastic ...|            positive|
|Kind of drawn in ...|            negative|
|Some films just s...|            positive|
|This movie made i...|            negative|
|I remember this f...|            positive|
|An awful film! It...|          

In [29]:
# Instead of data.info(), use the following methods to get information about the DataFrame:

# Print the schema:
data.printSchema()

# Get a summary of the DataFrame (similar to Pandas describe()):
data.summary().show()

# Check the number of rows:
data.count()

# Display some data:
data.show(5) # Shows the first 5 rows

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)

+-------+--------------------+--------------------+
|summary|              review|           sentiment|
+-------+--------------------+--------------------+
|  count|               30097|               30092|
|   mean|                NULL|              1240.0|
| stddev|                NULL|   1025.802960751096|
|    min|!!!! MILD SPOILER...|   "" If you need me|
|    25%|                NULL|                 0.0|
|    50%|                NULL|              1940.0|
|    75%|                NULL|              2000.0|
|    max|zero day is based...|you don't just ha...|
+-------+--------------------+--------------------+

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|    

In [4]:
# Check for null or missing values in the sentiment column
data = data.filter(data.sentiment.isNotNull())

In [27]:
data.schema

StructType([StructField('review', StringType(), True), StructField('sentiment', StringType(), True)])

In [5]:
# Ensure only valid sentiments are processed (assuming 'positive' and 'negative' are the valid labels)
valid_sentiments = ['positive', 'negative']
data = data.filter(col('sentiment').isin(valid_sentiments))

#**3. Data Preprocessing**

In [6]:

data = data.withColumn('review', trim(lower(regexp_replace(col('review'), '[^a-zA-Z\s]', ''))))

In [7]:
# Tokenization
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
data = tokenizer.transform(data)

In [8]:
# Remove stop words
stopwords_remover = StopWordsRemover(inputCol='tokens', outputCol='filtered_tokens')
data = stopwords_remover.transform(data)

In [9]:
# Convert text to features using CountVectorizer and TF-IDF
vectorizer = CountVectorizer(inputCol='filtered_tokens', outputCol='raw_features', vocabSize=3000)
vectorized_model = vectorizer.fit(data)
data = vectorized_model.transform(data)

In [10]:

idf = IDF(inputCol='raw_features', outputCol='features')
idf_model = idf.fit(data)
data = idf_model.transform(data)

In [11]:

# Reduce feature dimensionality with PCA
pca = PCA(k=50, inputCol='features', outputCol='pca_features')
pca_model = pca.fit(data)
data = pca_model.transform(data)

In [12]:
# Encode the labels
label_indexer = StringIndexer(inputCol='sentiment', outputCol='label')
data = label_indexer.fit(data).transform(data)

In [13]:
# Split data into training and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# **4. Supervised Learning Models**

# 4.1 Logistic Regression

In [14]:

lr = LogisticRegression(featuresCol='pca_features', labelCol='label')
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

In [15]:

# Evaluate Logistic Regression
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

Logistic Regression Accuracy: 0.7518


# 4.2 Gradient Boosting Trees

In [16]:

gbt = GBTClassifier(featuresCol='pca_features', labelCol='label', maxIter=10)
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)
gbt_accuracy = evaluator.evaluate(gbt_predictions)
print(f"Gradient Boosting Trees Accuracy: {gbt_accuracy:.4f}")

Gradient Boosting Trees Accuracy: 0.7150


# 4.3 Multilayer Perceptron Classifier

In [17]:

layers = [50, 25, 10, 2]  # Input layer, two hidden layers, output layer
mlp = MultilayerPerceptronClassifier(featuresCol='pca_features', labelCol='label', layers=layers, maxIter=100)
mlp_model = mlp.fit(train_data)
mlp_predictions = mlp_model.transform(test_data)
mlp_accuracy = evaluator.evaluate(mlp_predictions)
print(f"Multilayer Perceptron Accuracy: {mlp_accuracy:.4f}")

Multilayer Perceptron Accuracy: 0.7174


# 4.4 Linear Support Vector Classifier

In [18]:

svc = LinearSVC(featuresCol='pca_features', labelCol='label', maxIter=10)
svc_model = svc.fit(train_data)
svc_predictions = svc_model.transform(test_data)
svc_accuracy = evaluator.evaluate(svc_predictions)
print(f"Linear SVC Accuracy: {svc_accuracy:.4f}")

Linear SVC Accuracy: 0.7494


# **5. Unsupervised Learning Models**


# 5.1 K-Means Clustering

In [19]:

kmeans = KMeans(featuresCol='pca_features', k=2, seed=42)
kmeans_model = kmeans.fit(data)
kmeans_predictions = kmeans_model.transform(data)
kmeans_evaluator = ClusteringEvaluator()
silhouette_score_kmeans = kmeans_evaluator.evaluate(kmeans_predictions)
print(f"K-Means Silhouette Score: {silhouette_score_kmeans:.4f}")

K-Means Silhouette Score: 0.5864


In [20]:
# Encode the labels
label_indexer_model = StringIndexer(inputCol='sentiment', outputCol='label2').fit(data)
data = label_indexer_model.transform(data)

# Retrieve label names
labels = label_indexer_model.labels

# **Evaluation**

In [21]:
from sklearn.metrics import classification_report, confusion_matrix

# Function to calculate and print evaluation metrics
def evaluate_model(predictions, label_col='label', prediction_col='prediction', labels=None):
    y_true = predictions.select(label_col).toPandas().to_numpy()
    y_pred = predictions.select(prediction_col).toPandas().to_numpy()
    print(classification_report(y_true, y_pred, target_names=labels))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

In [22]:
# Logistic Regression Evaluation
print("\nLogistic Regression Metrics:")
evaluate_model(lr_predictions, label_col='label', prediction_col='prediction', labels=labels)

# Gradient Boosting Trees Evaluation
print("\nGradient Boosting Trees Metrics:")
evaluate_model(gbt_predictions, label_col='label', prediction_col='prediction', labels=labels)

# Multilayer Perceptron Evaluation
print("\nMultilayer Perceptron Metrics:")
evaluate_model(mlp_predictions, label_col='label', prediction_col='prediction', labels=labels)

# Linear SVC Evaluation
print("\nLinear SVC Metrics:")
evaluate_model(svc_predictions, label_col='label', prediction_col='prediction', labels=labels)


Logistic Regression Metrics:
              precision    recall  f1-score   support

    positive       0.77      0.78      0.78       224
    negative       0.73      0.72      0.72       183

    accuracy                           0.75       407
   macro avg       0.75      0.75      0.75       407
weighted avg       0.75      0.75      0.75       407

Confusion Matrix:
[[175  49]
 [ 52 131]]

Gradient Boosting Trees Metrics:
              precision    recall  f1-score   support

    positive       0.74      0.75      0.74       224
    negative       0.69      0.67      0.68       183

    accuracy                           0.71       407
   macro avg       0.71      0.71      0.71       407
weighted avg       0.71      0.71      0.71       407

Confusion Matrix:
[[168  56]
 [ 60 123]]

Multilayer Perceptron Metrics:
              precision    recall  f1-score   support

    positive       0.74      0.75      0.75       224
    negative       0.69      0.68      0.68       183

    