In [5]:
from pyspark.sql.functions import col, split, when
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF,IDF,StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# create a SparkSession object
spark = SparkSession.builder \
    .appName("MyApp") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
# load data

data = spark.read.csv("./IMDB Dataset.csv", header=True, inferSchema=True)

# prepare data
data = data.select("review", "sentiment")
data = data.withColumn("label", when(col("sentiment") == "positive", 1).otherwise(0))

# split data into training and test sets
train, test = data.randomSplit([0.6, 0.4], seed=42)

# extract features
hashing_tf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
label_indexer = StringIndexer(inputCol="label", outputCol="label_indexed").fit(data)

# tokenize training data
train_words = train.select("review", "label").withColumn("words", split(col("review"), "\s+"))

# transform training data|
train_raw_features = hashing_tf.transform(train_words)
train_features = idf.fit(train_raw_features).transform(train_raw_features)
train_features = label_indexer.transform(train_features)

# train Naive Bayes model
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="label_indexed")
model = nb.fit(train_features)

# tokenize test data
test_words = test.select("review", "label").withColumn("words", split(col("review"), "\s+"))

# transform test data
test_raw_features = hashing_tf.transform(test_words)
test_features = idf.fit(test_raw_features).transform(test_raw_features)
test_features = label_indexer.transform(test_features)

# make predictions
predictions = model.transform(test_features)
predictions.show()

# evaluate model performance
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_indexed", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = {:.2f}%".format(accuracy * 100))

# create a DataFrame with the reviews to be predicted
new_data = spark.read.csv("Predict Dataset.csv", header=True, inferSchema=True)

# transform new data
new_data_words = new_data.select("review").withColumn("words", split(col("review"), "\s+"))
new_data_raw_features = hashing_tf.transform(new_data_words)
new_data_features = idf.fit(new_data_raw_features).transform(new_data_raw_features)

# add sentiment analysis
new_data_predictions = model.transform(new_data_features)
new_data_predictions = new_data_predictions.withColumn("sentiment", when(col("prediction") == 1, "positive").otherwise("negative"))

# show predictions
new_data_predictions.show()


from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator


# vectorize the text data
assembler = VectorAssembler(inputCols=["features"], outputCol="vectorized_features")
train_vectorized = assembler.transform(train_features)

# train k-means model
kmeans = KMeans(k=2, seed=1)
model_kmeans = kmeans.fit(train_vectorized)

# assign cluster labels to the data
train_clustered = model_kmeans.transform(train_vectorized)

# display the cluster assignments
train_clustered.show()

# evaluate clustering performance
evaluator = ClusteringEvaluator(predictionCol="prediction", featuresCol="vectorized_features")
silhouette = evaluator.evaluate(train_clustered)
print("Silhouette Score = {:.2f}".format(silhouette))

+--------------------+-----+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+----------+
|              review|label|               words|         rawFeatures|            features|label_indexed|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+----------+
|""" Så som i himm...|    0|[""", Så, som, i,...|(65536,[1076,1981...|(65536,[1076,1981...|          0.0|[-1803.6744929212...|[0.99972107169651...|       0.0|
|"""A lot of the f...|    0|["""A, lot, of, t...|(65536,[1578,1981...|(65536,[1578,1981...|          0.0|[-360.48884114161...|[0.99999984816176...|       0.0|
|"""A wrong-doer i...|    0|["""A, wrong-doer...|(65536,[1714,5616...|(65536,[1714,5616...|          0.0|[-355.76194808462...|[0.99999991626398...|       0.0|
|"""Ally McBeal"" ...|    0|["""Ally, McBeal".