# In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train and evaluate a TF-IDF + logistic-regression classifier that predicts
# the "is_sarcastic" label from headline text.
#
# Pipeline is imported here (rather than in the import block at the top) so
# this script section is self-contained.
from pyspark.ml import Pipeline

# Create a Spark session
spark = SparkSession.builder.appName("LogisticRegressionBollywood").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when loading, fitting or evaluation raises.
try:
    # Load data into a Spark DataFrame
    data = spark.read.csv("C:/Users/sharm/OneDrive/Desktop/desktop-2/big_data_managment/Sarcasm_Headlines_Dataset.csvm_v2.csv", header=True, inferSchema=True)

    # Rows with a missing headline or label cannot be tokenized / used for
    # training, so drop them up front instead of failing mid-job.
    # NOTE(review): the text column is referenced as "headlines" throughout;
    # confirm this matches the CSV header.
    data = data.dropna(subset=["headlines", "is_sarcastic"])

    # Split BEFORE fitting any feature statistics. Fitting the vocabulary and
    # IDF weights on the full dataset would leak test-set information into the
    # features and inflate the reported accuracy.
    train_raw, test_raw = data.randomSplit([0.7, 0.3], seed=123)

    # Feature stages: tokenize -> remove stop words -> term counts -> IDF
    tokenizer = Tokenizer(inputCol="headlines", outputCol="words")
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    tf_vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Fit the feature stages on the training split only.
    feature_pipeline = Pipeline(stages=[tokenizer, stopwords_remover, tf_vectorizer, idf])
    feature_model = feature_pipeline.fit(train_raw)

    # Expose the fitted vectorizer / IDF models under their usual names.
    tf_model = feature_model.stages[2]
    idf_model = feature_model.stages[3]

    # Apply the training-fitted transformations to both splits and keep only
    # the columns the classifier needs.
    train_data = feature_model.transform(train_raw).select("is_sarcastic", "features")
    test_data = feature_model.transform(test_raw).select("is_sarcastic", "features")

    # Create and train a Logistic Regression classifier
    lr = LogisticRegression(labelCol="is_sarcastic", featuresCol="features")
    model = lr.fit(train_data)

    # Make predictions on the held-out test data
    predictions = model.transform(test_data)

    # Evaluate the model using a MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(
        labelCol="is_sarcastic", predictionCol="prediction", metricName="accuracy"
    )
    accuracy = evaluator.evaluate(predictions)

    # Print the accuracy of the model
    print(f"Accuracy: {accuracy}")
finally:
    # Stop the Spark session
    spark.stop()
