In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
     |████████████████████████████████| 281.4 MB 40 kB/s 
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
     |████████████████████████████████| 199 kB 44.9 MB/s 
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=b82210630f6dbbc5a0ab785c38449522be6fe64b37ad4cbe801aa8a044db78b2
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [None]:
# Import necessary libraries
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practice').getOrCreate()

# Fixed seed for the train/test split so the partition -- and therefore the
# reported accuracy and the saved model -- is the same on every run.
SPLIT_SEED = 42

# Load and pre-process the training dataset: keep the headline text and its
# sarcasm flag, and add the flag as an integer "label" column.
training = spark.read.json("/content/drive/MyDrive/Sarcasm_dataset/Sarcasm.json")
training = training.select(["headline", "is_sarcastic"])
training = training.withColumn("label", training["is_sarcastic"].cast("int"))

# Split the dataset into training (80%) and test (20%) sets.
train, test = training.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Define the pipeline for feature extraction and model training:
# headline -> tokens ("words") -> hashed term frequencies ("features") -> classifier.
tokenizer = Tokenizer(inputCol="headline", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Train the model using the training dataset
model = pipeline.fit(train)

# Evaluate the model using the test dataset: accuracy is the fraction of test
# rows whose predicted class equals the label.
results = model.transform(test)
total = results.count()
correct = results.filter(results.label == results.prediction).count()
# Guard against an empty test split instead of failing with ZeroDivisionError.
accuracy = correct / total if total else float("nan")
print("Accuracy:", accuracy)

# Save the fitted pipeline for future use, replacing any earlier copy at this path.
model.write().overwrite().save("/content/drive/MyDrive/Sarcasm_dataset/sarcasm_model")


Accuracy: 0.811487758945386


In [None]:
# Import necessary libraries
from pyspark.ml import Pipeline, PipelineModel

# Restore the fitted pipeline from the location the training cell saved it to.
saved_model_dir = "/content/drive/MyDrive/Sarcasm_dataset/sarcasm_model"
model = PipelineModel.load(saved_model_dir)

# Score the test split with the reloaded model and preview the first rows.
predictions = model.transform(test)
predictions.show()

+--------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|            headline|is_sarcastic|label|               words|            features|       rawPrediction|         probability|prediction|
+--------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|"how do we allow ...|           0|    0|["how, do, we, al...|(262144,[4629,153...|[2.68697919679032...|[0.93625392996719...|       0.0|
|#badpicturemonday...|           0|    0|[#badpicturemonda...|(262144,[303,3888...|[6.61101429916212...|[0.99865634109710...|       0.0|
|#emojisinthewild ...|           0|    0|[#emojisinthewild...|(262144,[36998,75...|[2.50322494386495...|[0.92436759151578...|       0.0|
|#metoo and "legit...|           0|    0|[#metoo, and, "le...|(262144,[60775,18...|[4.74087786145573...|[0.99134459194827...|       0.0|
|#talktome: lucas ...|           0|    0|

In [None]:
# Classify one user-supplied sentence with the fitted pipeline.
sentence = input("Type a sentence: ")

# Wrap the sentence in a one-row DataFrame; "headline" is the column the
# pipeline's tokenizer was configured to read.
df = spark.createDataFrame([(1,sentence)], ['Id','headline'])

# There is exactly one row, so read its prediction directly instead of
# grouping and averaging (which triggers a shuffle just to fetch one value).
result = model.transform(df).select("prediction").first()["prediction"]
if result == 1:
  print("Sarcastic sentence!")
else:
  print("Normal sentence!")

Type a sentence: this alarm clock is great, it never rings.
Sarcastic sentence!


the cows went on strike because of hike in milk prices.


this alarm clock is great, it never rings.

