In [None]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook. The chain is wrapped in
# parentheses so that no line-continuation backslashes are needed.
spark_session = (
    SparkSession.builder
    # Spark master URL and the application name for this job.
    .master("spark://192.168.2.5:7077")
    .appName("Sentiment Analysis")
    # Hadoop settings: default filesystem URI and dfs.replication value.
    .config("spark.hadoop.fs.defaultFS", "hdfs://192.168.2.5:9000")
    .config("spark.hadoop.dfs.replication", "1")
    # Value passed for spark.cores.max.
    .config("spark.cores.max", 2)
    .getOrCreate()
)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from textblob import TextBlob

# Define a user-defined function to perform sentiment analysis on a text
def get_sentiment(text):
    """Classify the sentiment of ``text`` as positive, negative, or neutral.

    The label is derived from the sign of TextBlob's polarity score.

    Args:
        text: The text to analyse, or ``None`` when the source record has
            no text.

    Returns:
        ``'positive'`` if the polarity is above zero, ``'negative'`` if it is
        below zero, ``'neutral'`` if it is exactly zero, and ``None`` when
        ``text`` is ``None``.
    """
    # Missing values reach a Spark UDF as None. TextBlob only accepts strings,
    # so without this guard a single null row would fail the whole job.
    if text is None:
        return None

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

# Wrap the Python function as a Spark UDF that produces a string column
get_sentiment_udf = udf(get_sentiment, StringType())

# Load the Reddit comments dataset into a PySpark DataFrame.
# The session created earlier in this notebook is named `spark_session`; no
# `spark` variable is defined in this file, so read through `spark_session`.
# NOTE(review): the input path is still empty and must be set to the dataset
# location before this cell can run.
df = spark_session.read.json("")

# Add a new column to the DataFrame with the sentiment of each comment
df = df.withColumn("sentiment", get_sentiment_udf("body"))

# Group the comments by subreddit and sentiment, and count the number of comments
result = df.groupBy("subreddit", "sentiment").count()

# Show the top 10 subreddits by number of positive comments
result.filter(result.sentiment == "positive").orderBy(result["count"].desc()).show(10)