In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=7c7f18a21795410a065b4461ef8f136f23592bf03c19cdddfcfc496592184201
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("DataIngesttion") \
    .getOrCreate()

csv_file_path = "/content/sample_data/post_fact.csv"
df_post = spark.read.format("csv").option("header", "true").load(csv_file_path)
df_post.show(10)

csv_file_path = "/content/sample_data/user_details.csv"
df_user = spark.read.format("csv").option("header", "true").load(csv_file_path)
df_user.show(10)



+-------+-------+--------------------+----------------+---------------+---------+
|post_id|user_id|           post_text|       post_date|sentiment_score| platform|
+-------+-------+--------------------+----------------+---------------+---------+
|      1|     44|Enjoyed a relaxin...|10-11-2024 05:43|            0.8|     NULL|
|      2|     30|The food was amaz...|06-12-2025 09:00|           -0.5|     NULL|
|      3|     16|Enjoyed a nice da...|10-08-2025 10:13|            0.5| Facebook|
|      4|      5|Had an okay day, ...|31-08-2025 16:08|            0.1| Facebook|
|      5|     34|The weather is pe...|15-10-2024 15:55|            0.3| Facebook|
|      6|     35|Excited about the...|13-07-2024 03:33|            0.2|     NULL|
|      7|     34|The weather is pe...|13-08-2024 14:47|           -0.2|     NULL|
|      8|     24|Enjoyed a relaxin...|15-09-2024 13:03|            0.4|Instagram|
|      9|     26|Excited about the...|29-03-2025 15:28|           -0.0|Instagram|
|     10|     34

# Data preprocessing

user field data is processed

In [6]:
from pyspark.sql.functions import when, trim, col

df_user = df_user.withColumn('user_name', when(trim(col('user_name')) == '', None).otherwise(trim(col('user_name'))))
df_user = df_user.withColumn('location', when(trim(col('location')) == '', None).otherwise(trim(col('location'))))

df_user = df_user.fillna({'user_name': 'Anonymous_user', 'location': 'Unknown'})


print("After changing the null value in user details")
df_user.show(10)


After changing the null value in user details
+-------+--------------+---------+-----------+
|user_id|     user_name| location|date_joined|
+-------+--------------+---------+-----------+
|      1|Anonymous_user|  Unknown| 15-01-2024|
|      2|Anonymous_user|Bangalore| 05-02-2024|
|      3|         Rahul|  Unknown| 10-03-2024|
|      4|         Sneha|  Chennai| 22-04-2024|
|      5|          Ravi|Hyderabad| 11-05-2024|
|      6|Anonymous_user|  Unknown| 01-06-2024|
|      7|         Sunil|  Unknown| 15-06-2024|
|      8|Anonymous_user|  Unknown| 05-07-2024|
|      9|Anonymous_user|  Unknown| 20-07-2024|
|     10|Anonymous_user|  Unknown| 10-08-2024|
+-------+--------------+---------+-----------+
only showing top 10 rows



Post field data is processed

In [5]:

df_post = df_post.withColumn('post_text', when(trim(col('post_text')) == '', None).otherwise(trim(col('post_text'))))
df_post = df_post.withColumn('platform', when(trim(col('platform')) == '', None).otherwise(trim(col('platform'))))

df_post = df_post.fillna({'post_text': 'No content', 'platform': 'Diff_source'})

print("After changing the null value in post details")
df_post.show(10)


After changing the null value in post details
+-------+-------+--------------------+----------------+---------------+-----------+
|post_id|user_id|           post_text|       post_date|sentiment_score|   platform|
+-------+-------+--------------------+----------------+---------------+-----------+
|      1|     44|Enjoyed a relaxin...|10-11-2024 05:43|            0.8|Diff_source|
|      2|     30|The food was amaz...|06-12-2025 09:00|           -0.5|Diff_source|
|      3|     16|Enjoyed a nice da...|10-08-2025 10:13|            0.5|   Facebook|
|      4|      5|Had an okay day, ...|31-08-2025 16:08|            0.1|   Facebook|
|      5|     34|The weather is pe...|15-10-2024 15:55|            0.3|   Facebook|
|      6|     35|Excited about the...|13-07-2024 03:33|            0.2|Diff_source|
|      7|     34|The weather is pe...|13-08-2024 14:47|           -0.2|Diff_source|
|      8|     24|Enjoyed a relaxin...|15-09-2024 13:03|            0.4|  Instagram|
|      9|     26|Excited about

droping the post text which has no content in it.

In [8]:
from pyspark.sql.functions import col


df_post = df_post.filter(col('post_text') != 'No content')

print("Data after permanently dropping rows where post_text is 'No content':")
df_post.show(15)


Data after permanently dropping rows where post_text is 'No content':
+-------+-------+--------------------+----------------+---------------+-----------+
|post_id|user_id|           post_text|       post_date|sentiment_score|   platform|
+-------+-------+--------------------+----------------+---------------+-----------+
|      1|     44|Enjoyed a relaxin...|10-11-2024 05:43|            0.8|Diff_source|
|      2|     30|The food was amaz...|06-12-2025 09:00|           -0.5|Diff_source|
|      3|     16|Enjoyed a nice da...|10-08-2025 10:13|            0.5|   Facebook|
|      4|      5|Had an okay day, ...|31-08-2025 16:08|            0.1|   Facebook|
|      5|     34|The weather is pe...|15-10-2024 15:55|            0.3|   Facebook|
|      6|     35|Excited about the...|13-07-2024 03:33|            0.2|Diff_source|
|      7|     34|The weather is pe...|13-08-2024 14:47|           -0.2|Diff_source|
|      8|     24|Enjoyed a relaxin...|15-09-2024 13:03|            0.4|  Instagram|
|     

Merging the Data

In [11]:

df_merged = df_user.join(df_post, on='user_id', how='inner')


print("Merged DataFrame:")
df_merged.show(5)

df_merged.write.csv('merged_data.csv', header=True, mode='overwrite')

print("Merged CSV file 'merged_data.csv' has been saved.")


Merged DataFrame:
+-------+--------------+---------+-----------+-------+--------------------+----------------+---------------+-----------+
|user_id|     user_name| location|date_joined|post_id|           post_text|       post_date|sentiment_score|   platform|
+-------+--------------+---------+-----------+-------+--------------------+----------------+---------------+-----------+
|     44|           Jay|  Chennai| 05-09-2025|      1|Enjoyed a relaxin...|10-11-2024 05:43|            0.8|Diff_source|
|     30|         Pooja|   Mumbai| 15-04-2025|      2|The food was amaz...|06-12-2025 09:00|           -0.5|Diff_source|
|     16|         Akash|  Unknown| 05-11-2024|      3|Enjoyed a nice da...|10-08-2025 10:13|            0.5|   Facebook|
|      5|          Ravi|Hyderabad| 11-05-2024|      4|Had an okay day, ...|31-08-2025 16:08|            0.1|   Facebook|
|     34|Anonymous_user|  Unknown| 25-05-2025|      5|The weather is pe...|15-10-2024 15:55|            0.3|   Facebook|
+-------+-----

In [10]:
df_merged.coalesce(1).write.csv('merged_data.csv', header=True, mode='overwrite')


Counting the Post by platform

In [12]:

post_count_by_platform = df_merged.groupBy('platform').count()
post_count_by_platform = post_count_by_platform.withColumnRenamed('count', 'post_count')
print("Number of posts by platform:")
post_count_by_platform.show()


Number of posts by platform:
+-----------+----------+
|   platform|post_count|
+-----------+----------+
|Diff_source|      2428|
|  Instagram|      2550|
|    Twitter|      2534|
|   Facebook|      2488|
+-----------+----------+



Filter Neutral  Sentiment Posts

In [15]:

neutral_posts = df_merged.filter(df_merged['sentiment_score'] == 0)
print("Neutral sentiment posts:")
neutral_posts.show()


Neutral sentiment posts:
+-------+--------------+---------+-----------+-------+--------------------+----------------+---------------+-----------+
|user_id|     user_name| location|date_joined|post_id|           post_text|       post_date|sentiment_score|   platform|
+-------+--------------+---------+-----------+-------+--------------------+----------------+---------------+-----------+
|     44|           Jay|  Chennai| 05-09-2025|      1|Enjoyed a relaxin...|10-11-2024 05:43|            0.8|Diff_source|
|     30|         Pooja|   Mumbai| 15-04-2025|      2|The food was amaz...|06-12-2025 09:00|           -0.5|Diff_source|
|     16|         Akash|  Unknown| 05-11-2024|      3|Enjoyed a nice da...|10-08-2025 10:13|            0.5|   Facebook|
|      5|          Ravi|Hyderabad| 11-05-2024|      4|Had an okay day, ...|31-08-2025 16:08|            0.1|   Facebook|
|     34|Anonymous_user|  Unknown| 25-05-2025|      5|The weather is pe...|15-10-2024 15:55|            0.3|   Facebook|
|     3

Negative post count by pplatform

In [14]:

negative_posts = df_merged.filter(df_merged['sentiment_score'] < 0)
negative_post_count_by_platform = negative_posts.groupBy('platform').count()

negative_post_count_by_platform = negative_post_count_by_platform.withColumnRenamed('count', 'negative_post_count')
print("Number of negative sentiment posts by platform:")
negative_post_count_by_platform.show()


Number of negative sentiment posts by platform:
+-----------+-------------------+
|   platform|negative_post_count|
+-----------+-------------------+
|Diff_source|                 70|
|  Instagram|                 58|
|    Twitter|                 86|
|   Facebook|                 67|
+-----------+-------------------+



#2.Real-Time Sentiment Analysis

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

spark = SparkSession.builder.appName("RealTimeSentimentAnalysis").getOrCreate()


csv_file_path = "/content/social media.csv"
df_csv = spark.read.format("csv").option("header", "true").load(csv_file_path)

# Clean and handle missing values
df_clean = df_csv.dropna(subset=['post_content', 'sentiment_score_id'])
df_filled = df_clean.fillna({'followers_count': 0, 'sentiment_type': 'unknown'})

# Download and initialize VADER sentiment analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Define UDF to compute sentiment
def compute_sentiment(post_content):
    score = sid.polarity_scores(post_content)
    if score['compound'] >= 0.05:
        return 'positive'
    elif score['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Register UDF
sentiment_udf = udf(compute_sentiment, StringType())

# Apply UDF to compute sentiment and create a new 'sentiment' column
df_with_sentiment = df_filled.withColumn("sentiment", sentiment_udf(df_filled["post_content"]))
df_final = df_with_sentiment.select("post_id", "post_content", "platform", "sentiment")
df_final.show()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


+-------+------------+----------+---------+
|post_id|post_content|  platform|sentiment|
+-------+------------+----------+---------+
|    101|  2022-01-15|2024-09-18|  neutral|
|    102|  2020-06-25|2024-09-19|  neutral|
|    103|  2021-05-10|2024-09-20|  neutral|
|    104|  2019-11-20|2024-09-18|  neutral|
|    105|  2023-03-05|2024-09-19|  neutral|
|    106|  2018-07-14|2024-09-18|  neutral|
|    107|  2021-09-30|2024-09-20|  neutral|
|    108|  2020-12-12|2024-09-19|  neutral|
|    109|  2022-06-01|2024-09-18|  neutral|
|    110|  2020-04-18|2024-09-20|  neutral|
+-------+------------+----------+---------+



# 3. Real-Time Trend Analysis

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, to_date, count
from pyspark.sql.types import StringType
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

spark = SparkSession.builder.appName("RealTimeSentimentTrend").getOrCreate()

csv_file_path = "/content/social media.csv"
df_csv = spark.read.format("csv").option("header", "true").load(csv_file_path)
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

def compute_sentiment(post_content):
    score = sid.polarity_scores(post_content)
    return 'positive' if score['compound'] >= 0.05 else 'negative' if score['compound'] <= -0.05 else 'neutral'

sentiment_udf = udf(compute_sentiment, StringType())

df_with_sentiment = df_csv.withColumn("sentiment", sentiment_udf(df_csv["post_content"]))

# Add 'date' column based on 'post_date'
df_with_sentiment = df_with_sentiment.withColumn("date", to_date(df_with_sentiment["post_date"]))

# Group by date and sentiment type to calculate sentiment trends
df_trend = df_with_sentiment.groupBy("date", "sentiment").agg(count("post_id").alias("sentiment_count"))

# Show the trends
df_trend.show()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


+----+---------+---------------+
|date|sentiment|sentiment_count|
+----+---------+---------------+
|NULL|  neutral|             10|
+----+---------+---------------+

