In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=ae9cb55f1bcd674f798737266e3588842b809bdcbd7fb38a63c3ce4b3da29d5a
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("DataIngesttion") \
    .getOrCreate()

csv_file_path = "/content/social media.csv"
df_csv = spark.read.format("csv").option("header", "true").load(csv_file_path)
df_csv.show()

+-------+-------+--------------+-----------+---------------+---------------------+------------+--------------------+----------+------------------+--------------+----------------+
|post_id|user_id|      username|   location|followers_count|profile_creation_date|post_content|           post_date|  platform|sentiment_score_id|sentiment_type|confidence_score|
+-------+-------+--------------+-----------+---------------+---------------------+------------+--------------------+----------+------------------+--------------+----------------+
|    101|      1|      john_doe|   New York|            USA|                  500|  2022-01-15|Loving the new pr...|2024-09-18|           Twitter|             1|        positive|
|    102|      2|    jane_smith|     London|             UK|                 1200|  2020-06-25|This service is t...|2024-09-19|          Facebook|             2|        negative|
|    103|      3|    mike_brown|    Toronto|         Canada|                  900|  2021-05-10|The update

In [None]:
# @title Data preprocessing


In [6]:
###Drop Rows with Missing Values

# Drop rows with any null values
df_clean = df_csv.dropna()

# Drop rows with missing values only in specific columns (e.g., 'post_content' or 'sentiment_type')
df_clean = df_csv.dropna(subset=['post_content', 'sentiment_type'])

# Show the DataFrame after dropping missing values
df_clean.show()


+-------+-------+--------------+-----------+---------------+---------------------+------------+--------------------+----------+------------------+--------------+----------------+
|post_id|user_id|      username|   location|followers_count|profile_creation_date|post_content|           post_date|  platform|sentiment_score_id|sentiment_type|confidence_score|
+-------+-------+--------------+-----------+---------------+---------------------+------------+--------------------+----------+------------------+--------------+----------------+
|    101|      1|      john_doe|   New York|            USA|                  500|  2022-01-15|Loving the new pr...|2024-09-18|           Twitter|             1|        positive|
|    102|      2|    jane_smith|     London|             UK|                 1200|  2020-06-25|This service is t...|2024-09-19|          Facebook|             2|        negative|
|    103|      3|    mike_brown|    Toronto|         Canada|                  900|  2021-05-10|The update

In [7]:
###Fill Missing Values

# Fill missing values for numeric columns (e.g., 'followers_count', 'confidence_score') with 0
df_filled = df_csv.fillna({
    'followers_count': 0,
    'confidence_score': 0
})

# Fill missing values for string columns (e.g., 'post_content', 'username') with 'Unknown' or empty string
df_filled = df_filled.fillna({
    'post_content': 'No content',
    'username': 'Unknown'
})

# Show the DataFrame after filling missing values
df_filled.show()


+-------+-------+--------------+-----------+---------------+---------------------+------------+--------------------+----------+------------------+--------------+----------------+
|post_id|user_id|      username|   location|followers_count|profile_creation_date|post_content|           post_date|  platform|sentiment_score_id|sentiment_type|confidence_score|
+-------+-------+--------------+-----------+---------------+---------------------+------------+--------------------+----------+------------------+--------------+----------------+
|    101|      1|      john_doe|   New York|            USA|                  500|  2022-01-15|Loving the new pr...|2024-09-18|           Twitter|             1|        positive|
|    102|      2|    jane_smith|     London|             UK|                 1200|  2020-06-25|This service is t...|2024-09-19|          Facebook|             2|        negative|
|    103|      3|    mike_brown|    Toronto|         Canada|                  900|  2021-05-10|The update

In [8]:
####Handling Missing Values Condition-based



# Drop rows where 'post_content' or 'sentiment_score_id' is null
df_clean = df_csv.dropna(subset=['post_content', 'sentiment_score_id'])

# Fill missing 'followers_count' with 0 and 'sentiment_type' with 'unknown'
df_filled = df_clean.fillna({
    'followers_count': 0,
    'sentiment_type': 'unknown'
})

# Show the final DataFrame
df_filled.show()


+-------+-------+--------------+-----------+---------------+---------------------+------------+--------------------+----------+------------------+--------------+----------------+
|post_id|user_id|      username|   location|followers_count|profile_creation_date|post_content|           post_date|  platform|sentiment_score_id|sentiment_type|confidence_score|
+-------+-------+--------------+-----------+---------------+---------------------+------------+--------------------+----------+------------------+--------------+----------------+
|    101|      1|      john_doe|   New York|            USA|                  500|  2022-01-15|Loving the new pr...|2024-09-18|           Twitter|             1|        positive|
|    102|      2|    jane_smith|     London|             UK|                 1200|  2020-06-25|This service is t...|2024-09-19|          Facebook|             2|        negative|
|    103|      3|    mike_brown|    Toronto|         Canada|                  900|  2021-05-10|The update

In [None]:
#2.Real-Time Sentiment Analysis: Use PySpark to process real-time posts and compute sentiment scores for each post.

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize Spark session
spark = SparkSession.builder.appName("RealTimeSentimentAnalysis").getOrCreate()

# Load CSV data
csv_file_path = "/content/social media.csv"
df_csv = spark.read.format("csv").option("header", "true").load(csv_file_path)

# Clean and handle missing values
df_clean = df_csv.dropna(subset=['post_content', 'sentiment_score_id'])
df_filled = df_clean.fillna({'followers_count': 0, 'sentiment_type': 'unknown'})

# Download and initialize VADER sentiment analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Define UDF to compute sentiment
def compute_sentiment(post_content):
    score = sid.polarity_scores(post_content)
    if score['compound'] >= 0.05:
        return 'positive'
    elif score['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Register UDF
sentiment_udf = udf(compute_sentiment, StringType())

# Apply UDF to compute sentiment and create a new 'sentiment' column
df_with_sentiment = df_filled.withColumn("sentiment", sentiment_udf(df_filled["post_content"]))

# Select relevant columns and show the final DataFrame
df_final = df_with_sentiment.select("post_id", "post_content", "platform", "sentiment")
df_final.show()


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


+-------+------------+----------+---------+
|post_id|post_content|  platform|sentiment|
+-------+------------+----------+---------+
|    101|  2022-01-15|2024-09-18|  neutral|
|    102|  2020-06-25|2024-09-19|  neutral|
|    103|  2021-05-10|2024-09-20|  neutral|
|    104|  2019-11-20|2024-09-18|  neutral|
|    105|  2023-03-05|2024-09-19|  neutral|
|    106|  2018-07-14|2024-09-18|  neutral|
|    107|  2021-09-30|2024-09-20|  neutral|
|    108|  2020-12-12|2024-09-19|  neutral|
|    109|  2022-06-01|2024-09-18|  neutral|
|    110|  2020-04-18|2024-09-20|  neutral|
+-------+------------+----------+---------+



In [None]:
 # 3. Real-Time Trend Analysis: Calculate and update sentiment trends in real-time.

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, to_date, count
from pyspark.sql.types import StringType
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Initialize Spark session
spark = SparkSession.builder.appName("RealTimeSentimentTrend").getOrCreate()

# Load CSV data

csv_file_path = "/content/social media.csv"
df_csv = spark.read.format("csv").option("header", "true").load(csv_file_path)

# Initialize VADER
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# UDF for sentiment analysis
def compute_sentiment(post_content):
    score = sid.polarity_scores(post_content)
    return 'positive' if score['compound'] >= 0.05 else 'negative' if score['compound'] <= -0.05 else 'neutral'

sentiment_udf = udf(compute_sentiment, StringType())

# Apply UDF and compute sentiment
df_with_sentiment = df_csv.withColumn("sentiment", sentiment_udf(df_csv["post_content"]))

# Add 'date' column based on 'post_date'
df_with_sentiment = df_with_sentiment.withColumn("date", to_date(df_with_sentiment["post_date"]))

# Group by date and sentiment type to calculate sentiment trends
df_trend = df_with_sentiment.groupBy("date", "sentiment").agg(count("post_id").alias("sentiment_count"))

# Show the trends
df_trend.show()


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


+----+---------+---------------+
|date|sentiment|sentiment_count|
+----+---------+---------------+
|NULL|  neutral|             10|
+----+---------+---------------+

