# Analiza sentymentu

## 1. Importy

In [1]:
import os
import json
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, concat_ws
from pyspark.sql.types import StringType, FloatType, ArrayType, DoubleType
import nltk
import google.generativeai as genai

## 2. Spark builder i NLTK Sentiment

In [3]:
# Build (or reuse) the Spark session, with HDFS on the namenode configured
# as the default filesystem.
spark = (
    SparkSession.builder
    .appName("RedditSentiment")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)

In [4]:
# Module-level analyzer shared by the sentiment UDFs defined in later cells.
# The analyzer needs the "vader_lexicon" NLTK resource; fetch it on demand so
# the notebook also works on a fresh environment (Restart & Run All).
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon")
    sia = SentimentIntensityAnalyzer()

## 3. Ściągnięcie danych z HDFS

In [6]:
# Load the JSON data stored under /data/reddit_data on HDFS into a Spark DataFrame.
df = spark.read.json("hdfs://namenode:9000/data/reddit_data")

## 4. Analiza sentymentu posta i komentarzy

In [7]:
# Join the post title and body into a single "combined_text" column, separated by ". ".
df_sentiment = df.withColumn("combined_text", concat_ws(". ", col("title"), col("selftext")))

def analyze_sentiment(text):
    """Return the compound polarity score of ``text`` as a float.

    The score comes from the module-level ``sia`` analyzer. Falsy input
    (``None`` or an empty string) is not scored and yields ``0.0``.
    """
    if not text:
        return 0.0
    compound = sia.polarity_scores(text)["compound"]
    return float(compound)

# Expose the scorer as a Spark UDF that returns doubles.
sentiment_udf = udf(analyze_sentiment, DoubleType())

# Score the combined title + body text of every post.
df_sentiment = df_sentiment.withColumn(
    "sentiment_score",
    sentiment_udf(col("combined_text")),
)

# Preview ten titles with their scores, without truncating long titles.
df_sentiment.select("title", "sentiment_score").show(10, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|title                                                                                                                                                                                                                                                                                    |sentiment_score|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|Echo Valley (2025) - New on AppleTV+ w/ Julianne Moore and Sydney Sweeney                          

In [8]:
def analyze_comment_list(comments):
    """Average the compound sentiment score over a post's comments.

    Args:
        comments: Expected to be a list of comment strings. Entries that are
            not strings are skipped.

    Returns:
        The mean compound score as a float, or ``None`` when ``comments`` is
        empty, is not a list, contains no strings, or scoring fails.
    """
    if not comments or not isinstance(comments, list):
        return None
    try:
        scores = [
            sia.polarity_scores(comment)["compound"]
            for comment in comments
            if isinstance(comment, str)
        ]
    except Exception:
        # Best-effort: a single unscorable row should not abort the whole job.
        # Unlike a bare ``except:``, this does not swallow KeyboardInterrupt
        # or SystemExit.
        return None
    if not scores:
        return None
    return float(sum(scores) / len(scores))

# Register the comment averager as a double-returning UDF and apply it to
# the "comments" column.
comment_sentiment_udf = udf(analyze_comment_list, DoubleType())
df_sentiment = df_sentiment.withColumn(
    "comment_sentiment_score",
    comment_sentiment_udf(col("comments")),
)

## 5. Przykładowe wyniki i zapis do Parquet na HDFS

In [9]:
# Collect the title and both score columns to the driver as a pandas DataFrame.
df_pd = (
    df_sentiment
    .select("title", "sentiment_score", "comment_sentiment_score")
    .toPandas()
)
print("Przykładowe dane:")
print(df_pd.head(10))

Przykładowe dane:
                                               title  sentiment_score  \
0  Echo Valley (2025) - New on AppleTV+ w/ Julian...           0.8546   
1  From the World of John Wick: Ballerina Movie R...           0.9714   
2                                August in the water           0.9992   
3  Mission Impossible 3 - the one that saved the ...           0.9805   
4            Sinners - A Film That Dares You To Feel           0.7869   
5  Percy Jackson Films Absolutely Destroys the Se...          -0.7792   
6                                Beavis And Butthead          -0.9022   
7             From the World of John Wick: Ballerina           0.9571   
8                                           Sleepers           0.9939   
9    The Accountant 2: Uninspiring, Wasted Potential           0.9698   

   comment_sentiment_score  
0                      NaN  
1                      NaN  
2                      NaN  
3                  0.32710  
4                      NaN  
5   

In [10]:
# Persist the scored posts as Parquet on HDFS, replacing any previous output at this path.
df_sentiment.write.mode("overwrite").parquet("hdfs:///output/reddit_sentiment.parquet")

## Extract titles with Gemini and merge with movies

In [2]:
# Read the Gemini API key from the environment; never commit a key to the notebook.
# NOTE(review): a literal key used to be hardcoded here -- treat it as
# compromised and revoke/rotate it.
API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before running this cell."
    )

genai.configure(api_key=API_KEY)

model = genai.GenerativeModel('gemini-1.5-flash')

# Few-shot prompt template for film-title extraction. The literal "{post}"
# placeholder is filled in with str.replace() by extract_film_title().
prompt = """
Your task is to perform NER task on posts with film discussion from Reddit.
You have a reddit post which is written in a Markdown format. Analyse it and extract what is a title of a film that was discussed.
As an output give only the title of a film without any explanation or additional data.
If there are many films that were mentioned, choose only one, which is the main theme of the post.
Since post is in a markdown format, do not pay attention to links and any special signs, focus only on the text.

For example:
Input: "Ask me, what is a fucking best film I've ever watched and I'll say its a forest gamp. This film is way better than new Anora film. but ok maybe anora is also a good film idk"
Output: "Forest Gamp"

Input: "There are a lot of science fiction films like Star Wars, Arrival or Alien, but there is one which I consider the best. Interstellar is a masterpiece, the best science fiction which everyone should watch"
Output: "Interstellar"

Input: "I don't know why people watch films like this????? There are a lot of good musicals like La La Land, Wicked or Grease. For me Barbie its just another hyping bullshit that will be forgotten like in a 2 weeks."
Output: "Barbie"

Post: {post}
"""

def extract_film_title(post_text: str) -> str:
    """Ask the Gemini model which film a Reddit post is mainly about.

    The post is substituted for the ``{post}`` placeholder in the module-level
    ``prompt`` and sent through the module-level ``model``.

    Args:
        post_text: Text of the post to analyse.

    Returns:
        The text of the first part of the first response candidate, with
        surrounding whitespace stripped.
    """
    request = prompt.replace("{post}", post_text)
    reply = model.generate_content([request])
    first_part = reply.candidates[0].content.parts[0]
    return first_part.text.strip()

# Test
# Smoke test: run one hand-written post through extract_film_title() and print the result.
reddit_post = """
So I was just watching the classic film *Gentlemen Prefer Blondes* last night and found it ultimately works because the friendship between the two leading ladies is so solid and believable...
"""

title = extract_film_title(reddit_post)
print("Detected film title:", title)


Detected film title: Gentlemen Prefer Blondes


In [3]:
from pyspark.sql import SparkSession

# Same session settings as at the top of the notebook.
spark = (
    SparkSession.builder
    .appName("RedditSentiment")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)

# Reload the scored posts written earlier and preview three rows.
reddit_df = spark.read.parquet("hdfs:///output/reddit_sentiment.parquet")
reddit_df.show(3)

+--------+-------------------+-------+------------+-----+--------------------+------------+--------------------+--------------------+---------------+-----------------------+
|comments|        created_utc|     id|num_comments|score|            selftext|   subreddit|               title|       combined_text|sentiment_score|comment_sentiment_score|
+--------+-------------------+-------+------------+-----+--------------------+------------+--------------------+--------------------+---------------+-----------------------+
|      []|2025-06-13T12:16:55|1laefpm|           1|    1|I really liked th...|movieReviews|Echo Valley (2025...|Echo Valley (2025...|         0.8546|                   null|
|      []|2025-06-13T09:38:24|1labqgl|           1|    1|'m going to be ho...|movieReviews|From the World of...|From the World of...|         0.9714|                   null|
|      []|2025-06-13T00:47:22|1la2vp6|           1|    1|Director- Ishii G...|movieReviews| August in the water|August in the wat.

In [5]:
# Show column names and types of the reloaded Parquet data.
reddit_df.printSchema()

root
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- created_utc: string (nullable = true)
 |-- id: string (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- score: long (nullable = true)
 |-- selftext: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- title: string (nullable = true)
 |-- combined_text: string (nullable = true)
 |-- sentiment_score: double (nullable = true)
 |-- comment_sentiment_score: double (nullable = true)



In [9]:
import time

# Free-tier quota reported by the API's 429 response: 15 requests per minute
# per model. Spacing calls by 60 / 15 = 4 s stays under it; the previous 1 s
# pause did not.
REQUESTS_PER_MINUTE = 15
MIN_INTERVAL_S = 60.0 / REQUESTS_PER_MINUTE
# The 429 response suggested a retry delay of 16 s; back off in multiples of it.
RETRY_BASE_DELAY_S = 16
MAX_ATTEMPTS = 3

df = reddit_df.select("id", "combined_text").toPandas()

# Call Gemini once per post, throttled to the quota. Rate-limit errors are
# retried with a growing delay instead of silently losing the title; any other
# error is logged and the title is left empty.
titles = []
for post_id, post_text in zip(df["id"], df["combined_text"]):
    title = ""
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            title = extract_film_title(post_text)
            break
        except Exception as e:
            print(f"Error for id={post_id}: {e}")
            # Quota errors are reported with a message starting with "429".
            rate_limited = str(e).startswith("429")
            if not rate_limited or attempt == MAX_ATTEMPTS:
                break
            time.sleep(RETRY_BASE_DELAY_S * attempt)
    titles.append(title)
    time.sleep(MIN_INTERVAL_S)

print("Wszystkie posty przeszły przez Gemini.")

Error for id=1l7wahs: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 16
}
]
Error for id=1l7vo1b: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "Gene

KeyboardInterrupt: 

In [11]:
# If fewer titles than rows were collected, pad with empty strings so the
# list lines up with the DataFrame.
missing = len(df) - len(titles)
titles.extend([""] * max(missing, 0))  # use [None] instead of [""] for nulls

# The lengths now match, so the column can be assigned safely.
df["film_title"] = titles

print(df.head())

        id                                      combined_text  \
0  1laefpm  Echo Valley (2025) - New on AppleTV+ w/ Julian...   
1  1labqgl  From the World of John Wick: Ballerina Movie R...   
2  1la2vp6  August in the water. Director- Ishii Gakuryu.\...   
3  1l9wxt2  Mission Impossible 3 - the one that saved the ...   
4  1l9mb1t  Sinners - A Film That Dares You To Feel. Sinne...   

             film_title  
0           Echo Valley  
1             Ballerina  
2   August in the water  
3  Mission Impossible 3  
4               Sinners  


In [13]:
# Convert the pandas frame (id, combined_text, film_title) back into a Spark DataFrame.
result_df = spark.createDataFrame(df)

# Persist it as Parquet on HDFS, replacing any previous output at this path.
(
    result_df.write
    .mode("overwrite")
    .parquet("hdfs:///output/reddit_with_titles.parquet")
)

print("Gotowe! Filmowe tytuły wyciągnięte i zapisane.")

Gotowe! Filmowe tytuły wyciągnięte i zapisane.


## Join Reddit and Movies

### Import all files

In [31]:
# Load the three Parquet inputs for the join: scored posts, extracted film
# titles, and the movie feature vectors.
reddit_df = spark.read.parquet("hdfs:///output/reddit_sentiment.parquet")
film_titles_df = spark.read.parquet("hdfs:///output/reddit_with_titles.parquet")
# Path fixed from "hdfs:////output/..." (stray fourth slash) to match the
# other HDFS paths used in this notebook.
movies_df = spark.read.parquet("hdfs:///output/movie_vectors.parquet")

### Join reddit sentiment with titles

In [32]:
# Keep only the rows for which a non-empty film title was extracted.
clean_titles_df = film_titles_df.filter(
    col("film_title").isNotNull() & (col("film_title") != "")
)

# Left join on the post id: every scored post is kept, with film_title
# filled in where one is available.
reddit_with_titles = reddit_df.join(
    clean_titles_df.select("id", "film_title"),
    on="id",
    how="left",
)

### Normalize titles to join

In [33]:
from pyspark.sql.functions import col, lower, trim

# Build trimmed, lower-cased join keys on both sides.
reddit_with_titles = reddit_with_titles.withColumn(
    "film_title_clean",
    lower(trim(col("film_title"))),
)
movies_df = movies_df.withColumn(
    "movie_title_clean",
    lower(trim(col("clean_title"))),
)

### Join both together

In [34]:
# List the columns available after adding the title columns.
print(reddit_with_titles.columns)

['id', 'comments', 'created_utc', 'num_comments', 'score', 'selftext', 'subreddit', 'title', 'combined_text', 'sentiment_score', 'comment_sentiment_score', 'film_title', 'film_title_clean']


In [35]:
# Keep only the join key and the Reddit metrics that go into the joined result.
reddit_subset = reddit_with_titles.select(
    "film_title_clean", "num_comments", "sentiment_score", "comment_sentiment_score"
)

In [36]:
# Right join: every movie row is kept; the Reddit columns are null where no
# post has a matching normalised title.
final_df = reddit_subset.join(
    movies_df,
    on=reddit_subset["film_title_clean"] == movies_df["movie_title_clean"],
    how="right",
)
final_df.show(5)

+----------------+------------+---------------+-----------------------+-------+--------------------+--------------------+--------------------+
|film_title_clean|num_comments|sentiment_score|comment_sentiment_score|movieId|         clean_title|      final_features|   movie_title_clean|
+----------------+------------+---------------+-----------------------+-------+--------------------+--------------------+--------------------+
|            null|        null|           null|                   null| 173921|    A Terrible Night|(154,[0,1,5,20,45...|    a terrible night|
|            null|        null|           null|                   null| 223750|Little Red Riding...|(154,[0,1,5,120],...|little red riding...|
|            null|        null|           null|                   null| 187327|   Avenue de l'opéra|(154,[0,1,5,56,11...|   avenue de l'opéra|
|            null|        null|           null|                   null| 267804|Miss Dundee and H...|(154,[0,1,5,107],...|miss dundee and h...|

In [37]:
# Count the posts whose normalised film title is null or empty.
empty_or_null_titles = reddit_subset.filter(
    col("film_title_clean").isNull() | (col("film_title_clean") == "")
)
empty_or_null_titles.count()

341

In [38]:
# Preview five of the posts that have no usable film title.
empty_or_null_titles.show(5)

+----------------+------------+---------------+-----------------------+
|film_title_clean|num_comments|sentiment_score|comment_sentiment_score|
+----------------+------------+---------------+-----------------------+
|            null|           1|          -0.97|                   null|
|            null|           1|        -0.9783|                   null|
|            null|           2|          0.721|                 0.9168|
|            null|           1|         0.9862|                   null|
|            null|           1|        -0.1984|                   null|
+----------------+------------+---------------+-----------------------+
only showing top 5 rows



### Save to parquet file

In [39]:
# Persist the joined result as Parquet on HDFS, replacing any previous output at this path.
final_df.write.mode("overwrite").parquet("hdfs:///output/reddit_with_movies_joined.parquet")

In [41]:
# List the columns of the joined result.
print(final_df.columns)

['film_title_clean', 'num_comments', 'sentiment_score', 'comment_sentiment_score', 'movieId', 'clean_title', 'final_features', 'movie_title_clean']
