# Ściąganie danych z Reddita i zapis do HDFS

## 1. Importy

In [2]:
import praw
import json
import os
import time
import subprocess
from datetime import datetime
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

## 2. Konfiguracja PRAW (Reddit API)

In [3]:
# API credentials are read from the environment so that they are never stored
# in the saved notebook. Export REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# running this cell; a missing variable raises KeyError right here.
# NOTE(review): the key pair that used to be hard-coded in this cell should be
# treated as leaked and regenerated in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="reddit_data_scraper by /u/Purple_Talk_3536"
)

## 3. Funkcja pobierająca dane z Reddita

In [4]:
def fetch_reddit_posts(subreddit_name="movieReviews", pages=3, limit_per_page=50,
                       reddit_client=None, pause_seconds=1):
    """Collect the newest submissions of a subreddit together with their comments.

    The listing is walked page by page: each request asks for up to
    ``limit_per_page`` submissions that come after the last submission seen so far.

    Args:
        subreddit_name: Name of the subreddit to read.
        pages: Maximum number of listing requests to issue.
        limit_per_page: Maximum number of submissions requested per page.
        reddit_client: Object exposing ``subreddit(name)``; when ``None`` the
            notebook-level ``reddit`` client is used.
        pause_seconds: Delay between two consecutive page requests.

    Returns:
        A list of dicts, one per unique submission, with the keys ``id``,
        ``title``, ``selftext``, ``created_utc`` (ISO 8601 string without a UTC
        offset), ``num_comments``, ``score``, ``comments`` (list of comment
        bodies) and ``subreddit``.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    client = reddit if reddit_client is None else reddit_client
    subreddit = client.subreddit(subreddit_name)
    all_posts = []
    seen_ids = set()

    after = None
    for page in range(pages):
        listing = subreddit.new(limit=limit_per_page, params={"after": after})

        page_cursor = None
        for submission in listing:
            # Advance the cursor for every submission, including duplicates,
            # so that a page of already-seen posts cannot stall pagination.
            page_cursor = submission.name
            if submission.id in seen_ids:
                continue

            # PRAW: drop the unexpanded "load more comments" placeholders so
            # that list() yields only real comments.
            submission.comments.replace_more(limit=0)
            comments = [
                comment.body
                for comment in submission.comments.list()
                # Skip comments without an author and AutoModerator's messages.
                if comment.author and comment.author.name != "AutoModerator"
            ]

            # Same naive-UTC result as the deprecated datetime.utcfromtimestamp().
            created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

            all_posts.append({
                "id": submission.id,
                "title": submission.title,
                "selftext": submission.selftext,
                "created_utc": created.replace(tzinfo=None).isoformat(),
                "num_comments": submission.num_comments,
                "score": submission.score,
                "comments": comments,
                "subreddit": subreddit_name
            })
            seen_ids.add(submission.id)

        if page_cursor is None:
            # Empty page: the listing is exhausted, further requests are pointless.
            break
        after = page_cursor

        # Throttle between requests only; no need to wait after the final page.
        if pause_seconds > 0 and page < pages - 1:
            time.sleep(pause_seconds)

    return all_posts

## 4. Pobierz posty i zapisz do pliku JSON

In [5]:
# Fetch up to 10 pages of 50 posts each and store them locally as indented JSON.
posts = fetch_reddit_posts(pages=10, limit_per_page=50)

with open("reddit_data.json", mode="w", encoding="utf-8") as json_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(posts, fp=json_file, indent=2, ensure_ascii=False)

print("Dane zapisane do reddit_data.json")

Dane zapisane do reddit_data.json


## 5. Zapisz plik JSON do HDFS

In [11]:
# Workflow note: copy the file into the Docker container, then from the
# container into HDFS using hdfs put.
# NOTE(review): with fs.defaultFS set to HDFS below, the relative path
# "reddit_data.json" is presumably resolved against HDFS rather than the local
# working directory - confirm the file is already in HDFS before running.
spark = (
    SparkSession.builder
    .appName("Copy JSON to HDFS")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000/")
    .getOrCreate()
)

# multiline=true: the input is a single indented JSON document, not JSON Lines.
df = (
    spark.read
    .option("multiline", "true")
    .json("reddit_data.json")
)
df.printSchema()

# Replace any previous export at the target HDFS location.
(
    df.write
    .mode("overwrite")
    .json("hdfs://namenode:9000/data/reddit_data")
)

root
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- created_utc: string (nullable = true)
 |-- id: string (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- score: long (nullable = true)
 |-- selftext: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- title: string (nullable = true)

