### SCRAPING & CASE FOLDING YT COMMENTS

#### Package installation

In [1]:
!pip install google-api-python-client pandas



#### Library

In [2]:
import getpass
import os
import re

import pandas as pd
from google.colab import drive
from googleapiclient.discovery import build

#### Configuration

In [3]:
# SECURITY: never store an API key in the notebook itself. The key that used to
# be hardcoded on this line has been exposed and should be revoked.
# The key is taken from the YOUTUBE_API_KEY environment variable; when that is
# unset or empty the user is prompted for it (the input is not echoed).
API_KEY = os.environ.get("YOUTUBE_API_KEY") or getpass.getpass("YouTube Data API key: ")
VIDEO_ID = "tU9aRRxF8Jk"  # ID of the video whose comments are scraped
COMMENT_LIMIT = 1200  # upper bound on the number of comments collected
OUTPUT_FILE = "YT_comments_histID.csv"  # file name of the CSV written in the save step

#### Comments scraping

In [7]:
# Collect the top-level comment of each comment thread of VIDEO_ID, page by
# page, until COMMENT_LIMIT comments are gathered, no further page exists, or a
# request fails. Replies are not collected: only "topLevelComment" is read.
comments = []

youtube = build("youtube", "v3", developerKey=API_KEY)
request = youtube.commentThreads().list(
    part="snippet",
    videoId=VIDEO_ID,
    textFormat="plainText",
    maxResults=100
)

print(f"Start scraping Comments for Video ID: {VIDEO_ID}")
# COMMENT_LIMIT is the point at which the loop stops, i.e. a maximum.
print(f"Maximum collection target: {COMMENT_LIMIT} comments.")

while request and len(comments) < COMMENT_LIMIT:
    try:
        response = request.execute()
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        print(f"Error retrieving data: {e}. Crawling stopped.")
        break

    for item in response.get("items", []):
        snippet = item["snippet"]["topLevelComment"]["snippet"]
        comment_data = {
            "author": snippet["authorDisplayName"],
            "comment_original": snippet["textDisplay"],
            "published_at": snippet["publishedAt"],
            "like_count": snippet["likeCount"],
        }
        comments.append(comment_data)

        # Stop in the middle of a page once the limit is reached.
        if len(comments) >= COMMENT_LIMIT:
            break

    # list_next builds the request for the following page from the current
    # request/response pair; the loop ends when it yields a falsy value.
    request = youtube.commentThreads().list_next(request, response)

print(f"✅ Scraping completed. Total {len(comments)} comments retrieved.")
# Passing the columns explicitly keeps them present even when no comment was
# retrieved, so later cells that read "comment_original" do not raise KeyError.
df_comments = pd.DataFrame(
    comments,
    columns=["author", "comment_original", "published_at", "like_count"],
)
df_comments.head()


Start scraping Comments for Video ID: tU9aRRxF8Jk
Minimum collection target: 1200 comments.
✅ Scraping completed. Total 997 comments retrieved.


Unnamed: 0,author,comment_original,published_at,like_count
0,@KokBisa,Pemerintah kita tahun ini ngumumin adanya penu...,2025-06-28T13:53:29Z,2002
1,@b4you71,Seandainya petrus ada lagi tapi kali ini pada ...,2025-10-10T06:58:32Z,0
2,@FADACT,"Keren kontennya seru, dan bermanfaat",2025-10-09T12:07:41Z,0
3,@IwanSetiawan-c9h,Menolak lupa tragedi semanggi dan tragedi tris...,2025-10-05T14:02:29Z,0
4,@AryaKussasi,suara lu nggak kedengeran bang,2025-10-02T13:41:44Z,0


#### Case Folding and Cleaning Text

In [8]:
# A letter followed by at least two more copies of itself, i.e. a run of three
# or more identical letters. Doubled letters ("nggak", "maaf") are not matched.
_REPEATED_LETTER_RUN = re.compile(r'([a-zA-Z])\1{2,}')


def normalize_repeated_chars(text):
    """Collapse every run of three or more identical letters into one letter.

    The comparison is case-sensitive and only ASCII letters are affected;
    digits and other characters are left as they are.

    Args:
        text: String to normalize.

    Returns:
        The string with each matching run replaced by a single letter.
    """
    collapsed = _REPEATED_LETTER_RUN.sub(r'\1', text)
    return collapsed

def clean_text_pipeline(text):
    """Clean one raw comment.

    Steps, in order: remove URLs, case-fold, replace every character other
    than a-z, 0-9 and whitespace with a space, collapse repeated letters with
    normalize_repeated_chars, then collapse whitespace and trim.

    Args:
        text: Raw comment. Values that are not strings are converted with
            str(); None and float NaN are treated as an empty comment.

    Returns:
        The cleaned string, containing only a-z, 0-9 and single spaces
        (possibly empty).
    """
    # Missing values would otherwise become the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Ensure that the input is a string
    text = str(text)

    # 1. Remove URLs
    # RegEx pattern for URLs (http://, https://, www.). The dot after "www" is
    # escaped and anchored on a word boundary so that ordinary words such as
    # "awww" are not mistaken for a URL; matching ignores case because this
    # step runs before case folding. A space is substituted so the words
    # around the URL never get glued together.
    text = re.sub(r'https?://\S+|\bwww\.\S+', ' ', text, flags=re.IGNORECASE)

    # 2. Case Folding
    text = text.casefold()

    # 3. Remove special symbols: everything except a-z, 0-9 and whitespace
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # 4. Normalize repeated letters
    text = normalize_repeated_chars(text)

    # Remove double spaces and spaces at the beginning/end
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Run every original comment through the cleaning pipeline and store the result
# in a new column next to the raw text, then preview the first rows.
cleaned_comments = df_comments["comment_original"].apply(clean_text_pipeline)
df_comments["comment_clean"] = cleaned_comments
df_comments.head()


Unnamed: 0,author,comment_original,published_at,like_count,comment_clean
0,@KokBisa,Pemerintah kita tahun ini ngumumin adanya penu...,2025-06-28T13:53:29Z,2002,pemerintah kita tahun ini ngumumin adanya penu...
1,@b4you71,Seandainya petrus ada lagi tapi kali ini pada ...,2025-10-10T06:58:32Z,0,seandainya petrus ada lagi tapi kali ini pada ...
2,@FADACT,"Keren kontennya seru, dan bermanfaat",2025-10-09T12:07:41Z,0,keren kontennya seru dan bermanfaat
3,@IwanSetiawan-c9h,Menolak lupa tragedi semanggi dan tragedi tris...,2025-10-05T14:02:29Z,0,menolak lupa tragedi semanggi dan tragedi tris...
4,@AryaKussasi,suara lu nggak kedengeran bang,2025-10-02T13:41:44Z,0,suara lu nggak kedengeran bang


#### Save File

In [9]:
# Mount Google Drive and write the comments (original and cleaned) as CSV.
drive.mount('/content/drive', force_remount=True)
drive_path = f'/content/drive/MyDrive/PemrosesanTeksTeori/{OUTPUT_FILE}'
# to_csv does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(drive_path), exist_ok=True)
df_comments.to_csv(drive_path, index=False, encoding="utf-8")

Mounted at /content/drive
