<a href="https://colab.research.google.com/github/tmate2/NLP_YouTube_comments/blob/master/NLP_YouTube_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Google API telepítése és a kliens inicializálása:

In [None]:
! pip install google-api-python-client

In [10]:
import logging
import os

import pandas as pd
from googleapiclient.discovery import build


# Prefer a key supplied through the YOUTUBE_API_KEY environment variable so the
# real credential never has to be typed into (and committed with) the notebook.
# The old placeholder stays as the fallback for backward compatibility.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "api-key")

# Video whose comments are downloaded and analysed below.
VIDEO_ID = "dQw4w9WgXcQ"

# Client for the YouTube Data API v3, shared by the download step.
youtube = build("youtube", "v3", developerKey=API_KEY)

## YouTube kommentek letöltése

In [11]:
def get_comments(video_id, max_comments=500, client=None):
    """Download up to ``max_comments`` top-level comments of a YouTube video.

    Args:
        video_id: ID of the video whose comment threads are listed.
        max_comments: Upper bound on the number of comments returned.
        client: Object exposing ``commentThreads().list(...).execute()``.
            Defaults to the notebook-level ``youtube`` client.

    Returns:
        A list with the ``textDisplay`` value of each top-level comment, in the
        order the API returned them; at most ``max_comments`` entries.
    """
    if client is None:
        # Fall back to the client created in the setup cell.
        client = youtube

    comments = []
    next_page_token = None

    while len(comments) < max_comments:
        # Ask only for what is still missing (the request cap used here is 100)
        # so the last page does not fetch comments that would be thrown away.
        remaining = max_comments - len(comments)
        response = client.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=min(100, remaining),
            pageToken=next_page_token,
        ).execute()

        # A response without "items" is treated as an empty page, not an error.
        for item in response.get("items", []):
            comments.append(item["snippet"]["topLevelComment"]["snippet"]["textDisplay"])

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            # No further pages: the video has fewer comments than requested.
            break

    return comments[:max_comments]


# Download the first 200 comments straight into a one-column frame.
df_comments = pd.DataFrame({"comment": get_comments(VIDEO_ID, max_comments=200)})

## Szövegtisztítás

In [278]:
from bs4 import BeautifulSoup

#### 1. HTML tagek eltávolítása a BeautifulSoup segítségével:

In [279]:
def remove_html_tags(text):
    """Return the plain text of an HTML fragment, keeping line breaks as whitespace.

    Args:
        text: HTML-formatted comment text.

    Returns:
        The text content with all tags removed; every ``<br>`` becomes a newline.
    """
    soup = BeautifulSoup(text, "html.parser")
    # get_text() drops <br> without leaving anything behind, which would glue the
    # last word of one line to the first word of the next ("line1<br>line2" ->
    # "line1line2"). Swap each <br> for a newline first.
    for line_break in soup.find_all("br"):
        line_break.replace_with("\n")
    return soup.get_text()


# Element-wise clean-up: strip HTML markup from every comment.
df_comments["comment"] = df_comments["comment"].map(remove_html_tags)

#### 2. Emojik eltávolítása:

In [None]:
! pip install emoji

In [280]:
import emoji


def remove_emojis(text):
    """Return ``text`` with every emoji deleted (nothing is put in its place)."""
    emoji_free = emoji.replace_emoji(text, replace="")
    return emoji_free


# Element-wise clean-up: delete emojis from every comment.
df_comments["comment"] = df_comments["comment"].map(remove_emojis)

#### 3. Extra whitespace karakterek eltávolítása:

In [281]:
def remove_extra_whitespace(text):
    """Collapse every run of whitespace into a single space and trim both ends."""
    # split() without a separator already ignores leading and trailing whitespace,
    # so the words can be re-joined directly.
    words = text.split()
    return " ".join(words)


# Element-wise clean-up: normalise whitespace in every comment.
df_comments["comment"] = df_comments["comment"].map(remove_extra_whitespace)

#### 4. Linkek eltávolítása:

In [282]:
def remove_links(text):
    """Drop every space-separated word that is a web link.

    A word counts as a link when it starts (case-insensitively) with
    ``http://``, ``https://`` or ``www.``. Ordinary words that merely begin
    with "http" (e.g. "http", "https") are kept.

    Args:
        text: Comment text whose words are separated by single spaces.

    Returns:
        The text with link words removed and the remaining words re-joined
        by single spaces.
    """
    link_prefixes = ("http://", "https://", "www.")
    kept_words = [
        word
        for word in text.split(" ")
        # lower() so that "HTTPS://..." and "WWW...." are recognised too.
        if not word.lower().startswith(link_prefixes)
    ]
    return " ".join(kept_words)


# Element-wise clean-up: remove link words from every comment.
df_comments["comment"] = df_comments["comment"].map(remove_links)

In [283]:
def remove_special_chars(text):
    """Delete a fixed set of punctuation and accent symbols from ``text``."""
    denied_chars = "\"'<>#&@{}*:_÷~ˇ^§°/=`˙´˝¨¸|\\"
    # A translation table mapping each denied character to None removes it.
    deletion_table = str.maketrans("", "", denied_chars)
    return text.translate(deletion_table)


# Element-wise clean-up: strip the denied special characters from every comment.
df_comments["comment"] = df_comments["comment"].map(remove_special_chars)

#### 5. Tisztítás során keletkezett üres sorok törlése:

In [284]:
import numpy as np


def remove_empty_lines(df, column):
    """Drop the rows whose ``column`` value is missing, empty or only whitespace.

    Args:
        df: Frame to filter; it is not modified.
        column: Name of the column that must contain non-blank text.

    Returns:
        A new, independent DataFrame holding only the rows with content in
        ``column``. Values in all other columns are left exactly as they were.
    """
    values = df[column]
    # astype(str) would turn NaN/None into the non-empty strings "nan"/"None",
    # so missing values are excluded explicitly with notna().
    has_content = values.notna() & (values.astype(str).str.strip() != "")
    # copy() returns a standalone frame, so adding columns to the result later
    # does not raise SettingWithCopyWarning.
    return df.loc[has_content].copy()


# Discard comments that the cleaning steps above reduced to nothing.
df_comments = remove_empty_lines(df_comments, "comment")

In [None]:
# Number of comments left after cleaning.
len(df_comments)

## Kommentek fordítása

In [None]:
! pip install transformers

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer


# Checkpoint used both for counting tokens and for translating.
model_name = "Helsinki-NLP/opus-mt-mul-en"
# Token budget shared by the length pre-filter and the translator's max_length.
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the tokenizer loaded above instead of letting the pipeline load a second
# copy; this also guarantees the pre-filter counts tokens exactly as the
# translator does.
translator = pipeline("translation", model=model_name, tokenizer=tokenizer)

#### Fordítás előszűrése:

In [287]:
def filter_comments_by_length(df, column, tokenizer, max_length):
    """Keep only the rows whose text tokenizes to at most ``max_length`` tokens.

    Args:
        df: Frame holding the texts; it is not modified.
        column: Name of the column with the text to measure.
        tokenizer: Callable that returns a mapping with an ``"input_ids"``
            sequence for a given text.
        max_length: Largest number of tokens a kept text may have.

    Returns:
        A new DataFrame without the rows whose text exceeds ``max_length``.
    """
    fits = [
        len(tokenizer(comment)["input_ids"]) <= max_length
        for comment in df[column]
    ]
    # Select rows with a mask aligned to df's own index rather than dropping by
    # label: with repeated index labels, a label-based drop would also remove
    # rows that are within the limit.
    keep = pd.Series(fits, index=df.index, dtype=bool)
    return df.loc[keep].copy()


# Drop comments that tokenize to more than MAX_LENGTH tokens before translating.
df_comments = filter_comments_by_length(df_comments, "comment", tokenizer, MAX_LENGTH)

#### A fordító:

In [248]:
def translate_to_english(text):
    """Translate ``text`` to English with the notebook-level ``translator``.

    Translation is best-effort: a failure never raises, it is logged and
    reported as an empty result.

    Args:
        text: The comment to translate.

    Returns:
        The translated string, or ``""`` when ``text`` is not a non-blank
        string or the translation fails.
    """
    # Nothing to translate: skip the model call entirely.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        translation = translator(text, max_length=MAX_LENGTH)
        return translation[0]["translation_text"]
    except Exception as exc:
        # Keep the best-effort contract (return ""), but leave a trace so that
        # failures are not silently indistinguishable from empty comments.
        logging.getLogger(__name__).warning(
            "Translation failed for %r: %r", text[:50], exc
        )
        return ""

##### A fordító pipeline tesztelése:

In [None]:
# Sample sentences in several languages for a quick manual check of the translator.
texts = [
    "Hi, how are you?",
    "My name is John Doe",
    "Szia, hogy vagy?",
    "Hola, ¿cómo estás?",       # Spanish
    "Bonjour, comment ça va?",  # French
    "Hallo, wie geht es dir?",  # German
]


# Print each original sentence next to what translate_to_english returns for it.
for text in texts:
    english_translation = translate_to_english(text)
    print(f"Original: {text}")
    print(f"Translated: {english_translation}\n")

### Fordítás végrehajtása:

In [None]:
from tqdm import tqdm

# Register tqdm's progress_* methods (used below) on pandas objects.
tqdm.pandas()

# Translate every comment, showing a progress bar while the model runs.
df_comments["translated_comment"] = df_comments["comment"].progress_apply(translate_to_english)

#### Üres sorok törlése:

In [291]:
# translate_to_english returns "" when a translation fails; drop those rows.
df_comments = remove_empty_lines(df_comments, "translated_comment")

## Sentiment analízis

In [None]:
# Pin the checkpoint instead of relying on the task's implicit default: the
# counting cell below depends on this model's "POSITIVE"/"NEGATIVE" labels, and
# an unpinned default may change between library releases.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [306]:
# Label every translated comment. truncation=True clips texts that exceed the
# classifier's maximum input length instead of letting one long comment abort
# the whole run.
df_comments["sentiment"] = df_comments["translated_comment"].apply(
    lambda x: sentiment_analysis(x, truncation=True)[0]['label']
)

In [None]:
# Tally the labels assigned by the sentiment model.
count_positive = (df_comments['sentiment'] == 'POSITIVE').sum()
count_negative = (df_comments['sentiment'] == 'NEGATIVE').sum()

# Deliberately not named `all`: that would shadow the built-in all() for every
# cell executed afterwards in this kernel.
total_comments = len(df_comments)

print(f"Positive count: {count_positive}/{total_comments}")
print(f"Negative count: {count_negative}/{total_comments}")