In [4]:
import os
import json
from collections import defaultdict
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import emoji
import nltk
from num2words import num2words
from langdetect import detect, lang_detect_exception
from tqdm.notebook import tqdm
from transformers import pipeline

# Download the NLTK resources used by the stop-word filter and the tokenizer
# into a project-local directory, then register that directory on NLTK's
# search path so the corpus readers can find them.
NLTK_DATA_DIR = "../models/nltk_data"
for resource in ("stopwords", "punkt_tab"):
    nltk.download(resource, quiet=True, raise_on_error=True, download_dir=NLTK_DATA_DIR)
nltk.data.path.append(NLTK_DATA_DIR)

In [5]:
def save_json(data, file):
    """Serialize ``data`` as JSON and write it to the path ``file``.

    The file is opened in write mode (an existing file is overwritten),
    encoded as UTF-8, indented by four spaces, and non-ASCII characters are
    written verbatim instead of as ``\\uXXXX`` escapes.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)
        
def load_json(file):
    """Read the UTF-8 encoded JSON document at path ``file`` and return the parsed value."""
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def get_translation_model(from_lang, to_lang):
    """Build a ``transformers`` translation pipeline for ``from_lang`` -> ``to_lang``.

    A copy previously written under
    ``../models/translation/Helsinki-NLP/opus-mt-{from_lang}-{to_lang}``
    (relative to the current working directory) is preferred; otherwise the
    model is requested by its ``Helsinki-NLP/opus-mt-*`` name.

    Args:
        from_lang: Source language code used in the model name (e.g. ``"pl"``).
        to_lang: Target language code used in the model name (e.g. ``"en"``).

    Returns:
        The object returned by ``pipeline("translation", model=...)``.
    """
    models_dir = "../models"
    model_name = f"Helsinki-NLP/opus-mt-{from_lang}-{to_lang}"
    local_model_dir = f"{models_dir}/translation/{model_name}"

    # Creates ../models and ../models/translation in one call if missing.
    os.makedirs(f"{models_dir}/translation", exist_ok=True)

    # The saved copy lives two directory levels below translation/, so it has
    # to be checked by its full path: os.listdir() only yields single path
    # components ("Helsinki-NLP") and can never contain a name with a slash.
    if os.path.isdir(local_model_dir):
        try:
            return pipeline("translation", model=local_model_dir)
        except Exception as e:
            # An incomplete local copy (e.g. an interrupted save) must not
            # block translation; report it and fall back to the model name.
            print(f"Could not load local model {local_model_dir}: {e}")
    return pipeline("translation", model=model_name)


def save_translation_model(model, from_lang, to_lang):
    """Persist ``model`` under the local translation-model directory.

    Ensures ``../models`` and ``../models/translation`` exist (relative to the
    current working directory) and then calls ``model.save_pretrained`` with
    ``../models/translation/Helsinki-NLP/opus-mt-{from_lang}-{to_lang}``.
    """
    models_dir = "../models"
    translation_dir = f"{models_dir}/translation"
    for directory in (models_dir, translation_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)
    target = f"{translation_dir}/Helsinki-NLP/opus-mt-{from_lang}-{to_lang}"
    model.save_pretrained(target)

def clean_text(text):
    """Delete HTML tags, e-mail addresses, URLs and bare domain names from ``text``.

    Each match is replaced by the empty string; the whitespace surrounding a
    removed span is left as it was.
    """
    # Verbose-mode regex: whitespace and "#" comments inside it are ignored.
    pattern = r"""
        <.*?> |                               # HTML tags
        \b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b |  # Emails
        (https?:\/\/|www\.)\S+ |              # URLs starting with http, https, or www
        \b\S+\.(com|org|net|ly|co|ly|pl|uk)\b    # Specific domains
    """
    noise = re.compile(pattern, flags=re.VERBOSE)
    return noise.sub("", text)


def text_lowercase(text: str):
    """Return ``text`` converted to lower case with ``str.lower``."""
    lowered = text.lower()
    return lowered


def demojize(text: str) -> str:
    """Return ``text`` with emoji replaced by the empty string.

    Delegates to ``emoji.replace_emoji``; surrounding whitespace is not
    adjusted, so removing an emoji between two spaces leaves both spaces.
    """
    without_emoji = emoji.replace_emoji(text, "")
    return without_emoji


def convert_number(text: str) -> str:
    """Spell out every whitespace-delimited all-digit token in ``text``.

    Tokens for which ``str.isdigit()`` is true are passed to ``num2words``
    (called without a language argument); every other token and all
    whitespace are returned unchanged.

    Args:
        text: Input string.

    Returns:
        ``text`` with convertible digit tokens replaced by their word form.
        Tokens that cannot be converted are kept as they are.
    """

    def _spell_out(match):
        token = match.group(0)
        if not token.isdigit():
            return token
        try:
            # int() rejects digit-like characters such as "²" with ValueError
            # before they reach num2words.
            return num2words(int(token))
        except (ValueError, OverflowError):
            # Leave tokens that cannot be spelled out (unsupported or too
            # large) untouched rather than failing the whole text.
            return token

    # Substituting token by token keeps the original spacing and handles
    # repeated numbers without any list.index() lookups.
    return re.sub(r"\S+", _spell_out, text)


def remove_punctuation(text: str) -> str:
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)


def remove_whitespace(text: str) -> str:
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)


def remove_stopwords(text: str) -> str:
    """Tokenize ``text`` and drop tokens found in NLTK's English stop-word list.

    The remaining tokens are joined with single spaces. Matching is exact, so
    tokens are only removed when they equal a stop word as listed.
    """
    english_stopwords = set(stopwords.words("english"))
    kept = [token for token in word_tokenize(text) if token not in english_stopwords]
    return " ".join(kept)


# Translation pipelines already loaded in this session, keyed by
# (from_lang, to_lang), so each model is built and saved once instead of once
# per translated string.
_TRANSLATION_MODELS = {}


def translate_text(text: str, from_lang, to_lang) -> str:
    """Translate ``text`` from ``from_lang`` to ``to_lang``.

    Args:
        text: String to translate.
        from_lang: Source language code.
        to_lang: Target language code.

    Returns:
        The translated string. ``text`` is returned unchanged when both
        languages are equal or when ``text`` is empty or whitespace only.
        If loading, saving or running the model raises, the error is printed
        and ``""`` is returned.
    """
    # Nothing to translate: same language, or no content to feed the model.
    if from_lang == to_lang or not text.strip():
        return text
    pair = (from_lang, to_lang)
    try:
        model = _TRANSLATION_MODELS.get(pair)
        if model is None:
            model = get_translation_model(from_lang, to_lang)
            # Persist the model once, when it is first loaded, rather than
            # rewriting it to disk after every single translation.
            save_translation_model(model, from_lang, to_lang)
            _TRANSLATION_MODELS[pair] = model
        translation = model(text)
        return translation[0]["translation_text"]
    except Exception as e:
        print(f"Translation failed: {e}")
        return ""

def preprocess_text(text: str, lang: str = None) -> str:
    """Run the text-cleaning pipeline on ``text``.

    Steps, in order: ``clean_text``, ``text_lowercase``,
    ``remove_punctuation``, ``demojize``, ``remove_whitespace``. When ``lang``
    is truthy the text is additionally passed through
    ``translate_text(text, lang, "en")``, ``convert_number`` and
    ``remove_stopwords``.

    Args:
        text: Raw input string.
        lang: Language code of ``text``; when ``None`` or empty the
            translation, number and stop-word steps are skipped.

    Returns:
        The processed string.
    """
    text = clean_text(text)
    text = text_lowercase(text)
    text = remove_punctuation(text)

    # Emoji are removed before whitespace is collapsed: deleting an emoji that
    # sits between two spaces (or at either end) would otherwise leave stray
    # double, leading or trailing spaces in the result.
    text = demojize(text)
    text = remove_whitespace(text)

    if lang:
        text = translate_text(text, lang, "en")

        # Numbers are spelled out after translation so the spelled-out words
        # are added to text that is already in English instead of being mixed
        # into the source-language text handed to the translator.
        # NOTE(review): assumes convert_number produces English words - confirm.
        text = convert_number(text)

        text = remove_stopwords(text)

    return text

In [11]:
# Load the per-country, per-category channel lists.
entertainment_us = load_json("../data/videos/last/united-states/Entertainment.json")
entertainment_pl = load_json("../data/videos/last/poland/Entertainment.json")
film_animation_us = load_json("../data/videos/last/united-states/Film & Animation.json")
film_animation_pl = load_json("../data/videos/last/poland/Film & Animation.json")
howto_style_us = load_json("../data/videos/last/united-states/Howto & Style.json")
howto_style_pl = load_json("../data/videos/last/poland/Howto & Style.json")
people_blogs_us = load_json("../data/videos/last/united-states/People & Blogs.json")
people_blogs_pl = load_json("../data/videos/last/poland/People & Blogs.json")

categories = ["Entertainment", "Film & Animation", "Howto & Style", "People & Blogs"]
countries = ["us", "pl"]

# Explicit (category, country) -> dataset table. This replaces building a
# variable name and eval()-ing it, which also relied on reusing double quotes
# inside an f-string (a SyntaxError before Python 3.12).
datasets = {
    ("Entertainment", "us"): entertainment_us,
    ("Entertainment", "pl"): entertainment_pl,
    ("Film & Animation", "us"): film_animation_us,
    ("Film & Animation", "pl"): film_animation_pl,
    ("Howto & Style", "us"): howto_style_us,
    ("Howto & Style", "pl"): howto_style_pl,
    ("People & Blogs", "us"): people_blogs_us,
    ("People & Blogs", "pl"): people_blogs_pl,
}

for country in countries:
    for category in categories:
        print(f"Processing {category} in {country}")
        print("Num of channels: ", len(datasets[(category, country)]))

Processing Entertainment in us
Num of channels:  545
Processing Film & Animation in us
Num of channels:  94
Processing Howto & Style in us
Num of channels:  94
Processing People & Blogs in us
Num of channels:  220
Processing Entertainment in pl
Num of channels:  423
Processing Film & Animation in pl
Num of channels:  62
Processing Howto & Style in pl
Num of channels:  98
Processing People & Blogs in pl
Num of channels:  263


In [None]:
# Preprocess every video title of the Polish "Film & Animation" channels in
# place, then write the result back to the file it was loaded from.
# NOTE(review): the source file is overwritten, so running this cell again
# preprocesses (and re-translates) titles that were already processed.
for channel in tqdm(film_animation_pl, total=len(film_animation_pl), desc="Preprocessing film_animation_pl"):
    print(f"Processing channel {channel['channel']['name']}")
    for video in channel["videos"]:
        try:
            video["title"] = preprocess_text(video["title"], "pl")
        except Exception as e:
            # Best effort: the title is left unchanged and the channel reported.
            print(f"Error for channel {channel['channel']['name']}: {e}")
    # The video dicts are mutated in place, so the list already holds the
    # updated channel; no list.index() lookup and re-assignment is needed.
save_json(film_animation_pl, "../data/videos/last/poland/Film & Animation.json")

In [None]:
# Preprocess every video title of the US "Film & Animation" channels in
# place, then write the result back to the file it was loaded from.
# NOTE(review): the source file is overwritten, so running this cell again
# preprocesses titles that were already processed.
for channel in tqdm(film_animation_us, total=len(film_animation_us), desc="Preprocessing film_animation_us"):
    for video in channel["videos"]:
        try:
            video["title"] = preprocess_text(video["title"], "en")
        except Exception as e:
            # Best effort: the title is left unchanged and the channel reported.
            print(f"Error for channel {channel['channel']['name']}: {e}")
    # The video dicts are mutated in place, so the list already holds the
    # updated channel; no list.index() lookup and re-assignment is needed.
save_json(film_animation_us, "../data/videos/last/united-states/Film & Animation.json")

In [None]:
# Preprocess every video title of the Polish "Entertainment" channels in
# place, then write the result back to the file it was loaded from.
# NOTE(review): the source file is overwritten, so running this cell again
# preprocesses (and re-translates) titles that were already processed.
for channel in tqdm(entertainment_pl, total=len(entertainment_pl), desc="Preprocessing entertainment_pl"):
    for video in channel["videos"]:
        try:
            video["title"] = preprocess_text(video["title"], "pl")
        except Exception as e:
            # Best effort: the title is left unchanged and the channel reported.
            print(f"Error for channel {channel['channel']['name']}: {e}")
    # The video dicts are mutated in place, so the list already holds the
    # updated channel; no list.index() lookup and re-assignment is needed.
save_json(entertainment_pl, "../data/videos/last/poland/Entertainment.json")

In [None]:
# Preprocess every video title of the US "Entertainment" channels in place,
# then write the result back to the file it was loaded from.
# NOTE(review): the source file is overwritten, so running this cell again
# preprocesses titles that were already processed.
for channel in tqdm(entertainment_us, total=len(entertainment_us), desc="Preprocessing entertainment_us"):
    for video in channel["videos"]:
        try:
            video["title"] = preprocess_text(video["title"], "en")
        except Exception as e:
            # Best effort: the title is left unchanged and the channel reported.
            print(f"Error for channel {channel['channel']['name']}: {e}")
    # The video dicts are mutated in place, so the list already holds the
    # updated channel; no list.index() lookup and re-assignment is needed.
save_json(entertainment_us, "../data/videos/last/united-states/Entertainment.json")

In [None]:
# Preprocess every video title of the Polish "Howto & Style" channels in
# place, then write the result back to the file it was loaded from.
# NOTE(review): the source file is overwritten, so running this cell again
# preprocesses (and re-translates) titles that were already processed.
for channel in tqdm(howto_style_pl, total=len(howto_style_pl), desc="Preprocessing howto_style_pl"):
    for video in channel["videos"]:
        try:
            video["title"] = preprocess_text(video["title"], "pl")
        except Exception as e:
            # Best effort: the title is left unchanged and the channel reported.
            print(f"Error for channel {channel['channel']['name']}: {e}")
    # The video dicts are mutated in place, so the list already holds the
    # updated channel; no list.index() lookup and re-assignment is needed.
save_json(howto_style_pl, "../data/videos/last/poland/Howto & Style.json")

In [None]:
# Preprocess every video title of the US "Howto & Style" channels in place,
# then write the result back to the file it was loaded from.
# NOTE(review): the source file is overwritten, so running this cell again
# preprocesses titles that were already processed.
for channel in tqdm(howto_style_us, total=len(howto_style_us), desc="Preprocessing howto_style_us"):
    for video in channel["videos"]:
        try:
            video["title"] = preprocess_text(video["title"], "en")
        except Exception as e:
            # Best effort: the title is left unchanged and the channel reported.
            print(f"Error for channel {channel['channel']['name']}: {e}")
    # The video dicts are mutated in place, so the list already holds the
    # updated channel; no list.index() lookup and re-assignment is needed.
save_json(howto_style_us, "../data/videos/last/united-states/Howto & Style.json")

In [None]:
# Preprocess every video title of the Polish "People & Blogs" channels in
# place, then write the result back to the file it was loaded from.
# NOTE(review): the source file is overwritten, so running this cell again
# preprocesses (and re-translates) titles that were already processed.
for channel in tqdm(people_blogs_pl, total=len(people_blogs_pl), desc="Preprocessing people_blogs_pl"):
    for video in channel["videos"]:
        try:
            video["title"] = preprocess_text(video["title"], "pl")
        except Exception as e:
            # Best effort: the title is left unchanged and the channel reported.
            print(f"Error for channel {channel['channel']['name']}: {e}")
    # The video dicts are mutated in place, so the list already holds the
    # updated channel; no list.index() lookup and re-assignment is needed.
save_json(people_blogs_pl, "../data/videos/last/poland/People & Blogs.json")

In [None]:
# Preprocess every video title of the US "People & Blogs" channels in place,
# then write the result back to the file it was loaded from.
# NOTE(review): the source file is overwritten, so running this cell again
# preprocesses titles that were already processed.
for channel in tqdm(people_blogs_us, total=len(people_blogs_us), desc="Preprocessing people_blogs_us"):
    for video in channel["videos"]:
        try:
            video["title"] = preprocess_text(video["title"], "en")
        except Exception as e:
            # Best effort: the title is left unchanged and the channel reported.
            print(f"Error for channel {channel['channel']['name']}: {e}")
    # The video dicts are mutated in place, so the list already holds the
    # updated channel; no list.index() lookup and re-assignment is needed.
save_json(people_blogs_us, "../data/videos/last/united-states/People & Blogs.json")