In [37]:
import os
import re
import string
import time
import unicodedata
from collections import Counter
from typing import List

import numpy as np
import pandas as pd
from langdetect import detect, DetectorFactory, LangDetectException


In [38]:
#remove \n, \t & multiple spaces
def preprocessing_remove_newlines_tabs_and_spaces(text: str) -> str:
    """Put ``text`` on one line with single-space separators.

    Every run of whitespace (newlines, tabs, repeated spaces, ...) is
    replaced by one space, then leading and trailing whitespace is trimmed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an input made only of whitespace yields ``""``.
    """
    # The whitespace class already matches newlines and tabs, so a separate
    # newline/tab substitution beforehand would not change the result.
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()
    #all lower case
def preprocessing_lowercase(text: str) -> str:
    """Return a copy of ``text`` with all cased characters lowercased."""
    lowered = text.lower()
    return lowered
    
# Built once: deleting the ASCII punctuation characters does not depend on the input.
_ASCII_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocessing_remove_punctuation(text: str) -> str:
    """Delete punctuation characters from ``text``.

    Removes every character in ``string.punctuation`` and, additionally,
    every non-ASCII character whose Unicode general category is a
    punctuation category (``P*``) -- e.g. curly quotes, ellipsis, inverted
    question marks, CJK full stops. Characters are deleted, not replaced by
    a space. Symbols outside those sets (such as emoji) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without punctuation characters.
    """
    without_ascii = text.translate(_ASCII_PUNCTUATION_TABLE)
    # Fast path: nothing left to inspect when the remainder is pure ASCII.
    if without_ascii.isascii():
        return without_ascii
    return ''.join(
        ch for ch in without_ascii
        if not unicodedata.category(ch).startswith('P')
    )

def preprocess_combined(text: str) -> str:
    """Apply the whole cleaning pipeline to a single piece of text.

    The steps run in this order: punctuation removal, whitespace
    normalisation, lowercasing.

    Args:
        text: The raw text.

    Returns:
        The text after all three preprocessing steps.
    """
    pipeline = (
        preprocessing_remove_punctuation,
        preprocessing_remove_newlines_tabs_and_spaces,
        preprocessing_lowercase,
    )
    result = text
    for step in pipeline:
        result = step(result)
    return result
    

In [39]:
# Fix langdetect's seed before any detect() call. The langdetect README
# describes detection as non-deterministic unless this seed is set, so pinning
# it is what keeps the "lang" labels reproducible between runs.
DetectorFactory.seed = 0 

def safe_detect_lang(text: str) -> str:
    """Detect the language of ``text`` without ever raising.

    Args:
        text: The value to classify; anything that is not a ``str`` is
            accepted and treated as undetectable.

    Returns:
        Whatever ``detect`` returns for the stripped text, or ``"unknown"``
        when the input is not a string, has fewer than two characters after
        stripping, or ``detect`` raises ``LangDetectException``.
    """
    candidate = text.strip() if isinstance(text, str) else ""
    if len(candidate) < 2:
        return "unknown"

    try:
        language = detect(candidate)
    except LangDetectException:
        language = "unknown"
    return language

In [40]:
# Batch job: read every per-video comments CSV, add a detected-language column
# and a cleaned-text column, and write the result to the output directory.

# os.path.join keeps the cell runnable on non-Windows hosts; on Windows it
# produces the same "comments\..." strings the raw literals did.
input_dir = os.path.join("comments", "per_video")
output_dir = os.path.join("comments", "preprocessed_comments")

PROGRESS_EVERY = 1000  # print timing information after this many files
SAMPLE_EVERY = 3000    # print the cleaned comments of every N-th file (idx 0, 3000, ...)

os.makedirs(output_dir, exist_ok=True)

textOriginal_missing = 0
opening_error = 0
processed_files = 0
# (filename, "ErrorType: message") for every file whose processing raised,
# so failures can be inspected instead of only being counted.
failed_files = []

language_counter = Counter()

csv_files = [f for f in os.listdir(input_dir) if f.endswith(".csv")]
total_files = len(csv_files)
start_time = time.time()

for idx, filename in enumerate(sorted(csv_files)):
    file_path = os.path.join(input_dir, filename)

    try:
        df_video = pd.read_csv(
            file_path,
            dtype=str,
            on_bad_lines="skip",
            low_memory=False,
            encoding="utf-8",
            encoding_errors="ignore",
        )

        if "textOriginal" in df_video.columns:
            # Missing comments become "" so both helpers below only see strings.
            texts = df_video["textOriginal"].fillna("")

            # Language detection runs on the raw text, before any cleaning.
            langs = [safe_detect_lang(x) for x in texts]
            df_video["lang"] = langs
            language_counter.update(langs)

            # Empty comments are stored as NaN rather than as an empty string.
            df_video["preprocessed_comment"] = [
                preprocess_combined(x) if isinstance(x, str) and x != "" else np.nan
                for x in texts
            ]
            if idx % SAMPLE_EVERY == 0:
                print(df_video["preprocessed_comment"])
        else:
            # The file is still written out below, just without the new columns.
            textOriginal_missing += 1

        base = os.path.splitext(filename)[0]
        outname = f"{base}_preprocessed.csv"
        outpath = os.path.join(output_dir, outname)
        df_video.to_csv(outpath, index=False, encoding="utf-8")
        processed_files += 1

    except Exception as exc:
        # Best effort: one bad file must not stop the batch. This counter
        # covers any failure in the block above (read, detect, clean, write),
        # so remember which file failed and why.
        opening_error += 1
        failed_files.append((filename, f"{type(exc).__name__}: {exc}"))

    # Report after the file has been handled so the count and the estimate
    # reflect work that is actually finished.
    files_done = idx + 1
    if files_done % PROGRESS_EVERY == 0:
        elapsed = time.time() - start_time
        est_total = (elapsed * total_files) / files_done
        est_remaining = max(est_total - elapsed, 0.0)
        print(f"Processed {files_done}/{total_files} files in {elapsed:.2f}s.")
        print(f"Estimated total: {est_total:.2f}s. Remaining: {est_remaining:.2f}s.")

print("\nProcessing Summary:")
print("-------------------")
print(f"CSV files found: {total_files}")
print(f"Files successfully processed: {processed_files}")
print(f"Files with missing 'textOriginal' column: {textOriginal_missing}")
print(f"File opening errors: {opening_error}")

if failed_files:
    print("\nFailed files:")
    print("-------------")
    for failed_name, reason in failed_files:
        print(f"{failed_name}: {reason}")

print("\nLanguage distribution across all comments:")
print("-----------------------------------------")
for code, count in language_counter.most_common():
    print(f"{code} = {count}")


0                         this is on of her best songs🌊
1     🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉🎉...
2                              nice ballad by lisa ajax
3                                        din röst wow❤️
4                  världens bästa människa älskar dig❤️
                            ...                        
95                                               winner
96                                     instrumental pls
97    12 points from belgium i think it is mind blow...
98               live is even better we love you lisa ❤
99    best song in melfest this year rooting for you...
Name: preprocessed_comment, Length: 100, dtype: object
Processed 1000/10286 files in 217.45s.
Estimated total: 2236.71s. Remaining: 2019.25s.
Processed 2000/10286 files in 453.45s.
Estimated total: 2332.10s. Remaining: 1878.65s.
Processed 3000/10286 files in 675.98s.
Estimated total: 2317.70s. Remaining: 1641.72s.
0                                                     ❤
1     hell y