In [2]:
!pip install tqdm
!pip install --upgrade transformers



In [3]:
import pandas as pd
import numpy as np
import random
import os
import re
import concurrent.futures
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from concurrent.futures import ThreadPoolExecutor, as_completed
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from tqdm.auto import tqdm

2024-03-28 13:21:30.769016: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preprocessing Text
- Preprocessing the SRT file and combining them into one csv file based on thier video id and their filename

In [20]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType
import re

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("SubtitlesProcessing") \
    .getOrCreate()

# Define the text cleaning UDF
def clean_text_udf(text):
    text = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', '', text)  # Remove timestamps
    text = re.sub(r'<.*?>', '', text)  # Remove HTML-like tags if present
    text = re.sub(r'[,.\']', '', text)  # Remove commas, full stops, and apostrophes
    text = re.sub(r'\[Music\]|\[Applause\]|\[Laughter\]', '', text)  # Remove common noise words
    return re.sub(r'[^A-Za-z0-9 .,?!]', '', text).strip()  # Final clean-up

clean_text = F.udf(clean_text_udf, StringType())

hdfs_input_path = "hdfs://namenode:9000/subtitles/"
hdfs_output_path = "hdfs://namenode:9000/cleaned_subtitles/"

# Load the subtitle data
df_subtitles = spark.read.text(hdfs_input_path)

# Apply the cleaning function
df_cleaned = df_subtitles.withColumn("cleaned_content", clean_text("value"))

# Display the cleaned data
df_cleaned.show()

# Save the cleaned subtitles back to HDFS
df_cleaned.select("cleaned_content").write.text(hdfs_output_path)

spark.stop()

Process completed. CSV files have been combined based on video IDs and duplicates removed.


- Randomly selecting 2000 english sentences from the csv file and combining into single csv file

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, rand
from pyspark.sql.types import IntegerType

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("ProcessSubtitles") \
    .getOrCreate()

# Define a User-Defined Function (UDF) for counting words in a sentence
@udf(returnType=IntegerType())
def count_words(sentence):
    return len(sentence.split())

# Base directory in HDFS containing  CSV files
directory_path = 'hdfs://namenode:9000/cleaned_subtitles'

# Read all CSV files from the directory into a DataFrame
df = spark.read.option("header", "true").csv(f"{directory_path}/*.csv")

# Filter sentences with 5 or more words using the UDF
filtered_df = df.filter(count_words(col("English")) >= 5)

# Randomly select 2000 sentences if more than 2000, or shuffle if fewer
count = filtered_df.count()

if count > 2000:
    sampled_df = filtered_df.sample(False, 2000 / count)
else:
    sampled_df = filtered_df.orderBy(rand())

#output path in HDFS for the processed data
output_path = 'hdfs://namenode:9000/combined_for_translation.csv'

# Saving the processed data back to HDFS
sampled_df.coalesce(1).write.option("header", "true").csv(output_path, mode="overwrite")

print(f'Combined file saved to {output_path}')

# Read the CSV file from HDFS
df = spark.read.option("header", "true").csv("hdfs://namenode:9000/combined_for_translation.csv")

# Show the first few rows
df.show()

# Stop the Spark session
spark.stop()

Combined file saved to combined_for_translation.csv


- Translating English Sentences into hindi sentences and saving them to the same file

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
import pandas as pd
from transformers import pipeline

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("TranslationApp") \
    .getOrCreate()

# Define UDF for Translation
def translate_text_udf(text):
    # Initialize the translation pipeline 
    translator = pipeline('translation', model='Helsinki-NLP/opus-mt-en-hi')
    try:
        translation = translator(text, max_length=400)[0]['translation_text']
    except Exception as e:
        print(f"Error translating text: {e}")
        translation = ""
    return translation

translate_udf = udf(translate_text_udf, StringType())

input_path = 'hdfs://namenode:9000/combined_for_translation.csv'
output_path = 'hdfs://namenode:9000/combined_for_translation.csv'

# Read the data
df = spark.read.option("header", "true").csv(input_path)

# Translate text
translated_df = df.withColumn("Hindi", translate_udf(col("English")))

# Saving the translated data back to HDFS
translated_df.write.option("header", "true").csv(output_path, mode="overwrite")

print(f'Translation completed and saved back to {output_path}')

# Stop the Spark session
spark.stop()

Translation completed and saved back to combined_for_translation.csv


## Creating Hinglish Sentences

In [24]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("HinglishTranslationLargeData") \
    .getOrCreate()

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

def select_and_translate(text):
    """Randomly selects 1-3 words from the end of `text`, translates them, and combines with the remaining English sentence."""
    # Split the text and select the last 1-3 words
    words = text.split()
    num_words_to_translate = random.choice([1, 2, 3])
    words_to_translate = ' '.join(words[-num_words_to_translate:])
    rest_of_sentence = ' '.join(words[:-num_words_to_translate])
    
    # Simulate translation (Replace this with actual translation logic)
    inputs = tokenizer(words_to_translate, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)
    translated_part = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Combine the untranslated part with the translated part
    hinglish_sentence = rest_of_sentence + ' ' + translated_part if translated_part else text
    return hinglish_sentence

# Define UDF for Spark to apply the translation function
translate_udf = F.udf(select_and_translate, StringType())

# Define paths for reading from and writing to HDFS
input_path = 'hdfs://namenode:9000/combined_for_translation.csv'
output_path = 'hdfs://namenode:9000/hinglish_sentences'

# Read the data into a Spark DataFrame
df = spark.read.option("header", "true").csv(input_path)

# Apply the UDF to translate and create Hinglish sentences
df_with_hinglish = df.withColumn("Hinglish", translate_udf(F.col("English")))

# Save the DataFrame with Hinglish sentences back to HDFS
df_with_hinglish.write.option("header", "true").csv(output_path, mode="overwrite")

print(f"Translation completed and saved back to {output_path}")

# Stop the Spark session
spark.stop()

Processing Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Completing Translations:   0%|          | 0/40 [00:00<?, ?it/s]