In [2]:
!pip install tqdm
!pip install --upgrade transformers



In [3]:
import pandas as pd
import numpy as np
import random
import os
import re
import concurrent.futures
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from concurrent.futures import ThreadPoolExecutor, as_completed
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from tqdm.auto import tqdm

2024-03-28 13:21:30.769016: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preprocessing Text
- Preprocessing the SRT file and combining them into one csv file based on thier video id and their filename

In [20]:
def clean_text(text):
    # Removing timestamps, formatting codes, and unwanted characters
    text = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', '', text)  # Remove timestamps
    text = re.sub(r'<.*?>', '', text)  # Remove HTML-like tags if present
    text = re.sub(r'[,.\']', '', text)  # Removing commas, full stops, and apostrophes
    text = re.sub(r'\[Music\]|\[Applause\]|\[Laughter\]', '', text)  # Remove common noise words
    return re.sub(r'[^A-Za-z0-9 .,?!]', '', text).strip()

def process_subtitle_file(srt_file_path):
    subtitles = []
    with open(srt_file_path, 'r', encoding='utf-8') as file:
        content = file.read().split('\n\n')  # Splitting by subtitle blocks instead of lines
    for block in content:
        lines = block.split('\n')
        if len(lines) > 2:  # Ensuring there's a timestamp and at least one line of dialogue
            text = ' '.join(lines[2:])  # Joining dialogue lines (skipping the sequence number and timestamp)
            cleaned_text = clean_text(text)
            if cleaned_text and len(cleaned_text.split()) > 1:  # Checking for meaningful content
                subtitles.append([cleaned_text])
    return subtitles

def save_subtitles_to_csv(video_id, subtitles, subtitles_csv_dir):
    if subtitles:
        df_subtitles = pd.DataFrame(subtitles, columns=['English'])  # Changed column name to 'English'
        csv_file_path = os.path.join(subtitles_csv_dir, f"{video_id}.csv")
        df_subtitles.to_csv(csv_file_path, index=False)

def process_file(filename, subtitles_dir, subtitles_csv_dir):
    if filename.endswith(".srt"):
        video_id = filename[:-4]
        srt_file_path = os.path.join(subtitles_dir, filename)
        subtitles = process_subtitle_file(srt_file_path)
        save_subtitles_to_csv(video_id, subtitles, subtitles_csv_dir)

def read_video_ids(file_path):
    with open(file_path, 'r') as file:
        return file.read().splitlines()

def combine_csv_files(video_ids, output_file, subtitles_csv_dir):
    csv_frames = []
    for video_id in video_ids:
        csv_path = os.path.join(subtitles_csv_dir, f"{video_id}.csv")
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            csv_frames.append(df)
    combined_df = pd.concat(csv_frames)
    combined_df.drop_duplicates(inplace=True)
    combined_df.to_csv(output_file, index=False)

def parallel_process_files(files, subtitles_dir, subtitles_csv_dir):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_file, filename, subtitles_dir, subtitles_csv_dir) for filename in files]
        concurrent.futures.wait(futures)

# Example usage setup remains the same
directories = {
    "video_ids_dir": 'video_ids',
    "subtitles_dir": 'subtitles',
    "subtitles_csv_dir": 'Subtitles_csv',
    "output_dir": 'Subtitles_csv_combined'
}

for dir_path in directories.values():
    os.makedirs(dir_path, exist_ok=True)

files = os.listdir(directories['subtitles_dir'])
parallel_process_files(files, directories['subtitles_dir'], directories['subtitles_csv_dir'])

for filename in os.listdir(directories['video_ids_dir']):
    if filename.endswith(".txt"):
        base_name = filename[:-4]
        output_filename = f"{base_name}_combined_data.csv"
        output_path = os.path.join(directories['output_dir'], output_filename)
        video_ids_path = os.path.join(directories['video_ids_dir'], filename)
        video_ids = read_video_ids(video_ids_path)
        combine_csv_files(video_ids, output_path, directories['subtitles_csv_dir'])

print("Process completed. CSV files have been combined based on video IDs and duplicates removed.")

Process completed. CSV files have been combined based on video IDs and duplicates removed.


- Randomly selecting 2000 english sentences from the csv file and combining into single csv file

In [21]:
# Function to count words in a sentence
def count_words(sentence):
    return len(sentence.split())

# Base directory containing your CSV files
directory_path = 'Subtitles_csv_combined'

# Placeholder for combined data
combined_data = pd.DataFrame(columns=['English'])

# Loop through all files in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(directory_path, file_name)
        
        # Read the file
        df = pd.read_csv(file_path)
        
        # Filter sentences with 5 or more words
        filtered_df = df[df['English'].apply(count_words) >= 5]
        
        # Combine
        combined_data = pd.concat([combined_data, filtered_df])

# If the combined data has more than 2000 sentences, randomly select 2000
if len(combined_data) > 2000:
    combined_data = combined_data.sample(n=2000, random_state=1).reset_index(drop=True)
else:
    combined_data = combined_data.sample(frac=1, random_state=1).reset_index(drop=True)  # Shuffle if less than 2000

# Save the combined data to a new CSV file
combined_file_path = 'combined_for_translation.csv'
combined_data.to_csv(combined_file_path, index=False)

print(f'Combined file saved to {combined_file_path}')

Combined file saved to combined_for_translation.csv


In [22]:
df = pd.read_csv("combined_for_translation.csv")
df.head()

Unnamed: 0,English
0,You know what else is intense? What?
1,but all our political parties do
2,Apka favourite pokemon kaunsa hai?
3,were going on a 100000 vacation
4,and secondly it will mean that


- Translating English Sentences into hindi sentences and saving them to the same file

In [23]:
# Initialize the translation pipeline for English to Hindi
translator = pipeline('translation', model='Helsinki-NLP/opus-mt-en-hi')

# Function to translate text
def translate_text(text):
    try:
        translation = translator(text, max_length=400)[0]['translation_text']
    except Exception as e:
        print(f"Error translating text: {e}")
        translation = ""
    return translation

# Parallel translation function
def parallel_translate(df, num_workers=5):
    texts = df['English'].tolist()  # Change column name to 'English' as it's the source language now
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_text = {executor.submit(translate_text, text): text for text in texts}
        for future in as_completed(future_to_text):
            text = future_to_text[future]
            try:
                result = future.result()
            except Exception as exc:
                print(f'{text} generated an exception: {exc}')
            else:
                df.loc[df['English'] == text, 'Hindi'] = result  # Update 'Hindi' column with translated text
    return df

# Path to your combined CSV file
file_path = 'combined_for_translation.csv'

# Read the combined CSV file
df = pd.read_csv(file_path)

# Ensure 'Hindi' column exists for the translated text
if 'Hindi' not in df.columns:
    df['Hindi'] = ''

# Translate in parallel and update the DataFrame
df_updated = parallel_translate(df)

# Save the updated DataFrame back to the same file
df_updated.to_csv(file_path, index=False)
print(f'Translation completed and saved back to {file_path}')

Translation completed and saved back to combined_for_translation.csv


In [None]:
import random

def create_hinglish(hindi, english):
    english_words = english.split()
    hindi_words = hindi.split()
    # Decide the number of Hindi words to insert (up to 3 words or the length of the Hindi sentence)
    num_inserts = min(3, len(hindi_words))
    for _ in range(num_inserts):
        if english_words:  # Ensure there are English words left to insert into
            insert_pos = random.randint(0, len(english_words))
            hindi_word_to_insert = random.choice(hindi_words)
            english_words.insert(insert_pos, hindi_word_to_insert)
            # Optional: remove the Hindi word from future consideration (if you want each Hindi word used only once)
            hindi_words.remove(hindi_word_to_insert)
    return ' '.join(english_words)

# Apply the function to create a Hinglish column
df_subtitles['Hinglish'] = df_subtitles.apply(lambda row: create_hinglish(row['Hindi'], row['English']), axis=1)

# Displaying the DataFrame with the new Hinglish sentences
df_subtitles[['Hindi', 'English', 'Hinglish']].head()

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load English stop words
stop_words = set(stopwords.words('english'))

def create_hinglish_based_on_stopwords(english, hindi):
    words = english.split()
    stop_word_indices = [index for index, word in enumerate(words) if word.lower() in stop_words]

    if not stop_word_indices:
        # No stop words found, return the original sentence (you can decide how to handle this case)
        return english

    # Find pivot index for Hindi insertion
    pivot_index = stop_word_indices[-1] + 1  # We split the sentence after the last stop word

    english_part = " ".join(words[:pivot_index])
    hindi_part = " ".join(hindi.split()[pivot_index:])  # Corresponding Hindi split

    # Combine the English and Hindi parts
    hinglish_sentence = english_part + ' ' + hindi_part
    return hinglish_sentence

# Apply the function to create the Hinglish sentences
df_subtitles['Hinglish'] = df_subtitles.apply(lambda row: create_hinglish_based_on_stopwords(row['English'], row['Hindi']), axis=1)

df_subtitles[['English', 'Hindi', 'Hinglish']].head()

## Creating Hinglish Sentences

In [24]:
def select_and_translate_to_hindi(text, tokenizer, model):
    """Selects the last one, two, or three words of a sentence and translates them to Hindi."""
    words = text.split()
    # Randomly choose to take the last one, two, or three words
    num_words_to_translate = random.choice([1, 2, 3])
    words_to_translate = ' '.join(words[-num_words_to_translate:])
    rest_of_sentence = ' '.join(words[:-num_words_to_translate])
    
    # Translate the selected part to Hindi
    if words_to_translate.strip():  # Check if the text is not just empty or whitespace
        inputs = tokenizer(words_to_translate, return_tensors="pt", padding=True)
        outputs = model.generate(inputs["input_ids"], max_length=512, num_beams=4, early_stopping=True)
        translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    else:
        translation = ""
    
    # Return the original (minus the last word(s)) and the translated part
    return rest_of_sentence, translation

def translate_batch_to_hindi(batch, tokenizer, model):
    translations = []
    for text in batch:
        _, translated_text = select_and_translate_to_hindi(text, tokenizer, model)
        translations.append(translated_text)
    return translations


data = pd.read_csv("combined_for_translation.csv")

# Load your tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

# Prepare for translation
batch_size = 50  # Adjust based on your system
translated_texts = []
english_parts = []

with ThreadPoolExecutor() as executor:
    futures = []
    for i in tqdm(range(0, len(data['English']), batch_size), desc="Processing Batches"):
        batch = data['English'][i:i+batch_size].tolist()
        futures.append(executor.submit(translate_batch_to_hindi, batch, tokenizer, model))

    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Completing Translations"):
        translated_texts.extend(future.result())

# Combine English and translated parts to form Hinglish sentences
for idx, text in enumerate(data['English']):
    num_words_to_keep = random.choice([1, 2, 3])  # Adjust to match the translation selection
    words = text.split()
    english_part = ' '.join(words[:-num_words_to_keep])
    english_parts.append(english_part)

data['Hinglish'] = [e + ' ' + h if h else e for e, h in zip(english_parts, translated_texts)]

# Save to new CSV
data.to_csv('hinglish_sentences.csv', index=False)

Processing Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Completing Translations:   0%|          | 0/40 [00:00<?, ?it/s]