In [5]:
# Install tqdm with the explicit %pip line magic so the cell does not depend
# on IPython's automagic setting.
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [6]:
import torch

# Report whether PyTorch can see a CUDA device on this machine.
gpu_present = torch.cuda.is_available()
if not gpu_present:
    print("GPU is not available, using CPU instead.")
else:
    print("GPU is available: ", torch.cuda.get_device_name(0))

GPU is not available, using CPU instead.


In [7]:
import re
import pandas as pd
import os
import concurrent.futures

def clean_text(text):
    """Reduce a subtitle line to Devanagari words separated by single spaces.

    The ``[संगीत]`` marker is dropped, every character that is neither in the
    Devanagari block (U+0900-U+097F) nor whitespace is removed, and runs of
    whitespace are collapsed.

    Args:
        text: Raw subtitle text.

    Returns:
        The cleaned string; empty when nothing Devanagari remains.
    """
    # Replace the marker with a space so the words on either side stay apart.
    text = re.sub(r'\[संगीत\]', ' ', text)
    # Keep whitespace here (tabs/newlines included) so that removing it does
    # not glue neighbouring words together; it is normalised just below.
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    # Removing Latin tokens/punctuation leaves gaps; collapse them.
    return ' '.join(text.split())

def process_subtitle_file(srt_file_path):
    """Extract cleaned Hindi sentences from an SRT subtitle file.

    The file is split into cues on blank lines rather than assuming that every
    cue occupies exactly four lines, so multi-line cues or a missing blank line
    no longer shift every following cue out of alignment. All text lines of a
    cue are joined into one sentence before cleaning.

    Args:
        srt_file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of single-element lists ``[cleaned_sentence]``; only sentences
        with more than one word after cleaning are kept.
    """
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM if present.
    with open(srt_file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    subtitles = []
    for cue in re.split(r'\n\s*\n', content.strip()):
        lines = [line.strip() for line in cue.splitlines() if line.strip()]
        # The cue text is whatever follows the "start --> end" timing line.
        timing_idx = next((idx for idx, line in enumerate(lines) if '-->' in line), None)
        if timing_idx is None:
            continue
        cleaned_line = clean_text(' '.join(lines[timing_idx + 1:]))
        if cleaned_line and len(cleaned_line.split()) > 1:
            subtitles.append([cleaned_line])
    return subtitles

def save_subtitles_to_csv(video_id, subtitles, subtitles_csv_dir):
    """Write ``subtitles`` to ``<subtitles_csv_dir>/<video_id>.csv``.

    The rows are stored under a single ``Hindi`` column without an index
    column. Nothing is written when ``subtitles`` is empty.
    """
    if not subtitles:
        return
    target_path = os.path.join(subtitles_csv_dir, f"{video_id}.csv")
    pd.DataFrame(subtitles, columns=['Hindi']).to_csv(target_path, index=False)

def process_file(filename, subtitles_dir, subtitles_csv_dir):
    """Convert one ``.srt`` file into a per-video CSV.

    Files without the ``.srt`` suffix are ignored. The video id is the file
    name with that suffix removed.
    """
    suffix = ".srt"
    if not filename.endswith(suffix):
        return
    video_id = filename[:-len(suffix)]
    source_path = os.path.join(subtitles_dir, filename)
    extracted = process_subtitle_file(source_path)
    save_subtitles_to_csv(video_id, extracted, subtitles_csv_dir)

def read_video_ids(file_path):
    """Read video ids from a text file, one id per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline or a padded id does not turn into an id that matches no
    CSV file.

    Args:
        file_path: Path to the id list.

    Returns:
        List of non-empty id strings in file order.
    """
    # Explicit encoding keeps the result independent of the platform default;
    # utf-8-sig also drops a BOM that would otherwise stick to the first id.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return [line.strip() for line in file if line.strip()]

def combine_csv_files(video_ids, output_file, subtitles_csv_dir):
    """Merge the per-video CSVs for ``video_ids`` into one de-duplicated CSV.

    Args:
        video_ids: Iterable of video ids; ids without a matching
            ``<subtitles_csv_dir>/<id>.csv`` are skipped.
        output_file: Destination path of the combined CSV.
        subtitles_csv_dir: Directory holding the per-video CSV files.

    Returns:
        None. When none of the ids has a CSV, nothing is written
        (``pd.concat`` on an empty list would raise ``ValueError``).
    """
    csv_frames = []
    for video_id in video_ids:
        csv_path = os.path.join(subtitles_csv_dir, f"{video_id}.csv")
        if os.path.exists(csv_path):
            csv_frames.append(pd.read_csv(csv_path))

    if not csv_frames:
        print(f"No subtitle CSVs found for {output_file}; nothing written.")
        return

    combined_df = pd.concat(csv_frames, ignore_index=True).drop_duplicates()
    combined_df.to_csv(output_file, index=False)

def parallel_process_files(files, subtitles_dir, subtitles_csv_dir):
    """Run ``process_file`` for every entry of ``files`` on a thread pool.

    Each future's result is retrieved so that an exception raised while
    processing a file is reported instead of being silently discarded. A
    failure in one file does not stop the remaining files.

    Args:
        files: Iterable of file names located in ``subtitles_dir``.
        subtitles_dir: Directory containing the subtitle files.
        subtitles_csv_dir: Directory receiving the per-video CSV files.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_name = {
            executor.submit(process_file, filename, subtitles_dir, subtitles_csv_dir): filename
            for filename in files
        }
        for future in concurrent.futures.as_completed(future_to_name):
            try:
                # result() re-raises whatever the worker raised.
                future.result()
            except Exception as exc:
                print(f"Failed to process {future_to_name[future]}: {exc}")

# Pipeline driver: convert every .srt file to a per-video CSV, then build one
# combined CSV for each id list found in the video-ids directory.
directories = dict(
    video_ids_dir='video_ids',
    subtitles_dir='subtitles',
    subtitles_csv_dir='Subtitles_csv',
    output_dir='Subtitles_csv_combined',
)

# Create any working directory that does not exist yet.
for dir_path in directories.values():
    os.makedirs(dir_path, exist_ok=True)

files = os.listdir(directories['subtitles_dir'])
parallel_process_files(files, directories['subtitles_dir'], directories['subtitles_csv_dir'])

for filename in os.listdir(directories['video_ids_dir']):
    if not filename.endswith(".txt"):
        continue
    # "<name>.txt" produces "<name>_combined_data.csv" in the output directory.
    base_name = filename[:-4]
    output_filename = f"{base_name}_combined_data.csv"
    output_path = os.path.join(directories['output_dir'], output_filename)
    video_ids_path = os.path.join(directories['video_ids_dir'], filename)
    video_ids = read_video_ids(video_ids_path)
    combine_csv_files(video_ids, output_path, directories['subtitles_csv_dir'])

print("Process completed. CSV files have been combined based on video IDs and duplicates removed.")

Process completed. CSV files have been combined based on video IDs and duplicates removed.


In [3]:
import pandas as pd
import os

def count_words(sentence):
    """Return the number of whitespace-separated tokens in ``sentence``."""
    tokens = sentence.split()
    return len(tokens)

# Base directory containing the combined CSV files
directory_path = 'Subtitles_csv_combined'

# Upper bound on the number of sentences kept for translation
MAX_SENTENCES = 2000

# Collect the filtered frames and concatenate once at the end; concatenating
# inside the loop re-copies all accumulated rows on every iteration.
filtered_frames = []

# sorted(): os.listdir returns entries in arbitrary order, and the seeded
# sample below is only reproducible when rows always arrive in the same order.
for file_name in sorted(os.listdir(directory_path)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(directory_path, file_name)

        # Read the file
        df = pd.read_csv(file_path)

        # Keep sentences with 5 or more words
        filtered_frames.append(df[df['Hindi'].apply(count_words) >= 5])

if filtered_frames:
    combined_data = pd.concat(filtered_frames, ignore_index=True)
else:
    # No input files: keep the expected schema so the CSV still has a header.
    combined_data = pd.DataFrame(columns=['Hindi'])

# With more than MAX_SENTENCES rows draw a random subset, otherwise shuffle
if len(combined_data) > MAX_SENTENCES:
    combined_data = combined_data.sample(n=MAX_SENTENCES, random_state=1).reset_index(drop=True)
else:
    combined_data = combined_data.sample(frac=1, random_state=1).reset_index(drop=True)

# Save the combined data to a new CSV file
combined_file_path = 'combined_for_translation.csv'
combined_data.to_csv(combined_file_path, index=False)

print(f'Combined file saved to {combined_file_path}')

Combined file saved to combined_for_translation.csv


In [8]:
# Upgrade transformers with the explicit %pip line magic so the cell does not
# depend on IPython's automagic setting.
%pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor, as_completed

# Hindi -> English translation pipeline shared by the helpers below
translator = pipeline(task='translation', model='Helsinki-NLP/opus-mt-hi-en')

def translate_text(text):
    """Translate ``text`` with the module-level ``translator`` pipeline.

    Returns the ``translation_text`` of the first result. Any exception is
    printed and an empty string is returned instead.
    """
    try:
        return translator(text, max_length=400)[0]['translation_text']
    except Exception as e:
        print(f"Error translating text: {e}")
        return ""

def parallel_translate(df, num_workers=5):
    """Translate the ``Hindi`` column of ``df`` on a thread pool.

    Each result is written back by row position. Matching on the sentence text
    instead would scan the whole column for every result and would couple rows
    that share the same sentence.

    Args:
        df: DataFrame with a ``Hindi`` column. It is modified in place.
        num_workers: Maximum number of worker threads.

    Returns:
        The same ``df`` with its ``English`` column filled. A row whose
        translation raised keeps its previous ``English`` value (empty string
        when the column did not exist).
    """
    texts = df['Hindi'].tolist()
    if 'English' in df.columns:
        translations = df['English'].tolist()
    else:
        translations = [''] * len(texts)

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_pos = {
            executor.submit(translate_text, text): pos
            for pos, text in enumerate(texts)
        }
        for future in as_completed(future_to_pos):
            pos = future_to_pos[future]
            try:
                translations[pos] = future.result()
            except Exception as exc:
                print(f'{texts[pos]} generated an exception: {exc}')

    df['English'] = translations
    return df

# CSV produced by the sampling step; it is read and then overwritten below
file_path = 'combined_for_translation.csv'

# Load the sentences and start from an empty 'English' column
df = pd.read_csv(file_path).assign(English='')

# Fill the 'English' column using the thread-pool translator
df_updated = parallel_translate(df)

# Write the translated frame back over the input file
df_updated.to_csv(file_path, index=False)
print(f'Translation completed and saved back to {file_path}')

Translation completed and saved back to combined_for_translation.csv


In [None]:
# NOTE(review): no cell visible above assigns a module-level `df_subtitles`
# (it is only a local inside save_subtitles_to_csv), so on a fresh kernel this
# cell presumably raises NameError unless a later cell was run first — confirm.
df_subtitles

In [None]:
# NOTE(review): depends on a `df_subtitles` that no cell visible above defines
# at module level. Unlike the other to_csv calls in this notebook, index=False
# is not passed here — confirm whether the index column is wanted in data.csv.
df_subtitles.to_csv('data.csv')

In [None]:
import random

def create_hinglish(hindi, english, rng=None):
    """Insert up to three Hindi words at random positions of an English sentence.

    Args:
        hindi: Hindi sentence that supplies the words to insert.
        english: English sentence that receives the words.
        rng: Optional ``random.Random``-like object providing ``randint`` and
            ``choice``. Pass a seeded instance for reproducible output; the
            module-level ``random`` functions are used when omitted.

    Returns:
        The mixed sentence with its words joined by single spaces. An empty
        string is returned when ``english`` has no words or is not a string
        (e.g. NaN coming from an empty CSV cell).
    """
    english_words = english.split() if isinstance(english, str) else []
    hindi_words = hindi.split() if isinstance(hindi, str) else []
    if not english_words:
        # Nothing to insert into.
        return ''

    picker = random if rng is None else rng
    # Insert at most 3 words, fewer when the Hindi sentence is shorter.
    num_inserts = min(3, len(hindi_words))
    for _ in range(num_inserts):
        # randint is inclusive on both ends, so appending at the end is possible.
        insert_pos = picker.randint(0, len(english_words))
        hindi_word_to_insert = picker.choice(hindi_words)
        english_words.insert(insert_pos, hindi_word_to_insert)
        # Remove one occurrence so it is not picked again for a later insert.
        hindi_words.remove(hindi_word_to_insert)
    return ' '.join(english_words)

# Build the Hinglish column row by row from the Hindi/English pair
def _hinglish_from_row(row):
    """Call create_hinglish with the row's Hindi and English sentences."""
    return create_hinglish(row['Hindi'], row['English'])

df_subtitles['Hinglish'] = df_subtitles.apply(_hinglish_from_row, axis=1)

# Show the first rows including the new Hinglish sentences
df_subtitles.loc[:, ['Hindi', 'English', 'Hinglish']].head()

In [None]:
import nltk
from nltk.corpus import stopwords
# Make the NLTK stop-word corpus available to this kernel
nltk.download('stopwords')

# English stop words as a set for fast membership tests
stop_words = {word for word in stopwords.words('english')}

def create_hinglish_based_on_stopwords(english, hindi, stopword_set=None):
    """Keep the English sentence up to its last stop word, then continue in Hindi.

    The English words up to and including the last stop word are kept. The
    remainder is taken from the Hindi sentence starting at the same word index.

    Args:
        english: English sentence.
        hindi: Hindi sentence.
        stopword_set: Optional collection of lower-case stop words. Defaults
            to the module-level ``stop_words``.

    Returns:
        The mixed sentence without trailing whitespace. ``english`` is returned
        unchanged when it contains no stop word, and an empty string is
        returned when ``english`` is not a string (e.g. NaN from an empty CSV
        cell).
    """
    if stopword_set is None:
        stopword_set = stop_words
    if not isinstance(english, str):
        return ''

    words = english.split()
    stop_word_indices = [index for index, word in enumerate(words) if word.lower() in stopword_set]

    if not stop_word_indices:
        # No stop word found: nothing to pivot on.
        return english

    # Split right after the last stop word.
    pivot_index = stop_word_indices[-1] + 1

    english_part = " ".join(words[:pivot_index])
    hindi_words = hindi.split() if isinstance(hindi, str) else []
    # NOTE: the English word index is reused as the Hindi word index.
    hindi_part = " ".join(hindi_words[pivot_index:])

    # strip() drops the separator when the Hindi sentence has no words left.
    return f"{english_part} {hindi_part}".strip()

# Build the Hinglish column with the stop-word pivot strategy
def _stopword_hinglish_from_row(row):
    """Call create_hinglish_based_on_stopwords with the row's English and Hindi."""
    return create_hinglish_based_on_stopwords(row['English'], row['Hindi'])

df_subtitles['Hinglish'] = df_subtitles.apply(_stopword_hinglish_from_row, axis=1)

df_subtitles.loc[:, ['English', 'Hindi', 'Hinglish']].head()

In [1]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm

# Hard-coded set of lower-case English stop words. It is passed to
# split_on_last_stopword when the English sentences are split for translation.
# Besides ordinary function words it contains contraction fragments such as
# "s", "t", "don" and "ll".
# NOTE(review): this rebinds the name `stop_words`, which an earlier cell also
# assigns from the NLTK corpus; whichever cell ran last wins.
stop_words = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", "d",
    "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn",
    "hasn", "haven", "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn",
    "weren", "won", "wouldn"}  # Customize this list as needed

def split_on_last_stopword(sentence, stop_words):
    """Split ``sentence`` right after its last stop word.

    Returns ``(head, tail)``: ``head`` holds the words up to and including the
    last stop word and ``tail`` the remaining words, each joined by single
    spaces. When there is no stop word, or the last stop word is the final
    word, the original ``sentence`` and an empty string are returned.
    """
    tokens = sentence.split()
    stop_positions = [
        pos for pos, token in enumerate(tokens) if token.lower() in stop_words
    ]
    if not stop_positions or stop_positions[-1] + 1 >= len(tokens):
        return sentence, ""
    cut = stop_positions[-1] + 1
    head = " ".join(tokens[:cut])
    tail = " ".join(tokens[cut:])
    return head, tail

_EN_HI_MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
# Holds the (tokenizer, model) pair once it has been loaded.
_en_hi_cache = {}


def _load_en_hi_model():
    """Return the English->Hindi tokenizer and model, loading them on first use."""
    if 'pair' not in _en_hi_cache:
        tokenizer = AutoTokenizer.from_pretrained(_EN_HI_MODEL_NAME)
        model = AutoModelForSeq2SeqLM.from_pretrained(_EN_HI_MODEL_NAME)
        # Concurrent first calls may each load once; the last assignment wins
        # and every later call reuses it.
        _en_hi_cache['pair'] = (tokenizer, model)
    return _en_hi_cache['pair']


def translate_to_hindi(texts):
    """Translate English strings to Hindi, one at a time.

    The tokenizer and model are loaded once and reused, instead of being
    re-instantiated for every batch passed to this function.

    Args:
        texts: Iterable of strings.

    Returns:
        List of translations in input order. Empty or whitespace-only inputs
        produce an empty string without calling the model.
    """
    tokenizer, model = _load_en_hi_model()
    translations = []
    for text in tqdm(texts, desc="Translating"):
        if text.strip():  # Skip empty / whitespace-only input
            # truncation=True bounds the input to the tokenizer's maximum length.
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            # **inputs forwards the attention mask together with the input ids.
            outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
            translations.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        else:
            translations.append("")
    return translations

# Load the CSV file
df_subtitles = pd.read_csv('combined_for_translation.csv')

# translate_text stores failed translations as empty strings, which come back
# from read_csv as NaN; normalise them so .split() below does not fail.
df_subtitles['English'] = df_subtitles['English'].fillna('').astype(str)

# Split English sentences and prepare them for translation
split_pairs = df_subtitles['English'].apply(lambda x: split_on_last_stopword(x, stop_words))
df_subtitles['English_Part'] = [head for head, _ in split_pairs]
df_subtitles['To_Translate'] = [tail for _, tail in split_pairs]

batch_size = 50  # Adjust based on your system
translated_texts = []

with ThreadPoolExecutor() as executor:
    futures = []
    for i in tqdm(range(0, len(df_subtitles), batch_size), desc="Processing Batches"):
        batch = df_subtitles['To_Translate'].iloc[i:i+batch_size].tolist()
        futures.append(executor.submit(translate_to_hindi, batch))

    # Collect results in submission order. as_completed() yields batches in
    # the order they finish, which would attach translations to the wrong rows.
    for future in tqdm(futures, total=len(futures), desc="Completing Translations"):
        translated_texts.extend(future.result())

# Update DataFrame with translated texts
df_subtitles['Translated'] = translated_texts

# Combine to form Hinglish sentences; strip() removes the dangling separator
# when there was nothing to translate.
df_subtitles['Hinglish'] = (df_subtitles['English_Part'] + ' ' + df_subtitles['Translated']).str.strip()

# Save back to the same CSV (this overwrites the input of this cell)
df_subtitles.to_csv('combined_for_translation.csv', index=False)

2024-03-25 21:34:02.452248: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Processing Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Completing Translations:   0%|          | 0/40 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

Translating:   0%|          | 0/50 [00:00<?, ?it/s]

In [3]:
import pandas as pd

df = pd.read_csv("combined_for_translation.csv")

# Step 1: Drop rows where nothing was translated, i.e. Hinglish is just the
# English part. Hinglish is built as English_Part + ' ' + Translated, so an
# untranslated row can differ from English_Part only by surrounding
# whitespace; compare the stripped text (NaN treated as empty).
english_part_text = df['English_Part'].fillna('').astype(str).str.strip()
hinglish_text = df['Hinglish'].fillna('').astype(str).str.strip()
df_filtered = df[english_part_text != hinglish_text]

# Step 2: Drop the intermediate helper columns
df_final = df_filtered.drop(columns=['English_Part', 'To_Translate', 'Translated'])

# df_final now contains the cleaned dataset
df_final.to_csv('final_dataset.csv', index=False)