In [1]:
import re
import pandas as pd
import os

def clean_text(text):
    """Return `text` reduced to Devanagari characters and spaces.

    The literal tag "[संगीत]" is dropped first. After that every character
    outside the range ऀ-ॿ (other than a plain space) is removed, which also
    discards Latin letters, ASCII digits, punctuation and newlines. Leading
    and trailing spaces are stripped; runs of inner spaces are left as they are.
    """
    music_tag = re.compile(r'\[संगीत\]')
    outside_devanagari = re.compile(r'[^ऀ-ॿ ]')
    devanagari_only = outside_devanagari.sub('', music_tag.sub('', text))
    return devanagari_only.strip()

def process_subtitle_file(srt_file_path):
    """Read one SRT file and return its cleaned subtitle texts.

    The file is split into cues on blank lines rather than assuming that every
    cue occupies exactly four physical lines. Within a cue, everything after
    the timing line (the one containing "-->") is treated as subtitle text, so
    cues with two or more text lines, or with extra blank lines between them,
    no longer shift every later read onto index/timestamp lines.

    Parameters
    ----------
    srt_file_path : str or path-like
        Location of the UTF-8 encoded .srt file.

    Returns
    -------
    list of str
        One entry per cue whose text is non-empty after `clean_text`; the text
        lines of a multi-line cue are joined with a single space.
    """
    with open(srt_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    subtitles = []
    # Cues are separated by one or more blank (or whitespace-only) lines.
    for cue in re.split(r'\n\s*\n', content.strip()):
        cue_lines = [line.strip() for line in cue.splitlines() if line.strip()]
        # Locate the "start --> end" line; the text is whatever follows it.
        timing_index = next(
            (i for i, line in enumerate(cue_lines) if '-->' in line), None
        )
        if timing_index is None:
            continue  # Not a well-formed cue; nothing to extract.
        cleaned_line = clean_text(' '.join(cue_lines[timing_index + 1:]))
        if cleaned_line:  # Only keep cues that still have text after cleaning
            subtitles.append(cleaned_line)
    return subtitles

# Path to the folder containing the SRT files
subtitles_folder_path = 'subtitles'

# Cleaned subtitle lines gathered from every SRT file, in processing order
all_subtitles = []

# os.listdir returns entries in arbitrary order, so sort the names to make the
# row order of the combined CSV reproducible from one run to the next.
for filename in sorted(os.listdir(subtitles_folder_path)):
    if filename.endswith(".srt"):
        srt_file_path = os.path.join(subtitles_folder_path, filename)
        subtitles = process_subtitle_file(srt_file_path)
        # Append subtitles from the current file to the all_subtitles list
        all_subtitles.extend(subtitles)

# Single-column DataFrame holding all the cleaned subtitles
df_subtitles = pd.DataFrame(all_subtitles, columns=['Hindi'])

# Path where the combined CSV file will be saved
combined_csv_file_path = 'combined_subtitles.csv'

# Save the DataFrame to a single CSV file (no index column)
df_subtitles.to_csv(combined_csv_file_path, index=False)

print("All subtitles have been processed and saved to a single CSV file.")

All subtitles have been processed and saved to a single CSV file.


In [None]:
# Load a sample of the file to inspect its content and structure.
# Use the same relative path the notebook writes the combined CSV to, instead
# of a machine-specific absolute location.
file_path = 'combined_subtitles.csv'
df_sample = pd.read_csv(file_path, nrows=100)  # Load a small sample for inspection

# info() prints its summary directly; the frame preview is shown at the end of the cell.
df_sample.info()

# Remove duplicates and perform basic text normalization on the 'Hindi' column.
# Text normalization includes trimming leading and trailing whitespaces and
# replacing multiple spaces with a single space.

# Remove duplicates; .copy() gives an independent frame so the column
# assignment below cannot trigger SettingWithCopyWarning.
df_no_duplicates = df_sample.drop_duplicates().copy()

# Normalize text: trim and reduce multiple spaces to one (raw string keeps
# the regex escape valid).
df_no_duplicates['Hindi'] = (
    df_no_duplicates['Hindi'].str.strip().str.replace(r'\s+', ' ', regex=True)
)

df_no_duplicates.info()
df_no_duplicates.head()

In [2]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from transformers import pipeline

# CSV produced earlier in the notebook; it is read chunk by chunk below
csv_file_path = 'combined_subtitles.csv'

# Translation pipeline backed by the Helsinki-NLP/opus-mt-hi-en checkpoint
translator = pipeline(task="translation", model="Helsinki-NLP/opus-mt-hi-en")

def translate_text(hindi_text):
    """Translate one Hindi string with the module-level `translator` pipeline.

    Parameters
    ----------
    hindi_text : str
        Text to translate. Blank strings and non-string values (for example
        the NaN pandas uses for empty CSV cells) are not sent to the model.

    Returns
    -------
    str
        The pipeline's 'translation_text' for the input, or "" when there is
        nothing to translate.
    """
    # Guard first: the pipeline expects text, and a missing cell arrives as a float NaN.
    if not isinstance(hindi_text, str) or not hindi_text.strip():
        return ""
    translation = translator(hindi_text, max_length=512)
    return translation[0]['translation_text']

def get_total_rows(file_path):
    """Count the data rows of a CSV file without loading it into memory.

    The count is the number of physical lines minus one header line, so a
    quoted field containing a newline would be counted as more than one row.

    Parameters
    ----------
    file_path : str or path-like
        UTF-8 encoded CSV file with a single header line.

    Returns
    -------
    int
        Number of lines after the header; 0 for an empty or header-only file
        (never negative).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        line_count = sum(1 for _ in file)
    # An empty file has no header to subtract; clamp instead of returning -1.
    return max(0, line_count - 1)

def process_file(file_path, chunksize, translate_fn=None):
    """Translate the 'Hindi' column of a CSV chunk by chunk and append the result to disk.

    Each chunk gets an 'English' column and is appended to a sibling file
    named "<stem>_translated<suffix>" (e.g. data.csv -> data_translated.csv).
    The header is written only when that output file does not exist yet, so
    running this again appends to whatever is already there.

    Parameters
    ----------
    file_path : str or path-like
        Input CSV containing a 'Hindi' column.
    chunksize : int
        Number of rows read, translated and written per iteration.
    translate_fn : callable, optional
        Function mapping one 'Hindi' cell to its translation. Defaults to the
        notebook's `translate_text`.
    """
    # Local import: the notebook does not import Path anywhere else.
    from pathlib import Path

    if translate_fn is None:
        translate_fn = translate_text

    source = Path(file_path)
    # Build the name from stem/suffix rather than str.replace('.csv', ...),
    # which would rewrite every ".csv" in the path and leave a path without
    # that suffix unchanged (i.e. append to the input file).
    output_path = source.with_name(f"{source.stem}_translated{source.suffix}")

    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        chunk['English'] = chunk['Hindi'].apply(translate_fn)
        chunk.to_csv(
            output_path,
            mode='a',
            index=False,
            header=not output_path.exists(),
        )

# Size each chunk at one third of the data rows (integer division)
total_rows = get_total_rows(csv_file_path)
chunk_size = total_rows // 3
if chunk_size < 1:
    # Fewer than three rows: fall back to the smallest usable chunk size
    chunk_size = 1

process_file(csv_file_path, chunk_size)

In [None]:
# The translation step writes its result to disk without updating
# df_subtitles, while the cells below read an 'English' column from it.
# Reload the translated file when that column is missing so the notebook
# works from a fresh kernel. keep_default_na=False keeps empty cells as ""
# rather than NaN, which the later str.split() calls could not handle.
if 'English' not in df_subtitles.columns:
    df_subtitles = pd.read_csv('combined_subtitles_translated.csv', keep_default_na=False)

# Preview only the first rows instead of rendering the whole frame
df_subtitles.head()

In [None]:
# Snapshot the current frame to data.csv; the row index is written as the first column
df_subtitles.to_csv('data.csv', index=True)

In [None]:
import random

def create_hinglish(hindi, english, max_inserts=3, rng=None):
    """Scatter a few words of `hindi` at random positions inside `english`.

    Parameters
    ----------
    hindi : str
        Sentence whose whitespace-separated words are candidates for insertion.
    english : str
        Sentence receiving the inserted words; its own word order is kept.
    max_inserts : int, optional
        Upper bound on how many Hindi words are inserted (default 3). The
        actual number is also limited by how many words `hindi` has.
    rng : random.Random, optional
        Source of randomness. Defaults to the global `random` module; pass a
        seeded `random.Random` for reproducible output.

    Returns
    -------
    str
        The mixed sentence joined with single spaces. An empty `english`
        yields "" (nothing is inserted into an empty sentence).
    """
    rng = random if rng is None else rng
    mixed_words = english.split()
    hindi_words = hindi.split()
    if not mixed_words:
        return ''

    num_inserts = max(0, min(max_inserts, len(hindi_words)))
    # sample() picks distinct positions, so each Hindi word is used at most once.
    for hindi_word in rng.sample(hindi_words, num_inserts):
        # randint is inclusive on both ends, so appending at the end is possible.
        mixed_words.insert(rng.randint(0, len(mixed_words)), hindi_word)
    return ' '.join(mixed_words)

# Build the Hinglish column row by row from the paired Hindi / English sentences
df_subtitles['Hinglish'] = [
    create_hinglish(hindi_sentence, english_sentence)
    for hindi_sentence, english_sentence in zip(df_subtitles['Hindi'], df_subtitles['English'])
]

# Preview the three sentence columns side by side
df_subtitles[['Hindi', 'English', 'Hinglish']].head()

In [None]:
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available
nltk.download('stopwords')

# English stop words, held in a set for membership tests
stop_words = set()
stop_words.update(stopwords.words('english'))

def create_hinglish_based_on_stopwords(english, hindi, stopword_set=None):
    """Keep `english` up to its last stop word and finish with words from `hindi`.

    The English sentence is cut right after its last stop word (the pivot).
    The Hindi sentence is cut at the same word index and its tail is appended.
    The pivot is a position in the English word list that is reused as-is on
    the Hindi word list; no alignment between the two sentences is attempted.

    Parameters
    ----------
    english : str
        English sentence; compared word by word (lower-cased) with the stop words.
    hindi : str
        Hindi sentence supplying the tail.
    stopword_set : collection of str, optional
        Stop words to use. Defaults to the notebook-level `stop_words`.

    Returns
    -------
    str
        `english` unchanged when it contains no stop word; otherwise the
        English head followed by the Hindi tail. When the Hindi sentence has
        no words at or beyond the pivot, only the English head is returned,
        without a trailing space.
    """
    if stopword_set is None:
        stopword_set = stop_words

    words = english.split()
    # Only the last stop word matters; remember the index just past it.
    pivot_index = 0
    for index, word in enumerate(words):
        if word.lower() in stopword_set:
            pivot_index = index + 1

    if pivot_index == 0:
        # No stop words found: return the original sentence untouched.
        return english

    english_part = " ".join(words[:pivot_index])
    hindi_part = " ".join(hindi.split()[pivot_index:])
    return f"{english_part} {hindi_part}" if hindi_part else english_part

# Rebuild the Hinglish column with the stop-word based splitter, one row at a time
df_subtitles['Hinglish'] = [
    create_hinglish_based_on_stopwords(english_sentence, hindi_sentence)
    for english_sentence, hindi_sentence in zip(df_subtitles['English'], df_subtitles['Hindi'])
]

df_subtitles[['English', 'Hindi', 'Hinglish']].head()

In [None]:
from transformers import pipeline
def split_on_last_stopword(sentence, stop_words):
    """Split `sentence` into (head, tail) right after its last stop word.

    Words are compared lower-cased against `stop_words`. If no word is a stop
    word, or the last stop word is also the final word, the original sentence
    is returned unchanged together with an empty tail.
    """
    words = sentence.split()
    # Walk from the end so the first hit is the last stop word.
    for position in range(len(words) - 1, -1, -1):
        if words[position].lower() in stop_words:
            cut = position + 1
            break
    else:
        return sentence, ""

    if cut >= len(words):
        # Nothing follows the stop word, so there is no tail to split off.
        return sentence, ""
    return " ".join(words[:cut]), " ".join(words[cut:])

# Split every English sentence once, then fan the (head, tail) pairs out into
# the part that stays English and the part that will be translated
split_pairs = [
    split_on_last_stopword(sentence, stop_words)
    for sentence in df_subtitles['English']
]
df_subtitles['English_Part'] = [head for head, _ in split_pairs]
df_subtitles['To_Translate'] = [tail for _, tail in split_pairs]

# Translation pipeline for the Helsinki-NLP/opus-mt-en-hi checkpoint.
# This rebinds the name `translator`, replacing whatever pipeline it held before.
translator = pipeline(task="translation_en_to_hi", model="Helsinki-NLP/opus-mt-en-hi")

def translate_to_hindi(text):
    """Translate `text` with the module-level `translator` pipeline.

    Parameters
    ----------
    text : str
        Fragment to translate. Blank strings and non-string values (such as a
        NaN cell) are not sent to the model.

    Returns
    -------
    str
        The pipeline's 'translation_text', or "" when there is nothing to translate.
    """
    # Check the type before calling .strip(): a missing cell is a float, not a str.
    if not isinstance(text, str) or not text.strip():
        return ""
    return translator(text, max_length=512)[0]['translation_text']
# Translate the trailing fragment of every row with the translation pipeline
# (rows with an empty fragment get an empty string back)
df_subtitles['Translated'] = df_subtitles['To_Translate'].apply(translate_to_hindi)

# Combine the English part and the translated fragment to form Hinglish
# sentences; rstrip removes the dangling space left when nothing was translated
df_subtitles['Hinglish'] = (
    df_subtitles['English_Part'] + ' ' + df_subtitles['Translated']
).str.rstrip()

df_subtitles[['English', 'Hindi', 'English_Part', 'To_Translate', 'Translated', 'Hinglish']].head()

In [None]:
# Show the last five rows of the frame
df_subtitles.tail(5)