In [1]:
import pandas as pd

df = pd.read_csv('/kaggle/input/eng-hing/train.txt', names=['en', 'hing'], usecols=['en', 'hing'], sep='\t')
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)


In [2]:
df

Unnamed: 0,en,hing
0,How to play with other people,other people के साथ कैसे खेलें
1,One Hundred Years of Solitude,Solitude के सौ Years
2,Take this route with the help of a local person,किसी स्थानीय व्यक्ति की help से इसी route को प...
3,for the People of the Right Hand,दाहिने हाथ में नामए आमाल लेने People के वास्ते है
4,And those who are fearful of their Lord s doom,और जो लोग अपने Lord के doom से fearful हैं
...,...,...
248324,to lasting friendship between India and Mozamb...,India और मोजाम्बिक के बीच स्थायी मैत्री की काम...
248325,Google Drive cannot be reached at this time,इस time Google Drive तक नहीं पहुंचा जा सकता
248326,Here this is my new sweetheart,यहाँ यह मेरी new sweetheart है
248327,Let them remember that nature is the finest ph...,उन्हें यह याद रखना चाहिये कि finest physician ...


In [3]:
import numpy as np
import re
from unicodedata import normalize


def clean_text(text, language='en'):
    if isinstance(text, float) and np.isnan(text):  # Check if text is NaN
        return ''  # Return empty string for NaN values
    text = normalize('NFD', text)
    if language == 'en':
        text = re.sub('[^A-Za-z .\']+', '', text)
    elif language == 'hing': 
        text = re.sub('[^\u0900-\u097F A-Za-z .\']+', '', text)
    return text

def clean_and_prepare_text(text, language='hing'):
    text = '[start] ' + clean_text(text, language=language) + ' [end]'
    return text

# Apply it to your dataframe like this:
df['en'] = df['en'].apply(lambda row: clean_text(row, language='en'))
df['hing'] = df['hing'].apply(lambda row: clean_and_prepare_text(row, language='hing'))
df


Unnamed: 0,en,hing
0,How to play with other people,[start] other people के साथ कैसे खेलें [end]
1,One Hundred Years of Solitude,[start] Solitude के सौ Years [end]
2,Take this route with the help of a local person,[start] किसी स्थानीय व्यक्ति की help से इसी ro...
3,for the People of the Right Hand,[start] दाहिने हाथ में नामए आमाल लेने People क...
4,And those who are fearful of their Lord s doom,[start] और जो लोग अपने Lord के doom से fearful...
...,...,...
248324,to lasting friendship between India and Mozamb...,[start] India और मोजाम्बिक के बीच स्थायी मैत्र...
248325,Google Drive cannot be reached at this time,[start] इस time Google Drive तक नहीं पहुंचा जा...
248326,Here this is my new sweetheart,[start] यहाँ यह मेरी new sweetheart है [end]
248327,Let them remember that nature is the finest ph...,[start] उन्हें यह याद रखना चाहिये कि finest ph...


In [4]:
en = df['en']
hing = df['hing']

en_max_len = max(len(line.split()) for line in en)
hing_max_len = max(len(line.split()) for line in hing)
sequence_len = max(en_max_len, hing_max_len)


In [5]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# en_tokenizer = Tokenizer()
# en_tokenizer.fit_on_texts(en)
# en_sequences = en_tokenizer.texts_to_sequences(en)
# en_x = pad_sequences(en_sequences, maxlen=sequence_len, padding='post')

# hing_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
# hing_tokenizer.fit_on_texts(hing)
# hing_sequences = fr_tokenizer.texts_to_sequences(hing)
# hing_y = pad_sequences(hing_sequences, maxlen=sequence_len + 1, padding='post')

from collections import defaultdict
import torch
class Tokenizer:
    def __init__(self):
        self.word_index = {}
        self.index_word = {}
    
    def fit_on_texts(self, texts):
        word_freq = defaultdict(int)
        for text in texts:
            for word in text.split():
                word_freq[word] += 1
        self.word_index = {word: i+1 for i, (word, freq) in enumerate(sorted(word_freq.items(), key=lambda x: x[1], reverse=True))}
        self.index_word = {i: word for word, i in self.word_index.items()}

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            seq = [self.word_index.get(word, 0) for word in text.split()]
            sequences.append(seq)
        return sequences
    
    def sequences_to_texts(self, sequences):
        texts = []
        for sequence in sequences:
            words = [self.index_word.get(idx, "<unk>") for idx in sequence if idx > 0]  # Skip padding
            text = " ".join(words)
            texts.append(text)
        return texts

def pad_sequences(sequences, maxlen, padding='post'):
    max_seq_len = max(len(seq) for seq in sequences)
    padded_seqs = torch.zeros((len(sequences), maxlen), dtype=torch.long)
    for i, seq in enumerate(sequences):
        if padding == 'post':
            padded_seqs[i, :len(seq)] = torch.tensor(seq[:maxlen], dtype=torch.long)
        else:  
            padded_seqs[i, -len(seq):] = torch.tensor(seq[-maxlen:], dtype=torch.long)
    return padded_seqs

en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(en)
en_sequences = en_tokenizer.texts_to_sequences(en)
en_x = pad_sequences(en_sequences, maxlen=en_max_len, padding='post')

hing_tokenizer = Tokenizer()
hing_tokenizer.fit_on_texts(hing)
hing_sequences = hing_tokenizer.texts_to_sequences(hing)
hing_y = pad_sequences(hing_sequences, maxlen=hing_max_len, padding='post')


2024-05-08 11:22:15.618534: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-08 11:22:15.618656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-08 11:22:15.754143: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
en_vocab_size = len(en_tokenizer.word_index) + 1
hing_vocab_size = len(hing_tokenizer.word_index) + 1

print(f'Vocabulary size (English): {en_vocab_size}')
print(f'Vocabulary size (Hinglish): {hing_vocab_size}')

Vocabulary size (English): 63518
Vocabulary size (Hinglish): 101275


In [7]:
hing_y[:,:-1]

array([[    1,   105,    89, ...,     0,     0,     0],
       [    1, 15225,     5, ...,     0,     0,     0],
       [    1,    38,  2384, ...,     0,     0,     0],
       ...,
       [    1,   123,    18, ...,     0,     0,     0],
       [    1,   109,    18, ...,     0,     0,     0],
       [    1,     8,   277, ...,     0,     0,     0]], dtype=int32)

In [8]:
inputs = { 'encoder_input': en_x, 'decoder_input': hing_y[:, :-1] }
outputs = hing_y[:, 1:]

In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from keras_nlp.layers import TokenAndPositionEmbedding, TransformerEncoder
from keras_nlp.layers import TransformerDecoder

np.random.seed(42)
tf.random.set_seed(42)

num_heads = 8
embed_dim = 256

encoder_input = Input(shape=(None,), dtype='int64', name='encoder_input')
x = TokenAndPositionEmbedding(en_vocab_size, sequence_len, embed_dim)(encoder_input)
encoder_output = TransformerEncoder(embed_dim, num_heads)(x)
encoded_seq_input = Input(shape=(None, embed_dim))

decoder_input = Input(shape=(None,), dtype='int64', name='decoder_input')
x = TokenAndPositionEmbedding(hing_vocab_size, sequence_len, embed_dim, mask_zero=True)(decoder_input)
x = TransformerDecoder(embed_dim, num_heads)(x, encoded_seq_input)
x = Dropout(0.4)(x)

decoder_output = Dense(hing_vocab_size, activation='softmax')(x)
decoder = Model([decoder_input, encoded_seq_input], decoder_output)
decoder_output = decoder([decoder_input, encoder_output])

model = Model([encoder_input, decoder_input], decoder_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary(line_length=120)

In [10]:
from tensorflow.keras.callbacks import EarlyStopping



callback = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
hist = model.fit(inputs, outputs, epochs=50, validation_split=0.2, callbacks=[callback])

Epoch 1/50




[1m   1/6209[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m30:04:56[0m 17s/step - accuracy: 5.4825e-04 - loss: 11.5222

I0000 00:00:1715167387.403329      81 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1715167387.433179      81 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m6209/6209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 0.0273 - loss: 5.9769

W0000 00:00:1715168210.529096      83 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1715168212.397020      83 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m6209/6209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m918s[0m 145ms/step - accuracy: 0.0272 - loss: 5.9768 - val_accuracy: 0.0280 - val_loss: 5.1709
Epoch 2/50
[1m6209/6209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m890s[0m 143ms/step - accuracy: 0.0293 - loss: 4.9860 - val_accuracy: 0.0313 - val_loss: 4.8307
Epoch 3/50
[1m6209/6209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m891s[0m 144ms/step - accuracy: 0.0411 - loss: 4.4429 - val_accuracy: 0.0446 - val_loss: 4.6143
Epoch 4/50
[1m6209/6209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m889s[0m 143ms/step - accuracy: 0.0558 - loss: 3.9520 - val_accuracy: 0.0360 - val_loss: 4.4895
Epoch 5/50
[1m6209/6209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m887s[0m 143ms/step - accuracy: 0.0517 - loss: 3.5163 - val_accuracy: 0.0377 - val_loss: 4.3568
Epoch 6/50
[1m6209/6209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m891s[0m 143ms/step - accuracy: 0.0717 - loss: 3.1540 - val_accuracy: 0.0523 - val_loss: 4.2974
Epo


KeyboardInterrupt



In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import save_model

# Clear the Keras session and reset the graph
K.clear_session()

# Save the model
save_model(model, '/kaggle/working/model.h5')

In [23]:
from tqdm import tqdm

def translate_text(text, model, en_tokenizer, fr_tokenizer, fr_index_lookup, sequence_len):
    input_sequence = en_tokenizer.texts_to_sequences([text])
    padded_input_sequence = pad_sequences(input_sequence, maxlen=sequence_len, padding='post')
    decoded_text = '[start]'

    for i in range(sequence_len):
        target_sequence = fr_tokenizer.texts_to_sequences([decoded_text])
        padded_target_sequence = pad_sequences(target_sequence, maxlen=sequence_len, padding='post')[:, :-1]
        
        prediction = model([padded_input_sequence, padded_target_sequence])

        idx = np.argmax(prediction[0, i, :]) - 1
        token = fr_index_lookup[idx]
        decoded_text += ' ' + token

        if token == '[end]':
            break
    
    return decoded_text[8:-6] 
hing_vocab = hing_tokenizer.word_index
hing_index_lookup = dict(zip(range(len(hing_vocab)), hing_vocab))

df_test = pd.read_csv('/kaggle/input/eng-hing/test.txt', names=['en', 'hing'], usecols=['en', 'hing'], sep='\t')
df_test['en'] = df_test['en'].apply(lambda row: clean_text(row, language='en'))
df_test['hing'] = df_test['hing'].apply(lambda row: clean_and_prepare_text(row, language='hing'))
en_test = df_test['en']
hing_test = df_test['hing']


texts = en_test[:].values
translated = []

for text in tqdm(texts):
    translated.append(translate_text(text, model, en_tokenizer, hing_tokenizer, hing_index_lookup, sequence_len))

 40%|███▉      | 799/2000 [06:24<09:37,  2.08it/s]


InvalidArgumentError: {{function_node __wrapped__StridedSlice_device_/job:localhost/replica:0/task:0/device:GPU:0}} slice index 113 of dimension 1 out of bounds. [Op:StridedSlice] name: strided_slice/

In [25]:
with open('translated.txt', 'w', encoding='utf-8') as file:
    for translate in translated:
        file.write(translate + '\n')