In [6]:
#import the libraries
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import os

In [7]:
#load the path of the file
source_path="/content/train-source.txt"
target_path="/content/train-target.txt"

In [8]:
#this function was taken from the first challenge (sentiment analysis)
def data_preprocessing(file):
  data_file= []
  with open(file,'r',encoding='utf-8') as f:
    data = f.read()
    data = data.strip().split('</s>')
    for i in data:
      data = re.sub("\n.\n',",'.',i)
      data = re.sub('\n<s>.','',data)
      data = re.sub('\n<s>','',data)
      data = re.sub('\n',' ',data)
      data = re.sub('<s>','',data)
      data_file.append(data)
  return data_file

In [9]:
source_file=data_preprocessing(source_path)
target_file=data_preprocessing(target_path)

In [10]:
df1=pd.DataFrame({'Source': source_file})

In [11]:
df2 = pd.DataFrame({'Target': target_file})

In [12]:
df_merged = pd.concat([df1, df2], axis=1)

In [13]:
def cleaning(df):
  exclude = set(string.punctuation)
  df['Source']=df['Source'].apply(lambda x: x.lower())
  df['Target']=df['Target'].apply(lambda x: x.lower())
  df['Source']=df['Source'].apply(lambda x: re.sub("'", '', x))
  df['Target']=df['Target'].apply(lambda x: re.sub("'", '', x))
  df['Source']=df['Source'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
  df['Target']=df['Target'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
  return df

In [14]:
cleaning(df_merged).head()

Unnamed: 0,Source,Target
0,cinnte go leór thiocfadh dóbhtha bás a fhagh...,cinnte go leor thiocfadh dóibh bás a fháil a...
1,bhí sé follasach go rabh an poll sin ag fosc...,bhí sé follasach go raibh an poll sin ag fosc...
2,dfhéadfadh siad bás fhagháil ar a bhruach a...,dfhéadfadh siad bás a fháil ar a bhruach agus...
3,thiocfadh dóbhtha fosta lámh a chur ina mbás ...,thiocfadh dóibh fosta lámh a chur ina mbás fé...
4,na dhiaidh sin bhí rud éigin dochreidte agu...,ina dhiaidh sin bhí rud éigin dochreidte agus...


In [15]:
df_merged.isnull().sum()

Source    0
Target    0
dtype: int64

In [16]:
all_source_words=set()
for i in df_merged['Source']:
    for word in i.split():
        if word not in all_source_words:
            all_source_words.add(word)

In [17]:
all_target_words=set()
for j in df_merged['Target']:
    for word in j.split():
        if word not in all_target_words:
            all_target_words.add(word)

In [18]:
len(all_source_words)

29467

In [19]:
len(all_target_words)

25071

In [20]:
df_merged['Length of Source']=df_merged['Source'].apply(lambda x:len(x.split(" ")))
df_merged['Length of Target']=df_merged['Target'].apply(lambda x:len(x.split(" ")))

In [21]:
df_merged=df_merged[df_merged['Length of Source']<=30]
df_merged=df_merged[df_merged['Length of Target']<=30]

In [22]:
print("maximum length of Source Sentence ",max(df_merged['Length of Source']))
print("maximum length of Target Sentence ",max(df_merged['Length of Target']))

maximum length of Source Sentence  30
maximum length of Target Sentence  30


In [23]:
df_merged.shape

(38110, 4)

In [30]:
#this code was taken from https://www.analyticsvidhya.com/blog/2019/01/neural-machine-translation-keras/
from keras.preprocessing.text import Tokenizer
def tokenization(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [106]:
source_tokenizer = tokenization(df_merged.Source)
source_vocab_size = len(source_tokenizer.word_index) + 1
source_vocab_size

23673

In [32]:
source_max_length=25

In [33]:
target_tokenizer = tokenization(df_merged.Target)
target_vocab_size = len(target_tokenizer.word_index) + 1
target_max_length = 25
print('Target Vocabulary Size: %d' % target_vocab_size)

Target Vocabulary Size: 20487


In [34]:
from keras_preprocessing.sequence import pad_sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq

In [103]:
from sklearn.model_selection import train_test_split

# split data into train and test set
source_train, source_test = train_test_split(df_merged.Source, test_size=0.01, random_state = 42)
target_train, target_test = train_test_split(df_merged.Target, test_size=0.01, random_state = 42)

In [105]:
# prepare training data
trainX = encode_sequences(source_tokenizer, source_max_length, source_train)
trainY = encode_sequences(target_tokenizer, target_max_length, target_train)

In [37]:
# prepare validation data
testX = encode_sequences(encoder_tokens, source_max_length, source_test)
testY = encode_sequences(target_tokenizer, target_max_length, target_test)

In [38]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
# build NMT model
def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
    model = Sequential()
    model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))
    return model

In [39]:
model = define_model(source_vocab_size, target_vocab_size, source_max_length, target_max_length, 128)

In [40]:
from keras import optimizers
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

  super(RMSprop, self).__init__(name, **kwargs)


In [41]:
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=1, batch_size=16, validation_split = 0.2, 
                    verbose=1)



In [42]:
predict_x=model.predict(testX.reshape((testX.shape[0],testX.shape[1])))
classes_x=np.argmax(predict_x,axis=1)



In [43]:
def get_word(n, tokenizer):
      for word, index in tokenizer.word_index.items():
          if index == n:
              return word
      return None

In [44]:
preds=np.argmax(predict_x,axis=1)

In [47]:
preds_text = []
for i in preds[:10]:
       temp = []
       for j in range(len(i)):
            t = get_word(i[j], source_tokenizer)
            if j > 0:
                if (t == get_word(i[j-1], source_tokenizer)) or (t == None):
                     temp.append('')
                else:
                     temp.append(t)
            else:
                   if(t == None):
                          temp.append('')
                   else:
                          temp.append(t) 

       preds_text.append(' '.join(temp))

In [49]:
pred_df = pd.DataFrame({'actual' : target_test[:10], 'prediction' : preds_text[:10]})
pred_df

Unnamed: 0,actual,prediction
9195,mhallaigh sé an sliabh a bhí thart air agus a...,acht a ag agus a na arsa an a go an a an...
32334,nuair a bhí mise i mo bhrian óg is iomaí be...,acht a na a bhí é bhí a “ a é a rabh ...
20172,bhí fear eile leis a chuir sé in aithne domh ...,acht a agus a ar a ar a ag an a mé a s...
3276,pléiseam ort arsa máire mhánais,le a an a a agus a a agus a an ...
32754,dimigh an séideán bradach nuair a bhí a chuid...,tá go a bhí agus a agus an ag an a bhí an ...
22544,bhí an athair thalún acu mar leigheas ar an d...,acht a ar a na a na a mé a “ a as ar ...
4174,rith anois,arsa a a a a a a a a...
9692,an ea nach raibh a dhath a dtiocfadh leis a d...,tá go an bhí agus a agus ag agus a bhí an...
27486,chonacthas taibhse an pheata bháin ar chnoc m...,sí sé a bhí sé a na go ag a sé a go a a...
38375,dar fia na saighdiúirí arsa mise,“ a agus a an a na a a a agus a ...


In [52]:
#load the path of the file
source_path_test="/content/test-source.txt"
target_path_test="/content/test-target.txt"

In [53]:
source_file_test=data_preprocessing(source_path_test)
target_file_test=data_preprocessing(target_path_test)

In [54]:
df3 = pd.DataFrame({'Source': source_file_test})
df3.head()

Unnamed: 0,Source
0,Scéal Chathail Freeman - Téid mo Dhearbhrátha...
1,"MÍ Iúil a bhí ann i mbliadhain a 1854 , nuair..."
2,"An dearbhráthair a ba sine agam , Seán Freema..."
3,"Tráthnóna breágh amháin , chuaidh sé a dh'ias..."
4,Bhí sé go maith i n-a shláinte agus nuair a b...


In [55]:
df4 = pd.DataFrame({'Target': target_file_test})
df4.head()

Unnamed: 0,Target
0,Scéal Chathail Freeman - Téann mo dheartháir ...
1,"Mí Iúil a bhí ann i mbliain a 1854 , nuair a ..."
2,"An deartháir ba sine agam , Sean Freeman , tu..."
3,"Tráthnóna breá amháin , chuaigh sé a dh'iasca..."
4,Bhí sé go maith ina shláinte agus nuair a bhí...


In [56]:
df_merged1= pd.concat([df3, df4], axis=1)
df_merged1

Unnamed: 0,Source,Target
0,Scéal Chathail Freeman - Téid mo Dhearbhrátha...,Scéal Chathail Freeman - Téann mo dheartháir ...
1,"MÍ Iúil a bhí ann i mbliadhain a 1854 , nuair...","Mí Iúil a bhí ann i mbliain a 1854 , nuair a ..."
2,"An dearbhráthair a ba sine agam , Seán Freema...","An deartháir ba sine agam , Sean Freeman , tu..."
3,"Tráthnóna breágh amháin , chuaidh sé a dh'ias...","Tráthnóna breá amháin , chuaigh sé a dh'iasca..."
4,Bhí sé go maith i n-a shláinte agus nuair a b...,Bhí sé go maith ina shláinte agus nuair a bhí...
...,...,...
996,Chonnaic muid fosta caidé mar chuaidh an dul ...,Chonaic muid fosta cad é mar a chuaigh an dol...
997,Fuair muid comharthaí go leór go rabh na dear...,Fuair muid comharthaí go leor go raibh na dea...
998,Chuartuigh muid an uaimh mhór arís agus arís ...,Chuartaigh muid an uaimh mhór arís agus arís ...
999,Bhíomar ag cuartughadh agus ag breath - nugha...,Bhí muid ag cuartú agus ag breathnú ní ba ghr...


In [57]:
cleaning(df_merged1).head()

Unnamed: 0,Source,Target
0,scéal chathail freeman téid mo dhearbhráthai...,scéal chathail freeman téann mo dheartháir c...
1,mí iúil a bhí ann i mbliadhain a 1854 nuair ...,mí iúil a bhí ann i mbliain a 1854 nuair a b...
2,an dearbhráthair a ba sine agam seán freeman...,an deartháir ba sine agam sean freeman tuga...
3,tráthnóna breágh amháin chuaidh sé a dhiasca...,tráthnóna breá amháin chuaigh sé a dhiascair...
4,bhí sé go maith i na shláinte agus nuair a bh...,bhí sé go maith ina shláinte agus nuair a bhí...


In [58]:
source_tokenizer_test = tokenization(df_merged1.Source)
source_vocab_size_test = len(source_tokenizer_test.word_index) + 1
source_vocab_size_test

3188

In [59]:
source_max_length_test=30

In [101]:
target_tokenizer_test= tokenization(df_merged1.Target)
target_vocab_size_test = len(target_tokenizer_test.word_index) + 1
target_max_length_test = 30
print('Target Vocabulary Size: %d' % target_vocab_size_test)

Target Vocabulary Size: 2988


In [61]:
source_test1 = encode_sequences(source_tokenizer_test, source_max_length_test, df_merged1.Source)
target_test1 = encode_sequences(target_tokenizer_test, target_max_length_test, df_merged1.Target)

In [62]:
predict_test=model.predict(target_test1.reshape((target_test1.shape[0],target_test1.shape[1])))
classes_test=np.argmax(predict_test,axis=1)



In [63]:
preds_test=np.argmax(predict_test,axis=1)

In [88]:
preds_text_test = []
for i in preds_test[:10]:
       temp1 = []
       for j in range(len(i)):
            t = get_word(i[j], target_tokenizer_test)
            if j > 0:
                if (t == get_word(i[j-1], target_tokenizer_test)) or (t == None):
                     temp1.append('')
                else:
                     temp1.append(t)
            else:
                   if(t == None):
                          temp1.append('')
                   else:
                          temp1.append(t) 

       preds_text_test.append(' '.join(temp1))

In [89]:
target_test1.reshape((target_test1.shape[0],target_test1.shape[1])).shape

(1001, 25)

In [90]:
preds_new=np.array(preds_text_test)

In [98]:
import nltk

hypothesis = list(df_merged1.Source)
reference = list(df_merged1.Target)
BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights = [0.1])
print(BLEUscore)

0.7410603763959202
