In [1]:
import os
import re
import warnings
import numpy as np
import pandas as pd
import pickle

from nltk import ngrams
from sklearn.model_selection import train_test_split

In [2]:

# Patterns for URLs, hashtags and @mentions. In the visible notebook they are
# referenced only from the commented-out code inside tokenizer().
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
hashtag_regex = r"#(\w+)"
mention_regex = r"\B@(?!(?:[a-z0-9.]*_){2})(?!(?:[a-z0-9_]*\.){2})[._a-z0-9]{3,24}\b"
# Padding prefix; only referenced from commented-out code in the visible notebook.
start = ['<pad>','<pad>','<pad>','<pad>']
# end = np.array(['<pad>','<pad>','<pad>'])
# Punctuation marks; likewise only referenced from commented-out code.
punc = [',', '.', '!', ';', '?']
def tokenizer(messages):
    """Lower-case one line of text, strip punctuation and split it into words.

    Args:
        messages (str): a single raw line of text.
    Returns:
        list[str]: the non-empty whitespace-separated tokens, in their
        original order. Every character that is neither a word character
        nor whitespace is removed before splitting.
    """
    messages = messages.lower()
    # Drop everything that is not a word character or whitespace.
    messages = re.sub(r'[^\w\s]', '', messages)
    # Raw-string pattern: a non-raw '\s' is an invalid escape sequence and
    # triggers a warning on recent Python versions. '\s' already covers tabs
    # and newlines, and empty pieces are filtered out.
    return [word for word in re.split(r'\s+', messages) if word]


In [4]:
def read_text(path):
    """Read a text file and return its lines.

    Args:
        path (string): path of the text file to read.
    Returns:
        list: the lines of the file in order, exactly as produced by
        ``readlines()`` (each line keeps its trailing newline).
    """
    # The context manager guarantees the handle is closed; the previous
    # version opened the file and never closed it.
    with open(path, 'r') as f:
        return f.readlines()


def preprocess_text(text):
    """Tokenize every line of the input.

    Args:
        text (iterable of string): raw input lines
    Returns:
        list: one entry per input line, each being ``tokenizer(line)``
    """
    return [tokenizer(raw_line) for raw_line in text]
#     text = text.lower()
#     text = re.sub(r"'s\b", "", text)
#     text = re.sub("[^a-zA-Z]", " ", text)
#     text = ' '.join([word for word in text.split() if len(word) >= 3]).strip()

#     return text


def prepare_text(data, n, word_id):
    """Encode tokenized sentences as id sequences with start/pad/end markers.

    Each sentence becomes ``[2] + ids + [1] * (n - len(sentence) - 2) + [3]``,
    i.e. the id 2 first, then the word ids, then padding ids 1, and the id 3
    last (after the padding).

    Args:
        data (list): list of sentences, each a list of tokens
        n (int): target sequence length; no padding is added when the
            sentence already has ``n - 2`` or more tokens
        word_id (dict): token -> integer id; every token must be a key
    Returns:
        list: one list of integer ids per sentence
    """
    encoded = []
    for sentence in data:
        token_ids = [word_id[token] for token in sentence]
        # A non-positive count yields an empty list, so long sentences get no padding.
        padding = [1] * (n - len(sentence) - 2)
        encoded.append([2] + token_ids + padding + [3])
    return encoded

def create_data(data, word_id, id_word):
    """Turn 5-token windows into (4-id context, 1-id target) arrays.

    Args:
        data (iterable): sequences with at least five tokens each; only the
            first five are used
        word_id (dict): token -> integer id, must contain '<UNKNOWN>' when an
            unseen token occurs
        id_word (dict): not used by this function
    Returns:
        numpy array: ids of the first four tokens of every sequence
        numpy array: id of the fifth token of every sequence
    """
    def encode(token):
        # Unseen tokens fall back to the '<UNKNOWN>' id.
        return word_id[token] if token in word_id else word_id['<UNKNOWN>']

    contexts = []
    targets = []
    for window in data:
        context = [encode(window[pos]) for pos in range(4)]
        targets.append(encode(window[4]))
        contexts.append(context)
    return np.array(contexts), np.array(targets)
    
    


def split_data(mapping, encoded_sequence):
    """ Function to split the prepared data in train and test
    Args:
        mapping (dict): vocabulary mapping; only its size is used, as the
            number of one-hot classes
        encoded_sequence (list): equal-length integer sequences; the last
            element of each row is the target, the rest are the inputs
    Returns:
        numpy array : X_train, X_test, y_train, y_test (90% / 10% split with
        a fixed random_state of 42); the targets are one-hot float32 rows
    """

    encoded_sequence_ = np.array(encoded_sequence)
    X, y = encoded_sequence_[:, :-1], encoded_sequence_[:, -1]
    # One-hot encode with NumPy. The previous call to `to_categorical` raised
    # NameError because that name is never imported in this notebook.
    y = np.eye(len(mapping), dtype='float32')[y.astype('int64')]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42)
    return X_train, X_test, y_train, y_test

def create_vocab(data, threshold =  5):
    """Build a vocabulary and replace rare tokens with '<UNKNOWN>'.

    Tokens whose frequency over ``data`` is less than or equal to
    ``threshold`` are overwritten IN PLACE with '<UNKNOWN>'. Ids 0-3 are
    reserved for '<UNKNOWN>', '<PAD>', '<START>' and '<END>'; the remaining
    tokens get ids from 4 upwards in order of first appearance.

    Args:
        data (list): list of sentences, each a mutable list of tokens
        threshold (int): maximum frequency that is still treated as rare
    Returns:
        list: the same ``data`` object, after the in-place replacement
        dict: token -> id
        dict: id -> token
    """
    counts = {}
    for sentence in data:
        for token in sentence:
            counts[token] = counts.get(token, 0) + 1

    # Counts are not updated during replacement, so every position is judged
    # by the frequency of its original token.
    for sentence in data:
        for pos, token in enumerate(sentence):
            if counts[token] <= threshold:
                sentence[pos] = '<UNKNOWN>'

    word_id = {'<UNKNOWN>': 0, '<START>': 2, '<END>': 3, '<PAD>': 1}
    id_word = {0: '<UNKNOWN>', 1: '<PAD>', 2: '<START>', 3: '<END>'}
    next_id = 4
    for sentence in data:
        for token in sentence:
            if token not in word_id:
                word_id[token] = next_id
                id_word[next_id] = token
                next_id += 1
    return data, word_id, id_word

In [5]:
# Load the English and French training files as lists of lines.
# Assumed to be line-aligned: later cells pair the two lists by index.
text_en = read_text('./dataset/ted-talks-corpus/train.en')
text_fr = read_text('./dataset/ted-talks-corpus/train.fr')

In [6]:
# text_en_test = read_text('./dataset/ted-talks-corpus/test.en')
# text_fr_test = read_text('./dataset/ted-talks-corpus/test.fr')

In [7]:
# Tokenize every line of both corpora (one token list per line).
data_en = preprocess_text(text_en)
data_fr = preprocess_text(text_fr)

In [8]:
# Fixed encoded sequence lengths passed to prepare_text for each language.
mx_en = 40
mx_fr = 40
# NOTE(review): co / co1 are not referenced anywhere else in the visible notebook.
co = np.zeros(500)
co1 = np.zeros(500)
da_en = []
da_fr = []
# Original line index of every sentence pair that is kept.
sen_id = []
for i in range(len(data_en)):
    # Keep a pair only if both sides have at most 38 tokens, which leaves
    # room for the two marker ids prepare_text adds within a length of 40.
    if len(data_en[i])<=38 and len(data_fr[i])<=38:
        da_en.append(data_en[i])
        da_fr.append(data_fr[i])
        sen_id.append(i)
# Cell output: number of sentence pairs kept.
len(da_en)

In [9]:
# Build one vocabulary per language; tokens occurring 10 times or fewer are
# replaced by '<UNKNOWN>' (create_vocab rewrites the token lists in place).
da_en , word_id_en , id_word_en = create_vocab(da_en,10)
da_fr , word_id_fr , id_word_fr = create_vocab(da_fr,10)

In [10]:
# Encode the filtered sentences as integer id sequences of length mx_en / mx_fr.
en_seq = np.array(prepare_text(da_en, mx_en, word_id_en))
fr_seq = np.array(prepare_text(da_fr, mx_fr, word_id_fr))

In [11]:
# data_test_en = preprocess_text(text_en_test)
# data_test_fr = preprocess_text(text_fr_test)

In [12]:
def prepare_test(line, word_id):
    """Replace every token that is not a key of ``word_id`` with '<UNKNOWN>'.

    Args:
        line (iterable): tokens of one sentence
        word_id (dict): vocabulary mapping; only key membership is used
    Returns:
        list: the tokens of ``line`` with unseen ones replaced
    """
    return [token if token in word_id else '<UNKNOWN>' for token in line]
        
# The cells that used to load and tokenize the test corpus (In [6] and
# In [11]) are commented out, so on a fresh kernel the names below were
# undefined and this cell raised NameError. Load them here, using the same
# paths as those commented-out cells, so the notebook runs top to bottom.
text_en_test = read_text('./dataset/ted-talks-corpus/test.en')
text_fr_test = read_text('./dataset/ted-talks-corpus/test.fr')
data_test_en = preprocess_text(text_en_test)
data_test_fr = preprocess_text(text_fr_test)

sen_id_test = []
test_data = []
da_en_test = []
da_fr_test = []
for i in range(len(text_en_test)):
    # Same length filter as for the training data (38 tokens + 2 markers = 40).
    if len(data_test_en[i])<=38 and len(data_test_fr[i])<=38:
        sen_id_test.append(i)
        # Map tokens unseen at training time to '<UNKNOWN>' before encoding.
        da_en_test.append(prepare_test(data_test_en[i], word_id_en))
        da_fr_test.append(prepare_test(data_test_fr[i], word_id_fr))
en = np.array(prepare_text(da_en_test, 40, word_id_en))
fr = np.array(prepare_text(da_fr_test, 40, word_id_fr))

# Pair every encoded English sentence with its French counterpart.
for i in range(len(da_en_test)):
    test_data.append((en[i], fr[i]))

In [13]:
# Vocabulary sizes, including the four reserved ids created by create_vocab.
en_vocab_size = len(word_id_en)
fr_vocab_size = len(word_id_fr)
# Passed as hidden_size when the Seq2seq model is constructed later on.
embedding_size = 50
# Cell output: both vocabulary sizes.
en_vocab_size, fr_vocab_size

In [14]:
# Training set as a list of (english_ids, french_ids) pairs, in corpus order.
train_data = []
for i in range(len(en_seq)):
    train_data.append((en_seq[i],fr_seq[i]))

In [15]:
# import libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from torchvision import datasets, transforms
import cv2
import matplotlib.pyplot as plt
import numpy as np 

%matplotlib inline

# for reproducibility
torch.manual_seed(1)

In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [17]:
import torch.nn as nn
import torch.nn.init as init

class EncoderRNN(nn.Module):
    """Bidirectional LSTM encoder over embedded token ids.

    Tokens are embedded into ``hidden_size`` dimensions and fed to a
    batch-first bidirectional LSTM whose per-direction width is
    ``hidden_size // 2``.

    Args:
        vocab_size (int): number of rows of the embedding table
        hidden_size (int): embedding dimension
        n_layers (int): number of stacked LSTM layers
    """

    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # Width of each LSTM direction. (A leftover debug print of this value
        # was removed from the constructor.)
        self.si2 = self.hidden_size//2
        self.lstm = nn.LSTM(input_size=self.hidden_size,hidden_size=self.si2,num_layers=self.n_layers,batch_first=True,bidirectional=True)

    def forward(self, word_inputs, hidden):
        """Embed ``word_inputs`` and run them through the LSTM.

        Args:
            word_inputs (Tensor): integer token ids, batch dimension first
            hidden (tuple): initial ``(h_0, c_0)`` state, e.g. from init_hidden
        Returns:
            tuple: ``(output, (h_n, c_n))`` as returned by ``nn.LSTM``
        """
        embedded = self.embedding(word_inputs)

        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def init_hidden(self, batches):
        """Create a random initial ``(h_0, c_0)`` state for ``batches`` sequences.

        Both tensors have shape ``(2 * n_layers, batches, hidden_size // 2)``
        and are moved to the device that holds this module's parameters, so
        the method no longer depends on a notebook-level ``device`` global.
        """
        target_device = self.embedding.weight.device
        state_shape = (2*self.n_layers, batches, self.si2)
        hidden = (torch.randn(*state_shape).to(target_device),torch.randn(*state_shape).to(target_device))
        return hidden


class DecoderRNN(nn.Module):
    """Unidirectional LSTM decoder that consumes one token per call.

    Args:
        vocab_size (int): number of rows of the embedding table
        hidden_size (int): embedding dimension and LSTM hidden width
        n_layers (int): number of stacked LSTM layers
    """

    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super(DecoderRNN, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Creation order (embedding first, then LSTM) is kept unchanged.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=False,
        )

    def forward(self, word_inputs, hidden):
        """Run a single decoding step.

        Args:
            word_inputs (Tensor): token ids, one per batch element
            hidden (tuple): current ``(h, c)`` LSTM state
        Returns:
            tuple: ``(output, (h, c))`` from the LSTM; a length-1 sequence
            axis is inserted at position 1 of the embedded input
        """
        step_input = self.embedding(word_inputs).unsqueeze(1)
        output, new_hidden = self.lstm(step_input, hidden)
        return output, new_hidden
    


In [18]:
class Seq2seq(nn.Module):
    """Encoder-decoder translation model built from EncoderRNN and DecoderRNN.

    The final hidden and cell states of the bidirectional encoder are
    reshaped into the initial state of the unidirectional decoder, and a
    linear layer ``W`` maps every decoder output to vocabulary logits.

    Args:
        input_vocab_size (int): source vocabulary size
        output_vocab_size (int): target vocabulary size
        hidden_size (int): embedding / decoder hidden width
        n_layers (int): number of LSTM layers in encoder and decoder
    """

    def __init__(self, input_vocab_size, output_vocab_size, hidden_size, n_layers):
        super(Seq2seq, self).__init__()

        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.encoder = EncoderRNN(input_vocab_size, hidden_size, self.n_layers)
        self.decoder = DecoderRNN(output_vocab_size, hidden_size, self.n_layers)

        self.W = nn.Linear(hidden_size, output_vocab_size)

        # Not used by any method of this class; kept so the module's
        # attributes and printed structure stay unchanged.
        self.softmax = nn.Softmax()

    def _forward_encoder(self, x):
        """Encode ``x`` and return the decoder's initial ``(h, c)`` state.

        The encoder state of shape ``(2 * n_layers, batch, hidden_size // 2)``
        is rearranged to ``(n_layers, batch, hidden_size)``. The result is
        also stored on ``self.decoder_hidden_h`` / ``self.decoder_hidden_c``.
        """
        batch_size = x.shape[0]
        init_hidden = self.encoder.init_hidden(batch_size)
        encoder_outputs, encoder_hidden = self.encoder(x, init_hidden)
        encoder_hidden_h, encoder_hidden_c = encoder_hidden

        # .contiguous() only changes the memory layout, not the values.
        self.decoder_hidden_h = encoder_hidden_h.permute(1,0,2).reshape(batch_size, self.n_layers, self.hidden_size).permute(1,0,2).contiguous()
        self.decoder_hidden_c = encoder_hidden_c.permute(1,0,2).reshape(batch_size, self.n_layers, self.hidden_size).permute(1,0,2).contiguous()
        return self.decoder_hidden_h, self.decoder_hidden_c

    def forward(self, x, y):
        """Teacher-forced decoding.

        Args:
            x (Tensor): source ids, shape ``(batch_size, src_len)``
            y (Tensor): target ids, shape ``(batch_size, seq_len)``; column
                ``i`` is the decoder input at step ``i``
        Returns:
            Tensor: logits of shape ``(batch_size, vocab_size, seq_len)``,
            where slice ``[:, :, i]`` is produced after feeding ``y[:, i]``
        """
        decoder_hidden = self._forward_encoder(x)

        step_logits = []
        for i in range(y.shape[1]):
            decoder_output, decoder_hidden = self.decoder(y[:, i], decoder_hidden)
            # (batch_size, vocab_size)
            step_logits.append(self.W(decoder_output.squeeze(1)))

        # (batch_size, vocab_size, seq_len)
        return torch.stack(step_logits, dim=2)

    def forward2(self, x, id_word=None, max_len=40):
        """Greedy decoding of a single source sentence.

        Args:
            x (Tensor): source ids of shape ``(1, src_len)``; a 1-D tensor of
                shape ``(src_len,)`` is also accepted and treated as a batch
                of one
            id_word (dict, optional): id -> token mapping for the output;
                defaults to the notebook-level ``id_word_fr``
            max_len (int): maximum number of decoding steps
        Returns:
            list: the generated tokens. Decoding starts from id 2 and stops
            after id 3 is produced or after ``max_len`` steps; id 1 is
            skipped.
        Raises:
            ValueError: if ``x`` holds more than one sentence
        """
        if id_word is None:
            id_word = id_word_fr
        if x.dim() == 1:
            # A bare sequence was passed; add the batch axis the encoder expects.
            x = x.unsqueeze(0)
        if x.shape[0] != 1:
            raise ValueError('forward2 decodes one sentence at a time, got batch size %d' % x.shape[0])

        decoder_hidden = self._forward_encoder(x)

        current_y = 2
        result = []
        counter = 0
        while current_y!=3 and counter < max_len:
            # Build the step input on the same device as the source tensor
            # instead of relying on a notebook-level `device` global.
            step_input = torch.tensor([current_y], device=x.device)
            decoder_output, decoder_hidden = self.decoder(step_input, decoder_hidden)
            # h: (vocab_size)
            h = self.W(decoder_output.squeeze(1)).squeeze(0)
            y = F.softmax(h, dim = 0)
            _, current_y = torch.max(y, dim=0)
            current_y = current_y.item()
            # NOTE(review): the token for id 3 is appended to the result before
            # the loop stops; kept as in the original, confirm this is intended.
            if current_y != 1:
                result.append(id_word[current_y])
            counter += 1

        return result



In [19]:
from nltk.translate.bleu_score import sentence_bleu
def bleuScore(model, train_data, batch=1):
    """Translate every sample with ``model.forward2`` and score it with BLEU.

    Args:
        model: model exposing ``forward2``, ``eval`` and ``train``
        train_data: sequence of ``(source_ids, target_ids)`` pairs
        batch (int): loader batch size; ``forward2`` decodes one sentence at
            a time, so only the default of 1 is supported
    Returns:
        list: one sentence-level BLEU score per sample
        list: the generated token list for every sample
    """
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch, shuffle=False)
    # Ids of '<PAD>', '<START>' and '<END>' as assigned by create_vocab.
    special_ids = (1, 2, 3)
    scores = []
    translations = []
    # Remember the mode so a training loop that calls this function is not
    # silently left in eval mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for imgs, labels in train_loader:
                sen2 = model.forward2(imgs.to(device))
                translations.append(sen2)
                # Decode the reference from this sample's own target ids.
                # The previous version read da_fr[count], which only matches
                # when the full, unshuffled training set is passed in; for
                # shuffled subsets or test data it scored against the wrong
                # sentence.
                sen1 = [id_word_fr[token_id] for token_id in labels[0].tolist()
                        if token_id not in special_ids]
                # With fewer than four tokens on either side, score with
                # unigram and bigram weights only.
                if len(sen1)<4 or len(sen2)<4:
                    score = sentence_bleu([sen1],sen2,weights=(0.5,0.5))
                else:
                    score = sentence_bleu([sen1],sen2)
                scores.append(score)
    finally:
        model.train(was_training)
    return scores, translations

In [20]:
import random
def trainvalAdam(model, train_data, device, batch_size=100, num_iters=1, learn_rate=0.01):
    """Train ``model`` with Adam and track a validation BLEU score per epoch.

    A shuffled copy of ``train_data`` is split into the first 25000 pairs
    for training and the remainder for validation. Training stops early when
    the validation score has not improved for more than 10 epochs.

    Args:
        model: Seq2seq-style model with ``forward(x, y)`` returning logits of
            shape ``(batch, vocab, seq_len)``
        train_data (list): ``(source_ids, target_ids)`` pairs
        device: device the batches are moved to
        batch_size (int): training batch size
        num_iters (int): maximum number of epochs
        learn_rate (float): Adam learning rate
    Returns:
        list: per-epoch summed batch loss divided by ``len(train_data)``
    """
    yo = train_data.copy()
    random.shuffle(yo)
    train_loader = torch.utils.data.DataLoader(yo[:25000], batch_size=batch_size, shuffle=True) # shuffle after every epoch
    criterion = nn.CrossEntropyLoss()
    # learn_rate was previously ignored in favour of a hard-coded 0.01.
    optimizer = optim.Adam(model.parameters(), lr=learn_rate, betas=(0.90, 0.98), eps=1e-08, weight_decay=0, amsgrad=False)

    losses = []
    avg_scores = []
    prev = 0
    best_score = -1e9
    # NOTE(review): the epoch loss is normalised by the size of the full
    # data set, not by the number of batches; kept so values stay comparable.
    si = len(train_data)
    for n in tqdm(range(num_iters)):
        # Make sure training mode is active even if evaluation changed it.
        model.train()
        tot_loss = 0.0
        for imgs, labels in train_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            out = model.forward(imgs,labels)             # forward pass
            # out[:, :, i] is produced after feeding labels[:, i], so it must
            # be scored against the NEXT token. The previous slices
            # (out[:,:,1:39] vs labels[:,1:39]) trained each step to
            # reproduce the token it had just been given.
            loss = criterion(out[:, :, :-1], labels[:, 1:])
            optimizer.zero_grad()
            loss.backward()               # backward pass (compute parameter updates)
            optimizer.step()  # make the updates for each parameter
            # Accumulate a plain float so no tensors are kept across batches.
            tot_loss += loss.item()
        losses.append(tot_loss/si)
        # bleuScore returns (scores, translations); only the scores are averaged.
        scores, _ = bleuScore(model , yo[25000:])
        score = sum(scores)/len(scores) if scores else 0.0
        avg_scores.append(score)
        print('epoch '+str(n)+' = '+str(tot_loss/si)+'  and  score is: '+str(score))
        if score>best_score:
            best_score = score
            prev = n
        # Early stopping: no improvement for more than 10 epochs.
        if n-prev>10:
            break

    return losses

In [None]:
# model = Seq2seq(en_vocab_size, fr_vocab_size, embedding_size, 1)
# print(model)
# # device = "cuda" if torch.cuda.is_available() else "cpu"
# # device = "cpu"
# print(device)
# model.to(device)
# train_losses = trainvalAdam(model, train_data, device, num_iters=1000)

In [None]:
# ep = [i for i in range(1,14)]
# loss = []
# for i in train_losses:
#     loss.append(float(i))
# plt.plot(ep,loss)

In [None]:
# score = [0.01914861144514335,  0.12519344714329764,  0.12519344714329764,0.12519344714329764,0.12503745053114246,0.12266798567987887, 0.12266798567987887,0.12251198906772368, 0.12251198906772368, 0.12251198906772368, 0.12251198906772368, 0.12251198906772368, 0.12251198906772368]
# plt.plot(ep, score)

In [21]:
# Rebuild the architecture and restore the trained weights.
model = Seq2seq(en_vocab_size, fr_vocab_size, embedding_size, 1)
model.to(device)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
# machine (and vice versa) instead of failing during deserialisation.
model.load_state_dict(torch.load('./models/model_translate_1', map_location=device))

In [None]:
# torch.save(model.state_dict(),'./model_translate_1')

In [22]:
# test_scores = bleuScore(model, test_data)

In [23]:
# train_scores = bleuScore(model, train_data)

In [None]:
# avg_score = np.array(train_scores).sum()/len(train_scores)
# f = open('train-translation.txt','w')
# f.write('Average bleu score = '+str(avg_score)+'\n')
# for i in range(len(sen_id)):
#     f.write(text_en[sen_id[i]]+'     '+str(float(train_scores[i]))+'\n')

In [None]:
# avg_score = np.array(test_scores).sum()/len(test_scores)
# f = open('test-translation.txt','w')
# f.write('Average bleu score = '+str(avg_score)+'\n')
# for i in range(len(sen_id_test)):
#     f.write(text_en_test[sen_id_test[i]]+'     '+str(float(test_scores[i]))+'\n')

In [None]:
# Read one English sentence interactively and tokenize it.
# Note: this rebinds text_en_test and data_test_en to the single user input.
sent = input('Give english sentence as input: \n')
text_en_test = [sent]
data_test_en = preprocess_text(text_en_test)

def prepare_test(line, word_id):
    """Replace every token that is not a key of ``word_id`` with '<UNKNOWN>'.

    Args:
        line (iterable): tokens of one sentence
        word_id (dict): vocabulary mapping; only key membership is used
    Returns:
        list: the tokens of ``line`` with unseen ones replaced
    """
    return [token if token in word_id else '<UNKNOWN>' for token in line]
        
da_en_test = []
for i in range(len(text_en_test)):
    # Only the English input is length-checked. The previous condition also
    # indexed data_test_fr, which has nothing to do with the typed sentence
    # and is undefined on a fresh kernel.
    if len(data_test_en[i])<=38:
        da_en_test.append(prepare_test(data_test_en[i], word_id_en))

if not da_en_test:
    # Previously an over-long sentence ended in an IndexError on en[0].
    print('Input sentence has more than 38 tokens and cannot be translated.')
else:
    en = torch.tensor(prepare_text(da_en_test, 40, word_id_en)).to(device)

    with torch.no_grad():
        model.eval()
        # Keep the batch axis: the encoder takes its batch size from
        # x.shape[0], so a (1, seq_len) slice is passed rather than 1-D en[0].
        sen = model.forward2(en[:1])
        print('output translation is: \n',sen)