## Machine Translation

### Generate translation embedding and calculate optimal transform matrices R with gradient loss.  
 (Translation as linear transformation of embeddings)

In [29]:
import pickle
import gensim
import pdb
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from gensim.models import KeyedVectors
from nltk.stem.porter import *

In [11]:
# get the positive and negative tweets
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = all_positive_tweets + all_negative_tweets

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Sealion\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sealion\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Data Cleaning
def process_tweet(tweet):
    # tweet: a string containing a tweet
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)
    return tweets_clean

In [12]:
def get_document_embedding(tweet, en_embeddings): 
    doc_embedding = np.zeros(300)
    # process the document into a list of words (process the tweet)
    processed_doc = process_tweet(tweet) 
    for word in processed_doc:
        doc_embedding += en_embeddings.get(word, 0)
    return doc_embedding

In [25]:
def cosine_similarity(A, B):
    #  A and B: a numpy array which corresponds to a word vector
   
    dot = np.dot(A,B)
    norma = np.sqrt(np.dot(A,A))
    normb = np.sqrt(np.dot(B,B)) 
    cos = dot / (norma * normb)
    return cos

In [26]:
def get_dict(file_name):
    my_file = pd.read_csv(file_name, delimiter=' ')
    etof = {}  # the english to french dictionary to be returned
    for i in range(len(my_file)):
        en = my_file.loc[i][0]
        fr = my_file.loc[i][1]
        etof[en] = fr

    return etof

In [None]:
en_embeddings_subset = pickle.load(open("en_embeddings.p", "rb"))   # subset of English
fr_embeddings_subset = pickle.load(open("fr_embeddings.p", "rb"))   # subset of French

In [41]:
# Test
custom_tweet = "RT @Twitter @chapagain Hello we are there! Have a great day, bye bye. :) #good #morning http://chapagain.com.np"
tweet_embedding = get_document_embedding(custom_tweet, en_embeddings_subset)
tweet_embedding[-5:]

array([-0.00268555, -0.15378189, -0.55761719, -0.07216644, -0.32263184])

In [30]:
en_fr_train = get_dict('en-fr.train.txt')           # English to French training dictionary
print('The length of the English to French training dictionary is', len(en_fr_train))
en_fr_test = get_dict('en-fr.test.txt')             # English to French test dictionary
print('The length of the English to French test dictionary is', len(en_fr_train))

The length of the English to French training dictionary is 5000
The length of the English to French test dictionary is 5000


In [31]:
# Iterate over English owrds in english_france dictionary, check if both en_fr embedding.
def get_matrices(en_fr, french_vecs, english_vecs):

    # X_l and Y_l are lists of the english and french word embeddings
    X_l = list()
    Y_l = list()
    english_set = english_vecs.keys()
    french_set = french_vecs.keys()
    french_words = set(en_fr.values())
    for en_word, fr_word in en_fr.items():
        if fr_word in french_set and en_word in english_set:
            en_vec = english_vecs[en_word]
            fr_vec = french_vecs[fr_word]
            X_l.append(en_vec)
            Y_l.append(fr_vec)
    X = np.vstack(X_l)
    Y = np.vstack(Y_l)
    return X, Y

X_train, Y_train = get_matrices(en_fr_train, fr_embeddings_subset, en_embeddings_subset)

In [32]:
def compute_loss(X, Y, R):
    m = X.shape[0]    
    # diff is XR - Y
    diff = np.dot(X, R) - Y
    diff_squared = diff**2
    sum_diff_squared = np.sum(diff_squared)
    loss = sum_diff_squared/m
    return loss

In [33]:
def compute_gradient(X, Y, R):
    m = X.shape[0]
    # gradient is X^T(XR - Y) * 2/m
    gradient = np.dot(X.transpose(), np.dot(X, R) - Y) * (2/m)
    return gradient

In [34]:
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):
    np.random.seed(100)
    R = np.random.rand(X.shape[1], X.shape[1])
    for i in range(train_steps):
        if i % 25 == 0:
            print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        gradient = compute_gradient(X, Y, R)
        # update R by subtracting the learning rate times gradient
        R -= learning_rate * gradient
        ### END CODE HERE ###
    return R

In [35]:
# For optimal transform matrices R 
np.random.seed(100)
m = 10
n = 5
X = np.random.rand(m, n)
Y = np.random.rand(m, n) * .1
R = align_embeddings(X, Y)
R_train = align_embeddings(X_train, Y_train, train_steps=400, learning_rate=0.8)
print(R)

loss at iteration 0 is: 5.1274
loss at iteration 25 is: 4.9449
loss at iteration 50 is: 4.7690
loss at iteration 75 is: 4.5996
loss at iteration 0 is: 963.0146
loss at iteration 25 is: 97.8292
loss at iteration 50 is: 26.8329
loss at iteration 75 is: 9.7893
loss at iteration 100 is: 4.3776
loss at iteration 125 is: 2.3281
loss at iteration 150 is: 1.4480
loss at iteration 175 is: 1.0338
loss at iteration 200 is: 0.8251
loss at iteration 225 is: 0.7145
loss at iteration 250 is: 0.6534
loss at iteration 275 is: 0.6185
loss at iteration 300 is: 0.5981
loss at iteration 325 is: 0.5858
loss at iteration 350 is: 0.5782
loss at iteration 375 is: 0.5735
[[ 0.53186378  0.22206248  0.57739413  0.06213423  0.93497411]
 [ 0.47455725  0.83983754  0.07033412  0.06638199 -0.01756795]
 [ 0.75933673 -0.00236315  0.1478196   0.41119182  0.21021122]
 [ 0.14998785  0.83126414  0.31291444  0.78347833  0.4329525 ]
 [ 0.35438161  0.62413541  0.42746515  0.15516712  0.47377831]]


In [36]:
# Define cosine similarity
def nearest_neighbor(v, candidates, k=1):
    similarity_l = []
    for row in candidates:
        # get the cosine similarity
        cos_similarity = cosine_similarity(v, row)
        similarity_l.append(cos_similarity)
    sorted_ids = np.argsort(similarity_l)
    k_idx = sorted_ids[-k:]
    return k_idx

In [38]:
# Accuracy
def test_vocabulary(X, Y, R):
    # The prediction is X times R
    pred = np.dot(X,R)
    # initialize 
    num_correct = 0
    for i in range(len(pred)):
        pred_idx = nearest_neighbor(pred[i], Y)
        if pred_idx == i:
            num_correct += 1
    accuracy = num_correct / len(pred)
    return accuracy

In [40]:
# Test the translation
X_val, Y_val = get_matrices(en_fr_test, fr_embeddings_subset, en_embeddings_subset)
acc = test_vocabulary(X_val, Y_val, R_train)  # this might take a minute or two
print(f"accuracy on test set is {acc:.2f}")

accuracy on test set is 0.56
