Imports

In [1]:
import torch
import torch.nn as nn

import pickle
import numpy as np
import math
import matplotlib.pyplot as plt
import random
import unicodedata
import re
import torch.cuda

from sklearn.decomposition import PCA
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

from scipy.spatial import distance

In [2]:
%cd drive/MyDrive/IndividualProject/

/content/drive/MyDrive/IndividualProject


# Getting the Translation Files

In [4]:
# Root folder of the embedding archive, relative to the current working directory.
path = "ProjectSoftwareArchive/"


def _load_vocab(vocab_path):
  """Unpickle one vocabulary file and close the file handle afterwards."""
  # NOTE(review): pickle can execute arbitrary code while loading, so only
  # point this at archive files that come from a trusted source.
  with open(vocab_path, 'rb') as vocab_file:
    return pickle.load(vocab_file)


# One entry per decade (1800, 1810, ..., 1990) for every language.
en_vocabs = []
fr_vocabs = []
de_vocabs = []
chi_vocabs = []
en_weights = []
fr_weights = []
de_weights = []
chi_weights = []

for year in range(1800, 2000, 10):
  vocab_filename = str(year) + "-vocab.pkl"
  weights_filename = str(year) + "-w.npy"
  en_vocabs.append(_load_vocab(path + "en-embs/" + vocab_filename))
  fr_vocabs.append(_load_vocab(path + "fr-embs/" + vocab_filename))
  de_vocabs.append(_load_vocab(path + "de-embs/" + vocab_filename))
  en_weights.append(np.load(path + "en-embs/" + weights_filename))
  fr_weights.append(np.load(path + "fr-embs/" + weights_filename))
  de_weights.append(np.load(path + "de-embs/" + weights_filename))
  if year >= 1950:
    chi_vocabs.append(_load_vocab(path + "chi-embs/" + vocab_filename))
    chi_weights.append(np.load(path + "chi-embs/" + weights_filename))
  else:
    # Chinese files are only read from 1950 onwards; a [0] placeholder keeps
    # the list position equal to the decade index for the earlier decades.
    chi_vocabs.append([0])
    chi_weights.append([0])

# vocabs[lang][decade]: lang is 0 for English, 1 for French, 2 for German,
# 3 for Chinese. weights uses exactly the same two indices.
vocabs = [en_vocabs, fr_vocabs, de_vocabs, chi_vocabs]
weights = [en_weights, fr_weights, de_weights, chi_weights]

# Training and obtaining results from the Word-level translator

In [None]:
def index2year(idx):
  """Map a decade index to its calendar year (index 0 is 1800, one step is 10 years)."""
  decade_offset = idx * 10
  return 1800 + decade_offset

def year2index(year):
  """Map a calendar year to its decade index, counted from 1800.

  True division is used, so the result is a float (1990 -> 19.0); callers
  that need a list index convert it with int().
  """
  years_since_start = year - 1800
  return years_since_start / 10

# Language name -> first index into the vocabs / weights lists.
lang2index = dict(English=0, French=1, German=2, Mandarin=3)

# word - str, lang_idx - int (a value from lang2index), year_idx - int (decade index, e.g. int(year2index(year)))
def word2embedding(word, lang_idx, year_idx):
  """Return the embedding row of `word` for one language and decade.

  The row number is the position of `word` in vocabs[lang_idx][year_idx];
  the `.index` lookup fails if the word is not in that vocabulary.
  """
  decade_weights = weights[lang_idx][year_idx]
  decade_vocab = vocabs[lang_idx][year_idx]
  row = decade_vocab.index(word)
  return decade_weights[row]

def translate(word, input_lang, input_year, output_lang, output_year, vars):
  """Print the `vars` output-language words closest to the translation of `word`.

  A LinearRegression is fitted from input-language embeddings to
  output-language embeddings, using the word pairs of the Swadesh list file
  "<input_lang>-<output_lang> swadesh list" that occur in both vocabularies.
  The embedding of `word` is mapped through the model and the nearest
  output embeddings (Euclidean distance) are printed.

  Args:
    word: word to translate; must be in the input vocabulary of `input_year`.
    input_lang: key of lang2index for the source language.
    input_year: year of the source embeddings, converted with year2index.
    output_lang: key of lang2index for the target language.
    output_year: year of the target embeddings, converted with year2index.
    vars: number of candidate translations to print. The name shadows the
      builtin but is kept because callers pass it by keyword.

  Returns:
    None. Everything is reported through print.

  Raises:
    ValueError: if `word` is not in the input vocabulary, or if no Swadesh
      pair is present in both vocabularies (nothing to train on).
  """
  # Autotranslation works just for English. For English -> English the
  # English-French list is read instead (so that the training file exists)
  # and only its English column is used, for both sides of each pair.
  autotranslation = input_lang == 'English' and output_lang == 'English'
  list_name = "swadesh list"
  file_output_lang = 'French' if autotranslation else output_lang
  training_file = input_lang + '-' + file_output_lang + " " + list_name

  # Context manager so the file handle is closed even if reading fails.
  with open(training_file) as swadesh_file:
    lines = swadesh_file.read().strip().split('\n')
  print(lines)

  input_swadesh_words = []
  output_swadesh_words = []
  for line in lines:
    pair = re.split(' +', line)
    if len(pair) < 2:
      # Blank or single-column line: there is no word pair to learn from.
      continue
    input_swadesh_words.append(pair[0].lower())
    output_swadesh_words.append(pair[1].lower())

  if autotranslation:
    output_swadesh_words = input_swadesh_words

  input_lang_idx = lang2index[input_lang]
  input_year_idx = int(year2index(input_year))
  output_lang_idx = lang2index[output_lang]
  output_year_idx = int(year2index(output_year))
  input_vocab = vocabs[input_lang_idx][input_year_idx]
  output_vocab = vocabs[output_lang_idx][output_year_idx]

  # Fail early with a clear message instead of an opaque list.index error
  # after the model has already been fitted.
  if word not in input_vocab:
    raise ValueError("'" + str(word) + "' is not in the " + input_lang +
                     " vocabulary for " + str(input_year))

  for i in input_swadesh_words:
    print(i, end =", ")

  # Train only on the pairs whose two words both exist in the vocabularies
  # of the requested years.
  input_embs = []
  output_embs = []
  for train_word1, train_word2 in zip(input_swadesh_words, output_swadesh_words):
    if train_word1 in input_vocab and train_word2 in output_vocab:
      input_embs.append(word2embedding(train_word1, input_lang_idx,
                                       input_year_idx))
      output_embs.append(word2embedding(train_word2, output_lang_idx,
                                        output_year_idx))

  if not input_embs:
    raise ValueError("No Swadesh pair from '" + training_file +
                     "' is present in both vocabularies; cannot fit the model")

  model = LinearRegression()
  model.fit(input_embs, output_embs)
  x_hat = word2embedding(word, input_lang_idx, input_year_idx)
  y_hat = model.predict(x_hat.reshape(1,-1))
  output_weights_all = weights[output_lang_idx][output_year_idx]

  dist = np.linalg.norm(y_hat - output_weights_all, axis=1) # Euclidean distance
  indices = dist.argsort()[:vars] # The "vars" closest output embeddings, nearest first
  for i in indices:
    print(output_vocab[i])
  print("First argument in the following tuples is the size of the Swadesh list:")
  print(np.array(input_embs).shape)
  print(np.array(output_embs).shape)

# Example run: the word to translate, its language and year, the target
# language and year, and how many candidate translations to print.
translate(word='queen', input_lang='English', input_year=1990,
          output_lang='French', output_year=1990, vars=10)

# Volatility of a Word

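Volatility compares a word's nearest neighbours at two points in time (English only). The distances from the word to its closest neighbours at time t (the code below takes up to 16 of them) are subtracted from the distances to those same words at time t'; the squared differences are added up with a rank-based weight, and the same is then done starting from the neighbours at time t'.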

In [None]:
def _neighbour_shift_terms(anchor_emb, anchor_weights, anchor_vocab,
                           other_emb, other_vocab, lang_idx, other_year_idx,
                           n_neighbors):
  """Return the weighted squared distance changes for one direction.

  The nearest neighbours of the word are taken in the anchor year; for each
  one that also exists in the other year, the change in its distance to the
  word is squared and divided by log2(rank + 2), where rank counts only the
  neighbours that were kept.
  """
  dists_to_all = np.linalg.norm(anchor_emb - anchor_weights, axis=1) # Euclidean distance
  # Position 0 of the ranking is the word itself (distance 0), so it is skipped.
  nearest = dists_to_all.argsort()[1:n_neighbors + 1]
  terms = []
  for i in nearest:
    neighbour = anchor_vocab[i]
    if neighbour not in other_vocab:
      continue
    other_dist = np.linalg.norm(
        other_emb - word2embedding(neighbour, lang_idx, other_year_idx))
    rank = len(terms)
    terms.append(((other_dist - dists_to_all[i])**2)/(math.log(rank + 2, 2)))
  return terms


def volatility(word, from_year, to_year, n_neighbors=16):
  """Measure how much the neighbourhood of an English word changes between two years.

  The measure is symmetric: the neighbours found at `from_year` are compared
  with their distances at `to_year`, then the neighbours found at `to_year`
  are compared with their distances at `from_year`, and all weighted squared
  differences are added up.

  Args:
    word: word to analyse.
    from_year: first year, converted with year2index.
    to_year: second year, converted with year2index.
    n_neighbors: how many nearest neighbours to inspect in each direction
      (default 16, the value that used to be hard-coded).

  Returns:
    The accumulated score, or None (after printing a message) when the word
    is missing from either vocabulary.
  """
  # Word Volatility is defined for English, which has the index 0
  lang_idx = 0
  input_year_idx = int(year2index(from_year))
  output_year_idx = int(year2index(to_year))
  input_vocab = vocabs[lang_idx][input_year_idx]
  output_vocab = vocabs[lang_idx][output_year_idx]
  input_weights = weights[lang_idx][input_year_idx]
  output_weights = weights[lang_idx][output_year_idx]

  if word not in input_vocab or word not in output_vocab:
    print("Word does not exist in the vocabularies of the given years")
    return

  word_input_embeddings = word2embedding(word, lang_idx, input_year_idx)
  word_output_embeddings = word2embedding(word, lang_idx, output_year_idx)

  forward_terms = _neighbour_shift_terms(
      word_input_embeddings, input_weights, input_vocab,
      word_output_embeddings, output_vocab, lang_idx, output_year_idx,
      n_neighbors)
  backward_terms = _neighbour_shift_terms(
      word_output_embeddings, output_weights, output_vocab,
      word_input_embeddings, input_vocab, lang_idx, input_year_idx,
      n_neighbors)

  # Accumulate one term at a time, forward direction first, so the floating
  # point result does not depend on how the terms were grouped.
  total = 0
  for term in forward_terms + backward_terms:
    total += term

  return total

# Example: neighbourhood change of "yesterday" between 1900 and 1990.
volatility(word="yesterday", from_year=1900, to_year=1990)

0.4003451419459667

Calculating the k least volatile words in a time period - optional extension

In [None]:
def calculate_volatilities(from_year, to_year, k):
  """Return the k least volatile English words shared by two years.

  Every word present in both vocabularies is scored with volatility(); the
  results are sorted by ascending score and cut to the first k entries.

  Args:
    from_year: first year, converted with year2index.
    to_year: second year, converted with year2index.
    k: number of words to keep; None keeps every common word.

  Returns:
    (volatilities, words): two parallel lists, least volatile word first.
  """
  lang_idx = 0
  input_year_idx = int(year2index(from_year))
  output_year_idx = int(year2index(to_year))
  input_vocab = vocabs[lang_idx][input_year_idx]
  # A set makes each membership test O(1); testing against the raw list made
  # the intersection quadratic in the vocabulary size.
  output_words = set(vocabs[lang_idx][output_year_idx])
  common_vocab = [word for word in input_vocab if word in output_words]

  # The scores are plain Python/NumPy numbers in a list, so there is nothing
  # to move to the GPU (a list has no .cuda() method).
  scored = [(volatility(word, from_year, to_year), word) for word in common_vocab]
  # Sort on the score only; the sort is stable, so ties keep vocabulary order.
  scored.sort(key=lambda pair: pair[0])
  if k is not None:
    scored = scored[:max(k, 0)]

  volatilities = [score for score, _ in scored]
  least_volatile_words = [word for _, word in scored]
  return volatilities, least_volatile_words

# calculate_volatilities returns a plain tuple of lists; a tuple has no
# .cuda() method, so the result is unpacked directly.
volatilities, common_vocab = calculate_volatilities(1850, 1990, 68)
print(volatilities)