<a href="https://colab.research.google.com/github/suryasuresh91/Lexical-Complexity-Prediction/blob/main/LCP_Glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy-syllables
!python -m spacy download en_core_web_sm
!pip3 install wordfreq


Collecting spacy-syllables
  Downloading spacy_syllables-3.0.1-py3-none-any.whl (6.6 kB)
Collecting spacy<4.0.0,>=3.0.3
  Downloading spacy-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 34.0 MB/s 
[?25hCollecting pyphen<0.11.0,>=0.10.0
  Downloading Pyphen-0.10.0-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 37.9 MB/s 
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
[K     |████████████████████████████████| 451 kB 55.5 MB/s 
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
[K     |████████████████████████████████| 628 kB 71.1 MB/s 
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from wordfreq import word_frequency
from scipy import stats
import csv
import spacy
from spacy_syllables import SpacySyllables
import random
import os

In [3]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2021-11-24 20:00:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-11-24 20:00:51--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-11-24 20:00:51--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-1

In [4]:
!unzip glove*.zip
!ls
!pwd

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip
glove.6B.200d.txt  glove.6B.50d.txt   sample_data
/content


In [5]:
# https://www.kaggle.com/bminixhofer/deterministic-neural-networks-using-pytorch
# Seed all rngs for deterministic results
def seed_all(seed = 0):
  random.seed(0)
  os.environ['PYTHONHASHSEED'] = str(seed)
  torch.manual_seed(seed)
  np.random.seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True

In [6]:
seed_all(0)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [8]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("syllables", after='tagger') # Add the syllable tagger pipe


<spacy_syllables.SpacySyllables at 0x7f1e7e3283d0>

In [9]:
SINGLE_TRAIN_DATAPATH = "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/train/lcp_single_train.tsv"
SINGLE_TEST_DATAPATH = "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test-labels/lcp_single_test.tsv"


In [10]:
def get_data_frames():
  df_train_single = pd.read_csv(SINGLE_TRAIN_DATAPATH, sep='\t', quotechar="'", quoting=csv.QUOTE_NONE)
  df_test_single = pd.read_csv(SINGLE_TEST_DATAPATH, sep='\t', quotechar="'", quoting=csv.QUOTE_NONE)

  
  return df_train_single, df_test_single

In [11]:
df_train_single, df_test_single = get_data_frames()

In [12]:
single_tokens_train_raw = df_train_single["token"].astype(str).to_list()
single_tokens_test_raw = df_test_single["token"].astype(str).to_list()

y_single_train = df_train_single["complexity"].astype(np.float32).to_numpy()
y_single_test = df_test_single["complexity"].astype(np.float32).to_numpy()

sent_train_single_raw = df_train_single["sentence"].to_list()
sent_test_single_raw = df_test_single["sentence"].to_list()



In [14]:
EMBEDDING_DIM = 50

def get_embeddings():
  embedding_index = {}
  with open('glove.6B.{}d.txt'.format(EMBEDDING_DIM), 'r', encoding='utf-8') as f:
    for line in f:
      values = line.split()
      token = values[0]
      embedding_index[token] = np.asarray(values[1:], dtype='float32')
  return embedding_index

In [15]:
embedding_index = get_embeddings()
print('Token count in embeddings: {}'.format(len(embedding_index)))

Token count in embeddings: 400000


In [16]:
HIDDEN_DIM = 10

In [17]:
def prepare_sequence(seq, to_ix):
  seq = seq.split()
  idxs = [to_ix[w.lower()] if w.lower() in to_ix else len(to_ix) for w in seq]
  idxs = torch.tensor(idxs)
  idxs = nn.functional.one_hot(idxs, num_classes=len(to_ix))
  idxs = torch.tensor(idxs, dtype=torch.float32)
  return idxs


def map_token_to_idx():
  word_to_ix = {}
  for sent in sent_train_single_raw:
    sent = sent.split()
    for word in sent:
      word = word.lower()
      if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)

  
  
  return word_to_ix

In [18]:
word_to_ix = map_token_to_idx()
print('SWE vocab size: {}'.format(len(word_to_ix), ))

SWE vocab size: 24350


In [19]:
class biLSTM(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size):
    super(biLSTM, self).__init__()
    self.hidden_dim = hidden_dim
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
    self.hidden2tag = nn.Linear(2 * hidden_dim, output_size)

  def prepare_embedding(self, sentence):
    embeddings = []
    for word in sentence:
      word = word.lower()
      if word in embedding_index:
        embeddings.extend(embedding_index[word])
      else:
        embeddings.extend(np.random.random(EMBEDDING_DIM).tolist())
    embeddings = torch.tensor(embeddings, dtype=torch.float32, device=device)
    return embeddings

  def forward(self, sentence):
    sentence = sentence.split()
    embeds = self.prepare_embedding(sentence)
    lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
    tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
    tag_scores = F.softmax(tag_space, dim=1)
    return tag_scores

In [20]:
model = biLSTM(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(word_to_ix))


In [21]:
USE_PRETRAINED_SINGLE_WORD_TARGET_MODEL = False

In [22]:
if USE_PRETRAINED_SINGLE_WORD_TARGET_MODEL:
  print('Using pre-trained biLSTM on single target expressions')
  model = torch.load(path_biLSTM_single)
  model.eval()
else:
  print('Training biLSTM on single target expressions')
  # Train the model for 10 epochs
  model = biLSTM(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(word_to_ix))
  loss_function = nn.MSELoss()
  optimizer = optim.Adam(model.parameters(), lr=0.01)
  for epoch in range(1):
    loss_sum = 0
    for sentence in sent_train_single_raw:
      model.zero_grad()
      targets = prepare_sequence(sentence, word_to_ix)
      tag_scores = model(sentence)
      loss = loss_function(tag_scores, targets)
      loss_sum += loss
      loss.backward()
      optimizer.step()
    print('Epoch: {} Loss: {}'.format(epoch, loss_sum.item()))

Training biLSTM on single target expressions


  


Epoch: 0 Loss: 0.2653784155845642


In [23]:
def prepare_features_single_word(tokens, sentences):
  features = []
  for idx, word in enumerate(tokens):
    word = word.lower()
    feature = []

    # Word length
    feature.append(len(word))
    doc = nlp(word)

    # Syllable count and word frequency in the corpus
    # Spacy tokenizes the input sentence
    # In this case we would have only one token, the target word
    for token in doc:
      feature.append(token._.syllables_count)
      feature.append(word_frequency(word, 'en'))

    # Probability of target word `word` in the sentence estimated from by `model`
    if word in word_to_ix:
      # Output scores for each of the word in the sentence
      out = model(sentences[idx])
      pos = -1
      for itr, token in enumerate(sentences[idx].split()):
        if token.lower() == word:
          pos = itr
          break
      id_pos = word_to_ix[word] # word to id mapping
      feature.append(float(out[pos][id_pos]))
    else:
      # `word` not in vocabulary, so cannot predict probability in context
      feature.append(0.0)

    # GloVE embedding for the `word`
    if word in embedding_index:
      feature.extend(embedding_index[word].tolist())
    else:
      # `word` not in the GloVE corpus, take a random embedding
      feature.extend(np.random.random(EMBEDDING_DIM).tolist())
    features.append(feature)

    if (idx + 1) % 500 == 0:
      print('Prepared features for {} single target word sentences'.format(idx + 1))
  return features

In [24]:
print('+++ Generating Train features for Single word expressions +++')
features_train_single = prepare_features_single_word(single_tokens_train_raw, sent_train_single_raw)
print('+++ Generating Test features for Single word expressions +++')
features_test_single = prepare_features_single_word(single_tokens_test_raw, sent_test_single_raw)

+++ Generating Train features for Single word expressions +++
Prepared features for 500 single target word sentences
Prepared features for 1000 single target word sentences
Prepared features for 1500 single target word sentences
Prepared features for 2000 single target word sentences
Prepared features for 2500 single target word sentences
Prepared features for 3000 single target word sentences
Prepared features for 3500 single target word sentences
Prepared features for 4000 single target word sentences
Prepared features for 4500 single target word sentences
Prepared features for 5000 single target word sentences
Prepared features for 5500 single target word sentences
Prepared features for 6000 single target word sentences
Prepared features for 6500 single target word sentences
Prepared features for 7000 single target word sentences
Prepared features for 7500 single target word sentences
+++ Generating Test features for Single word expressions +++
Prepared features for 500 single targe

In [25]:
# Convert all features to torch.tensor to enable use in PyTorch models
X_train_single_tensor = torch.tensor(features_train_single, dtype=torch.float32, device=device)
X_test_single_tensor = torch.tensor(features_test_single, dtype=torch.float32, device=device)


In [26]:
# Reshape all output complexity scores to single dimension vectors
y_single_train = y_single_train.reshape(y_single_train.shape[0], -1)
y_single_test = y_single_test.reshape(y_single_test.shape[0], -1)


In [27]:
# Convert all target outputs to torch.tensor to enable use in PyTorch models
Y_train_single_tensor = torch.tensor(y_single_train, dtype=torch.float32, device=device)
Y_test_single_tensor = torch.tensor(y_single_test, dtype=torch.float32, device=device)


In [28]:
# Ensure each sample from test and train for single word expression is taken
print(X_train_single_tensor.shape)
print(X_test_single_tensor.shape)
print(Y_train_single_tensor.shape)
print(Y_test_single_tensor.shape)

torch.Size([7662, 54])
torch.Size([917, 54])
torch.Size([7662, 1])
torch.Size([917, 1])


In [29]:
def convert_tensor_to_np(y):
  if device == torch.device("cuda"):
    y = y.cpu()
  y = y.detach().numpy()
  return y

In [30]:
from copy import deepcopy

In [31]:
# Evaluate the metrics upon which the model would be evaluated
def evaluate_metrics(labels, predicted):
  vx, vy = [], []
  if torch.is_tensor(labels):
    vx = labels.clone()
    vx = convert_tensor_to_np(vx)
  else:
    vx = deepcopy(labels)
  if torch.is_tensor(predicted):
    vy = predicted.clone()
    vy = convert_tensor_to_np(vy)
  else:
    vy = deepcopy(predicted)

  pearsonR = np.corrcoef(vx.T, vy.T)[0, 1]
  spearmanRho = stats.spearmanr(vx, vy)
  MSE = np.mean((vx - vy) ** 2)
  MAE = np.mean(np.absolute(vx - vy))
  RSquared = pearsonR ** 2

  print("Peason's R: {}".format(pearsonR))
  print("Spearman's rho: {}".format(spearmanRho))
  print("R Squared: {}".format(RSquared))
  print("MSE: {}".format(MSE))
  print("MAE: {}".format(MAE))

In [32]:
X_train_single_np = np.array(features_train_single)
X_test_single_np = np.array(features_test_single)
Y_train_single_np = np.array(y_single_train.reshape(y_single_train.shape[0], -1))
Y_test_single_np = np.array(y_single_test.reshape(y_single_test.shape[0], -1))

In [33]:
print(X_train_single_np.shape)
print(X_test_single_np.shape)
print(Y_train_single_np.shape)
print(Y_test_single_np.shape)

(7662, 54)
(917, 54)
(7662, 1)
(917, 1)


In [34]:
#LR MODEL

In [35]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [36]:
from sklearn.linear_model import LinearRegression

In [37]:
def evaluateLinearRegression(X_train, Y_train, X_test, Y_test):
  reg = make_pipeline(StandardScaler(), LinearRegression())
  reg.fit(X_train, Y_train)
  out = reg.predict(X_test)
  out = out.reshape((out.shape[0], 1))
  evaluate_metrics(out, Y_test)
  return out

In [38]:
print('Linear Regression for Single word expressions')
out_LR = evaluateLinearRegression(X_train_single_np, Y_train_single_np, X_test_single_np, Y_test_single_np)

Linear Regression for Single word expressions
Peason's R: 0.7131966978387039
Spearman's rho: SpearmanrResult(correlation=0.6922427095275896, pvalue=9.064053173897807e-132)
R Squared: 0.5086495298080315
MSE: 0.00795479190989954
MAE: 0.06836953159067313


In [39]:
#SVR MODEL

In [40]:
from sklearn.svm import SVR

In [41]:
def evaluateSVR(X_train, Y_train, X_test, Y_test):
  svr = make_pipeline(StandardScaler(), SVR(C=0.05, epsilon=0.01))
  svr.fit(X_train, Y_train.reshape(-1))
  out = svr.predict(X_test)
  out = out.reshape((out.shape[0], 1))
  evaluate_metrics(out, Y_test)
  return out

In [42]:
print('SVR for Single word expressions')
out_svr = evaluateSVR(X_train_single_np, Y_train_single_np, X_test_single_np, Y_test_single_np)


SVR for Single word expressions
Peason's R: 0.7303058792979216
Spearman's rho: SpearmanrResult(correlation=0.7055868915357708, pvalue=5.038054858837159e-139)
R Squared: 0.5333466773371104
MSE: 0.007586501609013984
MAE: 0.06758995521244428


In [43]:
single_ids = df_test_single["id"].astype(str).to_list()


In [44]:
out_ensemble = []

for idx in range(len(out_LR)):
  score = 0
  score += float(out_LR[idx])
  score += float(out_svr[idx])
  if idx == 0:
    print(float(out_LR[idx]), float(out_svr[idx]), score / 2)
  score /= 2
  out_ensemble.append(score)
out_ensemble = np.array(out_ensemble)
out_ensemble = out_ensemble.reshape((out_ensemble.shape[0], 1))

0.11769032925997991 0.11241302335320352 0.11505167630659172


In [45]:
# Score from the Overall Model for Single Word Expressions
evaluate_metrics(out_ensemble, Y_test_single_np)

Peason's R: 0.7325897290253571
Spearman's rho: SpearmanrResult(correlation=0.7082096676011037, pvalue=1.6904547334692234e-140)
R Squared: 0.5366877110734461
MSE: 0.007527301605110034
MAE: 0.06697485526324143
