# a)

# i) Constrained Task

In [6]:
import nltk
from nltk.corpus import brown
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Download necessary resources from NLTK
nltk.download('brown')
nltk.download('punkt')

# Load Brown Corpus sentences
all_sentences = brown.sents()

# Function to clean text data (without stopword removal)
def clean_text(sentence):
    cleaned_words = []
    for word in sentence:
        word = word.lower()  # Convert each word to lowercase
        word = ''.join([c for c in word if c.isalpha() or c.isspace()])  # Keep only alphabets
        cleaned_words.append(word)
    return cleaned_words

# Select a subset of sentences
max_tokens = 100000
current_token_count = 0
subset_sentences = []
for sentence in all_sentences:
    clean_sentence = clean_text(sentence)
    current_token_count += len(clean_sentence)
    if current_token_count <= max_tokens:
        subset_sentences.append(clean_sentence)
    else:
        break

# Create co-occurrence matrix
window_size = 5
cooccurrence_matrix = Counter()
vocab = set()
for sentence in subset_sentences:
    for i, word in enumerate(sentence):
        vocab.add(word)
        for j in range(1, window_size + 1):
            if i + j < len(sentence):
                cooccurrence_matrix[(word, sentence[i + j])] += 1
                cooccurrence_matrix[(sentence[i + j], word)] += 1

vocab = list(vocab)
vocab_index = {word: idx for idx, word in enumerate(vocab)}

# Convert matrix to numpy array
matrix = np.zeros((len(vocab), len(vocab)))
for (w1, w2), count in cooccurrence_matrix.items():
    if w1 in vocab_index and w2 in vocab_index:
        i, j = vocab_index[w1], vocab_index[w2]
        matrix[i, j] = count

from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import TruncatedSVD

# Reduce dimensionality with Truncated SVD
svd = TruncatedSVD(n_components=100, random_state=42)
reduced_matrix = svd.fit_transform(matrix)

# Word vector lookup function
def get_word_vector(word):
    if word in vocab_index:
        i = vocab_index[word]
        return reduced_matrix[i]
    else:
        return np.zeros(100)  # Handle unknown words (adjust as needed)

# Load SimLex-999 data
simlex_999 = []
with open('/content/drive/MyDrive/IIITH/SimLex-999.txt', 'r') as f:
    next(f)  # Skip the header
    for line in f:
        parts = line.split()
        if len(parts) >= 4:
            word1, word2, _, score = parts[:4]
            try:
                score = float(score)
                simlex_999.append((word1, word2, score))
            except ValueError:
                # Skip the row if the score is not a valid float
                continue

# Calculate similarity scores
predicted_similarities = []
true_similarities = []
for (word1, word2, true_score) in simlex_999:
    vec1 = get_word_vector(word1)
    vec2 = get_word_vector(word2)
    if not np.array_equal(vec1, np.zeros(100)) and not np.array_equal(vec2, np.zeros(100)):
        sim_score = cosine_similarity([vec1], [vec2])[0][0]
        predicted_similarities.append(sim_score)
        true_similarities.append(true_score)

# Evaluate
pearson_corr = pearsonr(predicted_similarities, true_similarities)
spearman_corr = spearmanr(predicted_similarities, true_similarities)

print(f'Pearson Correlation: {pearson_corr}')
print(f'Spearman Correlation: {spearman_corr}')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Pearson Correlation: PearsonRResult(statistic=0.03165291944720565, pvalue=0.44090422561775316)
Spearman Correlation: SignificanceResult(statistic=0.022897369638845, pvalue=0.5772367801505662)


# ii) Unconstrained Task

In [1]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr

simlex_999 = []
with open('/content/drive/MyDrive/IIITH/SimLex-999.txt', 'r') as f:
    next(f)  # Skip the header
    for line in f:
        parts = line.split()
        if len(parts) >= 4:
            word1, word2, _, score = parts[:4]
            try:
                score = float(score)
                simlex_999.append((word1, word2, score))
            except ValueError:
                # Skip the row if the score is not a valid float
                continue

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors='pt')
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Calculate similarity scores
predicted_similarities = []
true_similarities = []
for (word1, word2, true_score) in simlex_999:
    vec1 = get_word_embedding(word1)
    vec2 = get_word_embedding(word2)
    sim_score = cosine_similarity(vec1, vec2)[0][0]
    predicted_similarities.append(sim_score)
    true_similarities.append(true_score)

# Evaluate
pearson_corr = pearsonr(predicted_similarities, true_similarities)
spearman_corr = spearmanr(predicted_similarities, true_similarities)

print(f'Pearson Correlation: {pearson_corr}')
print(f'Spearman Correlation: {spearman_corr}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Pearson Correlation: PearsonRResult(statistic=0.12532655981885857, pvalue=7.130099348162466e-05)
Spearman Correlation: SignificanceResult(statistic=0.15897892683084308, pvalue=4.4018414716098324e-07)


# b)