<a href="https://colab.research.google.com/github/vishesh711/AMS_691_NLP-HW1/blob/main/SUDHANSHUNLP_Assignment1_115768673_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

# Directory that the hard-coded data paths used in later cells live under.
# NOTE(review): `file_path` itself is not referenced by any cell visible here — confirm it is still needed.
file_path = '/content/drive/MyDrive/hw1-data'


Mounted at /content/drive


In [None]:
!pip install numpy scipy
import numpy as np
import scipy
from scipy.stats import spearmanr




In [None]:
from collections import defaultdict

def load_vocab(file_path):
    """Read a one-entry-per-line vocabulary file into a set.

    :param file_path: Path to the vocabulary file.
    :return: Set containing every line with surrounding whitespace stripped.
    """
    with open(file_path, 'r') as handle:
        return {entry.strip() for entry in handle}

def count_word_pairs(corpus_file, vocab_V, vocab_VC, window_size):
    """
    Tally, line by line, how often each context word occurs within
    ``window_size`` tokens on either side of each center word.
    :param corpus_file: Path to the corpus file (one whitespace-tokenized sentence per line).
    :param vocab_V: Collection of words allowed as center words.
    :param vocab_VC: Collection of words allowed as context words.
    :param window_size: Number of tokens examined on each side of the center word (w).
    :return: Nested defaultdict mapping center word -> context word -> count.
             A center word only gets an entry once one of its contexts is counted.
    """
    pair_counts = defaultdict(lambda: defaultdict(int))

    with open(corpus_file, 'r') as corpus:
        for sentence in corpus:
            tokens = sentence.strip().split()
            n_tokens = len(tokens)

            for center_idx, center in enumerate(tokens):
                if center in vocab_V:
                    # Clamp the window to the sentence boundaries.
                    lo = max(center_idx - window_size, 0)
                    hi = min(center_idx + window_size + 1, n_tokens)

                    for ctx_idx in range(lo, hi):
                        # The center position itself is never its own context.
                        if ctx_idx != center_idx and tokens[ctx_idx] in vocab_VC:
                            pair_counts[center][tokens[ctx_idx]] += 1

    return pair_counts

# Load the two vocabularies as sets: V holds the center words, VC the context words.
vocab_V = load_vocab('/content/drive/MyDrive/hw1-data/vocab-15kws.txt')  # Center words
vocab_VC = load_vocab('/content/drive/MyDrive/hw1-data/vocab-5k.txt')    # Context words

# Build co-occurrence counts for window sizes w = 1, 3 and 6.
# Each call re-reads the whole corpus file; the three results are reused by later cells.
word_counts_w1 = count_word_pairs('/content/drive/MyDrive/hw1-data/wiki-1percent.txt', vocab_V, vocab_VC, 1)
word_counts_w3 = count_word_pairs('/content/drive/MyDrive/hw1-data/wiki-1percent.txt', vocab_V, vocab_VC, 3)
word_counts_w6 = count_word_pairs('/content/drive/MyDrive/hw1-data/wiki-1percent.txt', vocab_V, vocab_VC, 6)


In [None]:
def report_word_pair_counts(word_counts, word_pairs):
    """
    Print one "Count for (x, y): n" line per requested pair.
    :param word_counts: Nested mapping center word -> context word -> count.
    :param word_pairs: Iterable of (center, context) pairs to look up.
    :return: None; the counts are written to stdout. Unknown pairs print 0.
    """
    no_contexts = {}
    for word1, word2 in word_pairs:
        contexts = word_counts.get(word1, no_contexts)
        # Membership test instead of indexing so a defaultdict row is not grown.
        count = contexts[word2] if word2 in contexts else 0
        print(f"Count for ({word1}, {word2}): {count}")

# (center word, context word) pairs whose co-occurrence counts are reported below
word_pairs = [
    ('chicken', 'the'),
    ('chicken', 'wings'),
    ('chicago', 'chicago'),
    ('coffee', 'the'),
    ('coffee', 'cup'),
    ('coffee', 'coffee')
]

# Report the counts taken with window size w = 3
print("Counts for w = 3:")
report_word_pair_counts(word_counts_w3, word_pairs)

# Report the same pairs with window size w = 6 (blank line separates the two listings)
print("\nCounts for w = 6:")
report_word_pair_counts(word_counts_w6, word_pairs)


Counts for w = 3:
Count for (chicken, the): 52
Count for (chicken, wings): 6
Count for (chicago, chicago): 38
Count for (coffee, the): 95
Count for (coffee, cup): 10
Count for (coffee, coffee): 4

Counts for w = 6:
Count for (chicken, the): 103
Count for (chicken, wings): 7
Count for (chicago, chicago): 122
Count for (coffee, the): 201
Count for (coffee, cup): 14
Count for (coffee, coffee): 36


In [None]:
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# Function to compute cosine similarity between two word vectors
def cosine_similarity(vec1, vec2):
    """Cosine similarity of two sparse vectors stored as {key: weight} dicts.

    Keys missing from ``vec2`` contribute 0 to the dot product.
    Returns 0.0 when either vector has zero length (including an empty dict).
    """
    def _length(vec):
        # Euclidean norm over the stored weights.
        return np.sqrt(sum(weight ** 2 for weight in vec.values()))

    len1, len2 = _length(vec1), _length(vec2)
    if not (len1 != 0 and len2 != 0):
        return 0.0  # Similarity is defined as zero for a zero-length vector

    overlap = sum(weight * vec2.get(key, 0) for key, weight in vec1.items())
    return overlap / (len1 * len2)

# Function to load word similarity datasets (MEN or SimLex-999)
def load_similarity_dataset(file_path):
    """Load a word-similarity file with whitespace-separated ``word1 word2 score`` rows.

    The first line is always treated as a header and skipped. Blank lines
    (for example a trailing empty line) are ignored instead of raising on unpack.

    :param file_path: Path to the dataset file.
    :return: Tuple ``(word_pairs, human_scores)`` where ``word_pairs`` is a list of
             ``(word1, word2)`` tuples and ``human_scores`` the matching list of floats.
    :raises ValueError: If a non-blank row does not have exactly three fields
                        or its score is not a valid float.
    """
    word_pairs = []
    human_scores = []
    with open(file_path, 'r') as file:
        # Default of None keeps an empty file from raising StopIteration.
        next(file, None)
        for line in file:
            fields = line.split()
            if not fields:
                continue  # Skip blank lines
            word1, word2, score = fields
            word_pairs.append((word1, word2))
            human_scores.append(float(score))
    return word_pairs, human_scores

# Function to get word vector from counts
def get_word_vector(word, word_counts):
    """Look up the sparse vector stored for ``word``; unknown words yield an empty dict."""
    empty_vector = {}
    return word_counts.get(word, empty_vector)

# Function to evaluate word vectors on a dataset
def evaluate_word_vectors(dataset_path, word_counts):
    """Score a set of word vectors against a human-rated similarity dataset.

    :param dataset_path: Path handed to ``load_similarity_dataset``.
    :param word_counts: Mapping word -> sparse vector, queried through ``get_word_vector``.
    :return: Spearman rank correlation between the human scores and the
             cosine similarities computed for the same word pairs.
    """
    pairs, gold_scores = load_similarity_dataset(dataset_path)

    # One model similarity per dataset row, in dataset order.
    predicted_scores = [
        cosine_similarity(
            get_word_vector(first, word_counts),
            get_word_vector(second, word_counts),
        )
        for first, second in pairs
    ]

    # spearmanr yields (correlation, p-value); only the correlation is reported.
    rho, _p_value = spearmanr(gold_scores, predicted_scores)
    return rho

# Paths of the MEN and SimLex-999 similarity files (read later by evaluate_word_vectors)
men_dataset = '/content/drive/MyDrive/hw1-data/men.txt'
simlex_dataset = '/content/drive/MyDrive/hw1-data/simlex-999.txt'

# Evaluate the raw count vectors (for w = 3, using vocab-15kws.txt for V and vocab-5k.txt for VC)
spearman_corr_men = evaluate_word_vectors(men_dataset, word_counts_w3)
spearman_corr_simlex = evaluate_word_vectors(simlex_dataset, word_counts_w3)

# Print the two Spearman correlations
print(f"Spearman correlation for MEN dataset (w = 3): {spearman_corr_men}")
print(f"Spearman correlation for SimLex-999 dataset (w = 3): {spearman_corr_simlex}")


Spearman correlation for MEN dataset (w = 3): 0.2251396048448754
Spearman correlation for SimLex-999 dataset (w = 3): 0.05876135331349779


In [None]:
import numpy as np
from collections import defaultdict
from scipy.stats import spearmanr

# Function to count occurrences of each word in sentences
def count_word_sentence_occurrences(corpus_file):
    """Count, for every token, the number of corpus lines it appears in.

    :param corpus_file: Path to the corpus file (one whitespace-tokenized sentence per line).
    :return: defaultdict(int) mapping token -> number of lines containing it.
             A token repeated within one line is counted once for that line.
    """
    lines_containing = defaultdict(int)
    with open(corpus_file, 'r') as corpus:
        for sentence in corpus:
            # Deduplicate so each line contributes at most 1 per token.
            for token in set(sentence.strip().split()):
                lines_containing[token] += 1
    return lines_containing

# Function to calculate IDF-weighted word vectors
def calculate_idf_weighted_vectors(corpus_file, word_counts, vocab_VC):
    """Re-weight co-occurrence counts by an inverse sentence frequency of the context word.

    Each count for (x, y) is multiplied by
    ``total_sentences / (sentences_containing_y + 1)``; no logarithm is applied.

    :param corpus_file: Path to the corpus file; each line counts as one sentence.
    :param word_counts: Nested mapping center word -> context word -> count.
    :param vocab_VC: Accepted for interface compatibility; not used by this function.
    :return: Nested defaultdict mapping center word -> context word -> weighted count.
    """
    # Count the total number of sentences. The context manager closes the
    # handle, which the previous bare open() inside a generator never did.
    with open(corpus_file, 'r') as corpus:
        total_sentences = sum(1 for _ in corpus)

    # Count the number of sentences each word appears in
    sentence_counts = count_word_sentence_occurrences(corpus_file)

    # Calculate IDF-weighted vectors
    idf_weighted_vectors = defaultdict(lambda: defaultdict(float))
    for word_x, context_dict in word_counts.items():
        for word_y, count in context_dict.items():
            # +1 in the denominator avoids division by zero for unseen context words
            idf = total_sentences / (sentence_counts[word_y] + 1)
            idf_weighted_vectors[word_x][word_y] = count * idf

    return idf_weighted_vectors


# Calculate the IDF-weighted vectors for w = 1, 3 and 6 (all three are reused by later cells)
idf_word_counts_w1 = calculate_idf_weighted_vectors('/content/drive/MyDrive/hw1-data/wiki-1percent.txt', word_counts_w1, vocab_VC)
idf_word_counts_w3 = calculate_idf_weighted_vectors('/content/drive/MyDrive/hw1-data/wiki-1percent.txt', word_counts_w3, vocab_VC)
idf_word_counts_w6 = calculate_idf_weighted_vectors('/content/drive/MyDrive/hw1-data/wiki-1percent.txt', word_counts_w6, vocab_VC)

# Evaluate the w = 3 IDF-weighted word vectors on both similarity datasets
spearman_corr_idf_men_w3 = evaluate_word_vectors(men_dataset, idf_word_counts_w3)
spearman_corr_idf_simlex_w3 = evaluate_word_vectors(simlex_dataset, idf_word_counts_w3)

# Print the results
print(f"Spearman correlation for MEN dataset (w = 3, IDF): {spearman_corr_idf_men_w3}")
print(f"Spearman correlation for SimLex-999 dataset (w = 3, IDF): {spearman_corr_idf_simlex_w3}")


Spearman correlation for MEN dataset (w = 3, IDF): 0.47285107067097226
Spearman correlation for SimLex-999 dataset (w = 3, IDF): 0.16436460954112178


In [None]:
import numpy as np
from collections import defaultdict
from math import log2

# Function to calculate PMI values for word pairs
def calculate_pmi(word_counts, vocab_V, vocab_VC):
    """Turn co-occurrence counts into base-2 pointwise mutual information values.

    PMI(x, y) = log2( p(x, y) / (p(x) * p(y)) ), where every probability is a
    count divided by the grand total of all counts in ``word_counts``.

    :param word_counts: Nested mapping center word -> context word -> count.
    :param vocab_V: Not used by this function.
    :param vocab_VC: Not used by this function.
    :return: Nested defaultdict mapping center word -> context word -> PMI.
             Pairs whose joint or marginal probability is not positive are omitted.
    """
    grand_total = sum(sum(row.values()) for row in word_counts.values())  # Total N

    # Marginals: p(x) from each row total, p(y) accumulated across rows.
    center_prob = defaultdict(float)
    context_prob = defaultdict(float)
    for center, row in word_counts.items():
        center_prob[center] = sum(row.values()) / grand_total
        for context, pair_count in row.items():
            context_prob[context] += pair_count / grand_total

    pmi_table = defaultdict(lambda: defaultdict(float))
    for center, row in word_counts.items():
        for context, pair_count in row.items():
            joint = pair_count / grand_total
            if not (joint > 0 and center_prob[center] > 0 and context_prob[context] > 0):
                continue  # log is undefined for non-positive probabilities
            pmi_table[center][context] = log2(
                joint / (center_prob[center] * context_prob[context])
            )

    return pmi_table

# Compute PMI tables for w = 1, 3 and 6.
# The PMI function defined in this notebook is `calculate_pmi`; the previous
# calls to `compute_pmi` raised NameError on a fresh kernel.
pmi_word_counts_w1 = calculate_pmi(word_counts_w1, vocab_V, vocab_VC)
pmi_word_counts_w3 = calculate_pmi(word_counts_w3, vocab_V, vocab_VC)
pmi_word_counts_w6 = calculate_pmi(word_counts_w6, vocab_V, vocab_VC)


# Get the w = 3 PMI row for "coffee" and rank its context words
coffee_pmi = pmi_word_counts_w3.get("coffee", {})
largest_pmi = sorted(coffee_pmi.items(), key=lambda x: x[1], reverse=True)[:10]
smallest_pmi = sorted(coffee_pmi.items(), key=lambda x: x[1])[:10]

print("Top 10 context words with largest PMIs for 'coffee':")
for word, pmi in largest_pmi:
    print(f"{word}: {pmi}")

print("\nTop 10 context words with smallest PMIs for 'coffee':")
for word, pmi in smallest_pmi:
    print(f"{word}: {pmi}")


Top 10 context words with largest PMIs for 'coffee':
tea: 8.314281269835263
drinking: 7.71405449753307
costa: 7.672608125452632
shop: 7.5688741687808
shops: 7.363997029461911
sugar: 6.704091030083928
coffee: 6.6773826307729784
mix: 6.252416014250404
seattle: 6.04743034466904
houses: 5.982077807291471

Top 10 context words with smallest PMIs for 'coffee':
he: -2.161333783867941
be: -2.060526683360527
this: -1.9005260281642615
had: -1.8633009473814206
not: -1.8321233786639186
its: -1.7043825744297731
after: -1.4689138836692859
more: -1.3633112453095726
when: -1.2681226416703757
page: -1.2479218069054545


In [None]:
def create_pmi_word_vectors(pmi_values, vocab_V, vocab_VC):
    """Expand sparse PMI rows into dense per-word dicts covering every context word.

    :param pmi_values: Nested mapping center word -> context word -> PMI.
                       It is only read: words missing from it are treated as
                       having no PMI values, and no entries are added to it.
    :param vocab_V: Iterable of center words to build vectors for.
    :param vocab_VC: Iterable of context words; each becomes a key of every vector.
    :return: defaultdict(dict) mapping each word in ``vocab_V`` to a dict with one
             entry per word in ``vocab_VC`` (0 where no PMI value is stored).
    """
    pmi_vectors = defaultdict(dict)
    for x in vocab_V:
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty row for every vocabulary word, and a plain dict would raise KeyError.
        known_pmis = pmi_values.get(x, {})
        for y in vocab_VC:
            pmi_vectors[x][y] = known_pmis.get(y, 0)
    return pmi_vectors

# Create dense PMI-based word vectors for w = 1, 3 and 6
# (one entry per (center word, context word) combination, so these are large)
pmi_word_vectors_w1 = create_pmi_word_vectors(pmi_word_counts_w1, vocab_V, vocab_VC)
pmi_word_vectors_w3 = create_pmi_word_vectors(pmi_word_counts_w3, vocab_V, vocab_VC)
pmi_word_vectors_w6 = create_pmi_word_vectors(pmi_word_counts_w6, vocab_V, vocab_VC)

# Evaluate the w = 3 PMI-based word vectors on both similarity datasets
spearman_corr_pmi_men_w3 = evaluate_word_vectors(men_dataset, pmi_word_vectors_w3)
spearman_corr_pmi_simlex_w3 = evaluate_word_vectors(simlex_dataset, pmi_word_vectors_w3)

# Print the results
print(f"Spearman correlation for MEN dataset (w = 3, PMI): {spearman_corr_pmi_men_w3}")
print(f"Spearman correlation for SimLex-999 dataset (w = 3, PMI): {spearman_corr_pmi_simlex_w3}")


Spearman correlation for MEN dataset (w = 3, PMI): 0.46673753829720827
Spearman correlation for SimLex-999 dataset (w = 3, PMI): 0.1860041939515375


In [None]:
import pandas as pd

# Function to evaluate word vectors across different methods and window sizes
def evaluate_all_methods(men_dataset, simlex_dataset, word_counts_w1, word_counts_w3, word_counts_w6,
                         idf_word_counts_w1, idf_word_counts_w3, idf_word_counts_w6,
                         pmi_word_vectors_w1, pmi_word_vectors_w3, pmi_word_vectors_w6):
    """Evaluate every (method, window size) combination on MEN and SimLex-999.

    :param men_dataset: Path to the MEN dataset file.
    :param simlex_dataset: Path to the SimLex-999 dataset file.
    :param word_counts_w1/w3/w6: Raw count vectors for window sizes 1, 3 and 6.
    :param idf_word_counts_w1/w3/w6: IDF-weighted vectors for window sizes 1, 3 and 6.
    :param pmi_word_vectors_w1/w3/w6: PMI vectors for window sizes 1, 3 and 6.
    :return: List of nine dicts (Counts, IDF, PMI x w = 1, 3, 6), each with keys
             'Method', 'Window Size', 'MEN Correlation' and 'SimLex Correlation'.
    """
    window_sizes = (1, 3, 6)
    vectors_by_method = (
        ('Counts', (word_counts_w1, word_counts_w3, word_counts_w6)),
        ('IDF', (idf_word_counts_w1, idf_word_counts_w3, idf_word_counts_w6)),
        ('PMI', (pmi_word_vectors_w1, pmi_word_vectors_w3, pmi_word_vectors_w6)),
    )

    # Rows come out method-major, window-minor; MEN is evaluated before SimLex.
    return [
        {
            'Method': method,
            'Window Size': w,
            'MEN Correlation': evaluate_word_vectors(men_dataset, word_vectors),
            'SimLex Correlation': evaluate_word_vectors(simlex_dataset, word_vectors),
        }
        for method, per_window_vectors in vectors_by_method
        for w, word_vectors in zip(window_sizes, per_window_vectors)
    ]

# Evaluate all combinations of methods and window sizes
# (returns one result dict per method/window pair)
results = evaluate_all_methods(
    men_dataset, simlex_dataset,
    word_counts_w1, word_counts_w3, word_counts_w6,  # Raw counts for w = 1, 3, 6
    idf_word_counts_w1, idf_word_counts_w3, idf_word_counts_w6,  # IDF for w = 1, 3, 6
    pmi_word_vectors_w1, pmi_word_vectors_w3, pmi_word_vectors_w6  # PMI for w = 1, 3, 6
)

# Collect the result dicts into a DataFrame (one row per dict) and print it
df_results = pd.DataFrame(results)
print(df_results)

# Analyze the results for trends
def analyze_trends(df_results):
    """Print the best method per window size and each method's average correlations.

    :param df_results: DataFrame with columns 'Method', 'Window Size',
                       'MEN Correlation' and 'SimLex Correlation'.
    :return: None; three tables are written to stdout.
    """
    # Row labels of the highest correlation within each window size
    best_method_men = df_results.groupby('Window Size')['MEN Correlation'].idxmax()
    best_method_simlex = df_results.groupby('Window Size')['SimLex Correlation'].idxmax()

    # Display best methods for each window size for MEN and SimLex
    print("\nBest methods for MEN dataset by window size:")
    print(df_results.loc[best_method_men])

    print("\nBest methods for SimLex-999 dataset by window size:")
    print(df_results.loc[best_method_simlex])

    # Average only the correlation columns; averaging 'Window Size' as well
    # would add a meaningless column to the performance table.
    score_columns = ['MEN Correlation', 'SimLex Correlation']
    avg_performance = df_results.groupby('Method')[score_columns].mean()
    print("\nAverage performance of each method across all window sizes:")
    print(avg_performance)

# Print the best-method and average-performance summaries for the results table
analyze_trends(df_results)


   Method  Window Size  MEN Correlation  SimLex Correlation
0  Counts            1         0.209092            0.067786
1  Counts            3         0.225140            0.058761
2  Counts            6         0.241067            0.044696
3     IDF            1         0.347589            0.189255
4     IDF            3         0.472851            0.164365
5     IDF            6         0.532401            0.110635
6     PMI            1         0.439174            0.229926
7     PMI            3         0.466738            0.186004
8     PMI            6         0.474624            0.149922

Best methods for MEN dataset by window size:
  Method  Window Size  MEN Correlation  SimLex Correlation
6    PMI            1         0.439174            0.229926
4    IDF            3         0.472851            0.164365
5    IDF            6         0.532401            0.110635

Best methods for SimLex-999 dataset by window size:
  Method  Window Size  MEN Correlation  SimLex Correlation
6    P

In [None]:
import numpy as np

# Function to convert PMI word vectors (dictionaries) to numerical vectors (arrays)
def convert_to_vector(word, word_vectors, vocab_VC):
    """Build a dense float array for ``word`` with one slot per context word.

    Slots follow the iteration order of ``vocab_VC``; context words absent
    from the word's stored vector are filled with 0.
    """
    return np.fromiter(
        (word_vectors[word].get(context_word, 0) for context_word in vocab_VC),
        dtype=float,
        count=len(vocab_VC),
    )

# Updated cosine similarity function using vector representations
def cosine_similarity(vec1, vec2):
    """Cosine similarity of two vectors, given as arrays or as sparse dicts.

    This definition replaces the earlier dict-based ``cosine_similarity`` in the
    notebook namespace, so it keeps accepting ``{key: weight}`` dicts as well;
    otherwise re-running an evaluation cell after this one would fail.

    :param vec1: Array-like of numbers, or dict mapping key -> weight.
    :param vec2: Same kind of object as ``vec1``.
    :return: The cosine of the angle between the vectors, or 0.0 when either
             vector has zero norm (instead of dividing by zero and yielding NaN).
    """
    if isinstance(vec1, dict) and isinstance(vec2, dict):
        # Sparse path: keys missing from vec2 contribute 0 to the dot product.
        dot_product = sum(value * vec2.get(key, 0) for key, value in vec1.items())
        norm1 = np.sqrt(sum(value ** 2 for value in vec1.values()))
        norm2 = np.sqrt(sum(value ** 2 for value in vec2.values()))
    else:
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)

    # All-zero vectors (words with no recorded contexts) would otherwise give
    # NaN plus a RuntimeWarning, and NaN keys make the neighbour sort unreliable.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot_product / (norm1 * norm2)

# Function to get the k nearest neighbors of a word using vector representation
def get_nearest_neighbors(word, word_vectors, vocab_V, vocab_VC, k=10):
    """Rank the other vocabulary words by cosine similarity to ``word``.

    :param word: Query word.
    :param word_vectors: Mapping word -> stored vector (dict of context weights).
    :param vocab_V: Candidate neighbour words; those without a stored vector are skipped.
    :param vocab_VC: Context vocabulary that defines the dense vector slots.
    :param k: Maximum number of neighbours returned.
    :return: Up to ``k`` ``(neighbour, similarity)`` tuples, most similar first.
             If ``word`` has no vector, a message is printed and ``[]`` is returned.
    """
    if word not in word_vectors:
        print(f"Word '{word}' not found in word vectors.")
        return []

    query_vector = convert_to_vector(word, word_vectors, vocab_VC)

    # Score every other candidate that has a stored vector.
    scored = [
        (
            candidate,
            cosine_similarity(query_vector, convert_to_vector(candidate, word_vectors, vocab_VC)),
        )
        for candidate in vocab_V
        if candidate != word and candidate in word_vectors
    ]

    # Highest similarity first, then keep the leading k entries.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:k]

# Function to print the nearest neighbors for a word with two different window sizes
def compare_nearest_neighbors(word, word_vectors_w1, word_vectors_w6, vocab_V, vocab_VC, k=10):
    """Print the ``k`` nearest neighbours of ``word`` for w = 1, then for w = 6.

    Each neighbour is printed as ``name: similarity`` with four decimals;
    nothing is returned.
    """
    def _print_neighbors(vectors):
        # Look the neighbours up only after the section header has been printed.
        for neighbor, similarity in get_nearest_neighbors(word, vectors, vocab_V, vocab_VC, k):
            print(f"{neighbor}: {similarity:.4f}")

    print(f"Nearest neighbors for '{word}' (w = 1):")
    _print_neighbors(word_vectors_w1)

    print(f"\nNearest neighbors for '{word}' (w = 6):")
    _print_neighbors(word_vectors_w6)

# Example: compare the PMI nearest neighbors of "judges" for w = 1 and w = 6
word = "judges"
compare_nearest_neighbors(word, pmi_word_vectors_w1, pmi_word_vectors_w6, vocab_V, vocab_VC)


Nearest neighbors for 'judges' (w = 1):


  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


judge: 0.2221
players: 0.2141
appeals: 0.1902
officials: 0.1832
ministers: 0.1821
justices: 0.1794
members: 0.1793
leaders: 0.1737
contestants: 0.1685
unanimously: 0.1683

Nearest neighbors for 'judges' (w = 6):
judge: 0.3091
jury: 0.2898
appeals: 0.2761
courts: 0.2755
panel: 0.2742
supreme: 0.2711
contestants: 0.2583
candidates: 0.2531
appeal: 0.2506
officials: 0.2499


In [None]:
# Query words grouped by part of speech.
# The keys are printed as section labels ("Analyzing <key> words:") by analyze_nearest_neighbors_by_pos.
query_words = {
    "nouns": ["judge", "court"],
    "verbs": ["decide", "argue"],
    "adjectives": ["big", "important"],
    "prepositions": ["in", "on"]
}

# Analyze nearest neighbors for each POS category
def analyze_nearest_neighbors_by_pos(query_words, word_vectors_w1, word_vectors_w6, vocab_V, vocab_VC, k=10):
    """Print the w = 1 versus w = 6 neighbour comparison for every query word.

    :param query_words: Mapping part-of-speech label -> list of query words.
    :param word_vectors_w1: Word vectors built with window size 1.
    :param word_vectors_w6: Word vectors built with window size 6.
    :param vocab_V: Candidate neighbour vocabulary.
    :param vocab_VC: Context vocabulary.
    :param k: Number of neighbours shown per word and window size.
    """
    for pos in query_words:
        print(f"\nAnalyzing {pos} words:")
        for word in query_words[pos]:
            print(f"\nQuery word: '{word}'")
            compare_nearest_neighbors(
                word, word_vectors_w1, word_vectors_w6, vocab_V, vocab_VC, k
            )

# Example: print PMI nearest neighbors (w = 1 vs w = 6) for the nouns, verbs, adjectives, and prepositions above
analyze_nearest_neighbors_by_pos(query_words, pmi_word_vectors_w1, pmi_word_vectors_w6, vocab_V, vocab_VC)



Analyzing nouns words:

Query word: 'judge'
Nearest neighbors for 'judge' (w = 1):


  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


captain: 0.2701
smith: 0.2405
king: 0.2327
professor: 0.2320
george: 0.2301
clarke: 0.2235
judges: 0.2221
joseph: 0.2196
architect: 0.2136
actor: 0.2116

Nearest neighbors for 'judge' (w = 6):
supreme: 0.3979
attorney: 0.3812
court: 0.3665
governor: 0.3584
mayor: 0.3280
appeals: 0.3266
lawyer: 0.3225
secretary: 0.3216
criminal: 0.3129
chief: 0.3125

Query word: 'court'
Nearest neighbors for 'court' (w = 1):
courts: 0.3719
government: 0.2631
council: 0.2617
law: 0.2419
judge: 0.2256
commission: 0.2074
tribunal: 0.2003
committee: 0.1988
state: 0.1980
appeals: 0.1966

Nearest neighbors for 'court' (w = 6):
supreme: 0.4536
judge: 0.3665
law: 0.3444
appeals: 0.3325
president: 0.3160
governor: 0.3045
council: 0.3042
courts: 0.2932
office: 0.2895
government: 0.2863

Analyzing verbs words:

Query word: 'decide'
Nearest neighbors for 'decide' (w = 1):
choose: 0.3286
understand: 0.3199
determine: 0.3091
observe: 0.2880
tell: 0.2868
remember: 0.2850
specify: 0.2849
explain: 0.2839
ask: 0.2827
kno

In [None]:
# Example: compare the PMI nearest neighbors of the multi-sense word "bank" for w = 1 and w = 6
multisense_word = "bank"
compare_nearest_neighbors(multisense_word, pmi_word_vectors_w1, pmi_word_vectors_w6, vocab_V, vocab_VC)

Nearest neighbors for 'bank' (w = 1):


  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


railway: 0.2061
side: 0.2032
coast: 0.2006
banks: 0.2002
park: 0.1996
corporation: 0.1933
africa: 0.1906
property: 0.1874
railroad: 0.1855
insurance: 0.1816

Nearest neighbors for 'bank' (w = 6):
corporation: 0.3778
capital: 0.3674
railway: 0.3391
banks: 0.3227
branch: 0.3221
valley: 0.3181
southern: 0.3159
trade: 0.3111
largest: 0.3105
centre: 0.3098
