<a href="https://colab.research.google.com/github/vishesh711/AMS_691_NLP/blob/main/AMS691_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Distributional Counting

1.1 Implement Distributional Counting

In [3]:
from collections import defaultdict

def load_vocab(file_path):
    """Read a vocabulary file into a set of words.

    Every line of the file contributes one entry: the line with its
    surrounding whitespace removed. Duplicate lines collapse into one entry.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A set of the stripped lines.
    """
    with open(file_path, 'r') as handle:
        return {entry.strip() for entry in handle}

def count_word_pairs(corpus_file, vocab_V, vocab_VC, window_size):
    """Count windowed co-occurrences between target words and context words.

    Each line of the corpus is split on whitespace. For every token that is
    in ``vocab_V``, every other token at most ``window_size`` positions to the
    left or right on the same line is counted if it is in ``vocab_VC``.

    Args:
        corpus_file: Path of the corpus, one sentence per line.
        vocab_V: Collection of target words to build vectors for.
        vocab_VC: Collection of context words to count.
        window_size: Number of positions on each side of the target.

    Returns:
        A nested defaultdict such that ``result[target][context]`` is the
        co-occurrence count. A target only gets an entry once at least one
        context word has been counted for it.
    """
    pair_counts = defaultdict(lambda: defaultdict(int))

    with open(corpus_file, 'r') as corpus:
        for sentence in corpus:
            tokens = sentence.strip().split()
            n_tokens = len(tokens)
            for center, target in enumerate(tokens):
                if target in vocab_V:
                    # Clip the window to the sentence boundaries
                    lo = max(center - window_size, 0)
                    hi = min(center + window_size + 1, n_tokens)
                    for pos in range(lo, hi):
                        context = tokens[pos]
                        # The target position itself is never its own context
                        if pos != center and context in vocab_VC:
                            pair_counts[target][context] += 1

    return pair_counts

# Load the target vocabulary (V) and the context vocabulary (VC)
vocab_V = load_vocab('/content/vocab-15kws.txt')
vocab_VC = load_vocab('/content/vocab-5k.txt')

# Perform word pair counting for w = 3 and w = 6
word_counts_w3 = count_word_pairs('/content/wiki-1percent.txt', vocab_V, vocab_VC, 3)
word_counts_w6 = count_word_pairs('/content/wiki-1percent.txt', vocab_V, vocab_VC, 6)

# Example: Output the counts for a specific word pair.
# The count tables are nested defaultdicts, so reading with [...] would insert
# an empty entry whenever the key is missing and pollute the tables that later
# cells iterate over. .get() reads without mutating.
chicken_the_w3 = word_counts_w3.get('chicken', {}).get('the', 0)
chicken_the_w6 = word_counts_w6.get('chicken', {}).get('the', 0)
print(f"w=3: (chicken, the) = {chicken_the_w3}")
print(f"w=6: (chicken, the) = {chicken_the_w6}")


w=3: (chicken, the) = 52
w=6: (chicken, the) = 103


1.2

In [5]:
# Word pairs whose co-occurrence counts are reported for both window sizes
word_pairs = [
    ('chicken', 'the'), ('chicken', 'wings'), ('chicago', 'chicago'),
    ('coffee', 'the'), ('coffee', 'cup'), ('coffee', 'coffee'),
]

# One report per window size; pairs that were never observed are shown as 0
for header, counts in (
    ("Counts for w = 3", word_counts_w3),
    ("\nCounts for w = 6", word_counts_w6),
):
    print(header)
    for word1, word2 in word_pairs:
        pair_count = counts.get(word1, {}).get(word2, 0)
        print(f"({word1}, {word2}) = {pair_count}")


Counts for w = 3
(chicken, the) = 52
(chicken, wings) = 6
(chicago, chicago) = 38
(coffee, the) = 95
(coffee, cup) = 10
(coffee, coffee) = 4

Counts for w = 6
(chicken, the) = 103
(chicken, wings) = 7
(chicago, chicago) = 122
(coffee, the) = 201
(coffee, cup) = 14
(coffee, coffee) = 36


1.3

In [6]:
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# Function to compute cosine similarity between two word vectors
def cosine_similarity(vec1, vec2):
    """Cosine similarity of two sparse vectors stored as dicts.

    Keys are dimensions and values are weights; a key that is absent from
    ``vec2`` contributes 0 to the dot product.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either
        vector has zero norm (which includes empty vectors).
    """
    def _norm(vec):
        return np.sqrt(sum(weight ** 2 for weight in vec.values()))

    norm1 = _norm(vec1)
    norm2 = _norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0

    # Only dimensions present in vec1 can contribute to the dot product
    dot_product = 0
    for key, weight in vec1.items():
        dot_product += weight * vec2.get(key, 0)
    return dot_product / (norm1 * norm2)

# Function to load word similarity datasets (MEN or SimLex-999)
def load_similarity_dataset(file_path):
    """Load a word-similarity dataset (e.g. MEN or SimLex-999).

    The first line is treated as a header and skipped. Every following
    non-blank line must contain exactly three whitespace-separated fields:
    ``word1 word2 score``. Blank lines (such as a trailing empty line at the
    end of the file) are ignored.

    Args:
        file_path: Path of the dataset file.

    Returns:
        A tuple ``(word_pairs, human_scores)`` where ``word_pairs`` is a list
        of ``(word1, word2)`` tuples and ``human_scores`` is the parallel list
        of scores as floats.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            its score cannot be parsed as a float.
    """
    word_pairs = []
    human_scores = []
    with open(file_path, 'r') as file:
        # Skip the header; the default keeps an empty file from raising StopIteration
        next(file, None)
        for line in file:
            fields = line.split()
            if not fields:
                continue  # Blank line: nothing to parse
            word1, word2, score = fields
            word_pairs.append((word1, word2))
            human_scores.append(float(score))
    return word_pairs, human_scores

# Function to get word vector from counts
def get_word_vector(word, word_counts):
    """Return the vector stored for ``word``, or an empty dict if there is none.

    The lookup never adds ``word`` to ``word_counts``, even when
    ``word_counts`` is a defaultdict.
    """
    if word in word_counts:
        return word_counts[word]
    return {}

# Function to evaluate word vectors on a dataset
def evaluate_word_vectors(dataset_path, word_counts):
    """Score a set of word vectors against a human-rated similarity dataset.

    For every word pair in the dataset the model similarity is the cosine
    similarity of the two words' vectors; a word without a vector is treated
    as an empty vector, which yields a similarity of 0.0.

    Args:
        dataset_path: Path of the similarity dataset file.
        word_counts: Mapping from word to its sparse vector (a dict).

    Returns:
        Spearman's rank correlation between the human scores and the model
        similarities.
    """
    pairs, gold_scores = load_similarity_dataset(dataset_path)
    predicted_scores = [
        cosine_similarity(
            get_word_vector(first, word_counts),
            get_word_vector(second, word_counts),
        )
        for first, second in pairs
    ]

    # spearmanr returns (correlation, p-value); only the correlation is reported
    correlation, _p_value = spearmanr(gold_scores, predicted_scores)
    return correlation

# Locations of the two human-similarity benchmarks
men_dataset = '/content/men.txt'
simlex_dataset = '/content/simlex-999.txt'

# Score the raw-count vectors for w = 3 on MEN first, then on SimLex-999
spearman_corr_men, spearman_corr_simlex = (
    evaluate_word_vectors(benchmark, word_counts_w3)
    for benchmark in (men_dataset, simlex_dataset)
)

# Report both correlations
print(f"Spearman correlation for MEN dataset (w = 3): {spearman_corr_men}")
print(f"Spearman correlation for SimLex-999 dataset (w = 3): {spearman_corr_simlex}")

Spearman correlation for MEN dataset (w = 3): 0.2251396048448754
Spearman correlation for SimLex-999 dataset (w = 3): 0.05876135331349779


### Part 2: Combining Counts with Inverse Document Frequency (IDF)

2.1

In [7]:
from collections import defaultdict
import numpy as np

# Function to count occurrences of each word in sentences
def count_word_sentence_occurrences(corpus_file):
    """Count, for every word, how many corpus lines contain it at least once.

    Each line is split on whitespace and de-duplicated, so a word that is
    repeated within one line still adds only 1 for that line.

    Args:
        corpus_file: Path of the corpus, one sentence per line.

    Returns:
        A ``defaultdict(int)`` mapping word -> number of lines containing it.
    """
    doc_freq = defaultdict(int)
    with open(corpus_file, 'r') as corpus:
        for sentence in corpus:
            for token in set(sentence.split()):
                doc_freq[token] += 1
    return doc_freq

# Function to calculate IDF-weighted word vectors
def calculate_idf_weighted_vectors(corpus_file, word_counts, vocab_VC):
    """Re-weight co-occurrence counts by an inverse document frequency factor.

    Every count ``word_counts[x][y]`` is multiplied by
    ``total_sentences / (sentences_containing_y + 1)``, where a "sentence" is
    one line of ``corpus_file``. Note that this is a plain ratio (no
    logarithm) and that the ``+ 1`` in the denominator is always applied.

    Args:
        corpus_file: Path of the corpus, one sentence per line.
        word_counts: Mapping target word -> {context word -> count}.
        vocab_VC: Not used by the computation; kept so existing callers that
            pass it keep working.

    Returns:
        A nested defaultdict with the same keys as ``word_counts`` whose
        values are the IDF-weighted counts (floats).
    """
    # Count the total number of sentences; the context manager makes sure the
    # file handle is closed instead of being left to the garbage collector.
    with open(corpus_file, 'r') as file:
        total_sentences = sum(1 for _ in file)

    # Number of sentences each word appears in
    sentence_counts = count_word_sentence_occurrences(corpus_file)

    idf_weighted_vectors = defaultdict(lambda: defaultdict(float))
    for word_x, context_dict in word_counts.items():
        for word_y, count in context_dict.items():
            # +1 keeps the denominator non-zero for words absent from the corpus
            idf = total_sentences / (sentence_counts[word_y] + 1)
            idf_weighted_vectors[word_x][word_y] = count * idf

    return idf_weighted_vectors

# Build the IDF-weighted vectors for both window sizes (w = 3 and w = 6)
idf_word_counts_w3, idf_word_counts_w6 = (
    calculate_idf_weighted_vectors('/content/wiki-1percent.txt', raw_counts, vocab_VC)
    for raw_counts in (word_counts_w3, word_counts_w6)
)

# Score the w = 3 IDF-weighted vectors with EVALWS (Spearman correlation):
# MEN first, then SimLex-999
spearman_corr_idf_men_w3, spearman_corr_idf_simlex_w3 = (
    evaluate_word_vectors(benchmark, idf_word_counts_w3)
    for benchmark in (men_dataset, simlex_dataset)
)

# Report both correlations
print(f"Spearman correlation for MEN dataset (w = 3, IDF): {spearman_corr_idf_men_w3}")
print(f"Spearman correlation for SimLex-999 dataset (w = 3, IDF): {spearman_corr_idf_simlex_w3}")


Spearman correlation for MEN dataset (w = 3, IDF): 0.47285107067097226
Spearman correlation for SimLex-999 dataset (w = 3, IDF): 0.16436460954112178


### Part 3: Pointwise Mutual Information (PMI)

3.1

In [8]:
import numpy as np
from collections import defaultdict
from math import log2

# Function to calculate PMI values for word pairs
def calculate_pmi(word_counts, vocab_V, vocab_VC):
    """Convert co-occurrence counts into pointwise mutual information vectors.

    With N the sum of all counts, the probabilities are estimated as
    p(x, y) = #(x, y) / N, p(x) = sum_y #(x, y) / N and
    p(y) = sum_x #(x, y) / N, and
    PMI(x, y) = log2(p(x, y) / (p(x) * p(y))).
    Pairs whose count is zero get no entry in the result.

    Args:
        word_counts: Mapping target word -> {context word -> count}.
        vocab_V: Not used by the computation; kept for interface compatibility.
        vocab_VC: Not used by the computation; kept for interface compatibility.

    Returns:
        A nested defaultdict such that ``result[x][y]`` is the PMI of the
        pair. The result is empty when the counts sum to zero.
    """
    total_count = sum(sum(context_dict.values()) for context_dict in word_counts.values())  # Total N
    pmi_vectors = defaultdict(lambda: defaultdict(float))

    # Nothing was observed (empty table, or only zero-valued entries such as
    # those a defaultdict creates on a read): no PMI is defined, and dividing
    # by total_count below would raise ZeroDivisionError.
    if total_count == 0:
        return pmi_vectors

    # Compute marginal probabilities
    p_x = defaultdict(float)
    p_y = defaultdict(float)

    for word_x, context_dict in word_counts.items():
        p_x[word_x] = sum(context_dict.values()) / total_count
        for word_y, count in context_dict.items():
            p_y[word_y] += count / total_count

    # Compute PMI for each word pair
    for word_x, context_dict in word_counts.items():
        for word_y, count in context_dict.items():
            joint_prob = count / total_count
            # log2 is only defined for strictly positive arguments
            if joint_prob > 0 and p_x[word_x] > 0 and p_y[word_y] > 0:
                pmi_vectors[word_x][word_y] = log2(joint_prob / (p_x[word_x] * p_y[word_y]))

    return pmi_vectors

# Build PMI-based word vectors from the raw counts, one table per window size
# (w = 3 first, then w = 6)
pmi_word_vectors_w3, pmi_word_vectors_w6 = (
    calculate_pmi(raw_counts, vocab_V, vocab_VC)
    for raw_counts in (word_counts_w3, word_counts_w6)
)

# Function to print the top 10 context words with the largest and smallest PMI values for "coffee"
def print_top_pmi_words(word, pmi_vectors, top_n=10):
    """Print the context words with the highest and the lowest PMI for ``word``.

    Both lists are printed in descending PMI order, so the "lowest" list ends
    with the smallest value. If ``word`` has fewer than ``top_n`` context
    words, both lists show all of them.

    Args:
        word: Target word to inspect.
        pmi_vectors: Mapping target word -> {context word -> PMI}.
        top_n: Number of context words to show per list; values below 1 show
            no entries.

    Returns:
        None. A message is printed instead if ``word`` has no PMI vector.
    """
    if word not in pmi_vectors:
        print(f"Word '{word}' not found in PMI vectors.")
        return

    sorted_pmi = sorted(pmi_vectors[word].items(), key=lambda x: x[1], reverse=True)
    shown = max(top_n, 0)

    print(f"\nTop {top_n} words with highest PMI for '{word}':")
    for context_word, pmi_value in sorted_pmi[:shown]:
        print(f"{context_word}: {pmi_value:.4f}")

    # A plain sorted_pmi[-top_n:] returns the WHOLE list when top_n == 0
    # (because -0 == 0), so the start of the tail is computed explicitly.
    tail_start = max(len(sorted_pmi) - shown, 0)
    print(f"\nTop {top_n} words with lowest PMI for '{word}':")
    for context_word, pmi_value in sorted_pmi[tail_start:]:
        print(f"{context_word}: {pmi_value:.4f}")

# Show the 10 highest- and 10 lowest-PMI context words for "coffee" (w = 3)
print_top_pmi_words(word='coffee', pmi_vectors=pmi_word_vectors_w3)



Top 10 words with highest PMI for 'coffee':
tea: 8.1660
drinking: 7.5880
shop: 7.4117
costa: 7.3503
shops: 7.2608
sugar: 6.5339
coffee: 6.5020
mix: 6.1312
seattle: 5.9508
houses: 5.8682

Top 10 words with lowest PMI for 'coffee':
page: -1.2806
when: -1.4043
more: -1.4785
after: -1.5985
its: -1.8395
not: -1.9116
this: -1.9795
had: -1.9875
be: -2.1510
he: -2.2603


3.2

In [9]:
# Score the w = 3 PMI vectors with EVALWS (Spearman correlation):
# MEN first, then SimLex-999
spearman_corr_pmi_men_w3, spearman_corr_pmi_simlex_w3 = (
    evaluate_word_vectors(benchmark, pmi_word_vectors_w3)
    for benchmark in (men_dataset, simlex_dataset)
)

# Report both correlations
print(f"Spearman correlation for MEN dataset (w = 3, PMI): {spearman_corr_pmi_men_w3}")
print(f"Spearman correlation for SimLex-999 dataset (w = 3, PMI): {spearman_corr_pmi_simlex_w3}")

Spearman correlation for MEN dataset (w = 3, PMI): 0.46563240836038006
Spearman correlation for SimLex-999 dataset (w = 3, PMI): 0.18643183126956037


### Part 4: Quantitative Comparisons

4.1 and 4.2

In [12]:
# Function to evaluate word vectors across different methods and window sizes
def evaluate_all_methods(men_dataset, simlex_dataset, word_counts_w1, word_counts_w3, word_counts_w6,
                         idf_word_counts_w1, idf_word_counts_w3, idf_word_counts_w6,
                         pmi_word_vectors_w1, pmi_word_vectors_w3, pmi_word_vectors_w6):
    """Evaluate every weighting method at every window size on both datasets.

    Args:
        men_dataset: Path of the MEN dataset file.
        simlex_dataset: Path of the SimLex-999 dataset file.
        word_counts_w1, word_counts_w3, word_counts_w6: Raw-count vectors for
            window sizes 1, 3 and 6.
        idf_word_counts_w1, idf_word_counts_w3, idf_word_counts_w6:
            IDF-weighted vectors for window sizes 1, 3 and 6.
        pmi_word_vectors_w1, pmi_word_vectors_w3, pmi_word_vectors_w6:
            PMI vectors for window sizes 1, 3 and 6.

    Returns:
        A list of nine dicts, ordered by method ('Counts', 'IDF', 'PMI') and
        then by window size, each with the keys 'Method', 'Window Size',
        'MEN Correlation' and 'SimLex Correlation'.
    """
    window_sizes = (1, 3, 6)
    vectors_by_method = (
        ('Counts', (word_counts_w1, word_counts_w3, word_counts_w6)),
        ('IDF', (idf_word_counts_w1, idf_word_counts_w3, idf_word_counts_w6)),
        ('PMI', (pmi_word_vectors_w1, pmi_word_vectors_w3, pmi_word_vectors_w6)),
    )

    return [
        {
            'Method': method,
            'Window Size': w,
            'MEN Correlation': evaluate_word_vectors(men_dataset, vectors),
            'SimLex Correlation': evaluate_word_vectors(simlex_dataset, vectors),
        }
        for method, per_window in vectors_by_method
        for w, vectors in zip(window_sizes, per_window)
    ]

# The w = 1 tables are not built by any earlier cell of this notebook, so they
# are built here; without them this cell fails with NameError after
# "Restart Kernel -> Run All".
word_counts_w1 = count_word_pairs('/content/wiki-1percent.txt', vocab_V, vocab_VC, 1)
idf_word_counts_w1 = calculate_idf_weighted_vectors('/content/wiki-1percent.txt', word_counts_w1, vocab_VC)
pmi_word_vectors_w1 = calculate_pmi(word_counts_w1, vocab_V, vocab_VC)

# Now run the evaluation for all combinations of methods and window sizes
results = evaluate_all_methods(
    men_dataset, simlex_dataset,
    word_counts_w1, word_counts_w3, word_counts_w6,  # Raw counts for w = 1, 3, 6
    idf_word_counts_w1, idf_word_counts_w3, idf_word_counts_w6,  # IDF for w = 1, 3, 6
    pmi_word_vectors_w1, pmi_word_vectors_w3, pmi_word_vectors_w6  # PMI for w = 1, 3, 6
)

# Display results in a structured format: one row per (method, window size)
import pandas as pd
df_results = pd.DataFrame(results)
print(df_results)

# Analyze the results for trends

def analyze_trends(df_results):
    """Print the best method per window size and the average score per method.

    Args:
        df_results: DataFrame with the columns 'Method', 'Window Size',
            'MEN Correlation' and 'SimLex Correlation', one row per
            (method, window size) combination.

    Returns:
        None. Three tables are printed: the best row per window size on MEN,
        the best row per window size on SimLex-999, and the mean of the two
        correlation columns per method.
    """
    score_columns = ['MEN Correlation', 'SimLex Correlation']

    # Row label of the best-scoring method within each window size
    best_method_men = df_results.groupby('Window Size')['MEN Correlation'].idxmax()
    best_method_simlex = df_results.groupby('Window Size')['SimLex Correlation'].idxmax()

    # Display best methods for each window size for MEN and SimLex
    print("\nBest methods for MEN dataset by window size:")
    print(df_results.loc[best_method_men])

    print("\nBest methods for SimLex-999 dataset by window size:")
    print(df_results.loc[best_method_simlex])

    # Average only the correlation columns: the mean of 'Window Size' is
    # meaningless, and an unrestricted mean() fails on non-numeric columns.
    avg_performance = df_results.groupby('Method')[score_columns].mean()
    print("\nAverage performance of each method across all window sizes:")
    print(avg_performance)

# Summarise the result table: best method per window size, mean score per method
analyze_trends(df_results=df_results)


   Method  Window Size  MEN Correlation  SimLex Correlation
0  Counts            1         0.209092            0.067786
1  Counts            3         0.225140            0.058761
2  Counts            6         0.241067            0.044696
3     IDF            1         0.347589            0.189255
4     IDF            3         0.472851            0.164365
5     IDF            6         0.532401            0.110635
6     PMI            1         0.433603            0.227498
7     PMI            3         0.465632            0.186432
8     PMI            6         0.472408            0.150331

Best methods for MEN dataset by window size:
  Method  Window Size  MEN Correlation  SimLex Correlation
6    PMI            1         0.433603            0.227498
4    IDF            3         0.472851            0.164365
5    IDF            6         0.532401            0.110635

Best methods for SimLex-999 dataset by window size:
  Method  Window Size  MEN Correlation  SimLex Correlation
6    P

### 5. Qualitative Analysis (25 points)

5.1

In [13]:
# Function to get the k nearest neighbors of a word
def get_nearest_neighbors(word, word_vectors, vocab, k=10):
    """Return the ``k`` words most similar to ``word`` by cosine similarity.

    Args:
        word: Query word.
        word_vectors: Mapping word -> sparse vector (a dict).
        vocab: Iterable of candidate words; candidates without a vector and
            the query word itself are skipped.
        k: Maximum number of neighbors to return.

    Returns:
        A list of ``(neighbor, similarity)`` tuples sorted by decreasing
        similarity. An empty list (after printing a message) if ``word`` has
        no vector.
    """
    if word not in word_vectors:
        print(f"Word '{word}' not found in word vectors.")
        return []

    query_vector = word_vectors[word]
    scored = [
        (candidate, cosine_similarity(query_vector, word_vectors[candidate]))
        for candidate in vocab
        if candidate != word and candidate in word_vectors
    ]

    # Most similar first, then keep the top k
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:k]

# Function to print the nearest neighbors for a word with two different window sizes
def compare_nearest_neighbors(word, word_vectors_w1, word_vectors_w6, vocab, k=10):
    """Print the nearest neighbors of ``word`` for w = 1 and then for w = 6.

    Args:
        word: Query word.
        word_vectors_w1: Word vectors built with window size 1.
        word_vectors_w6: Word vectors built with window size 6.
        vocab: Iterable of candidate neighbor words.
        k: Number of neighbors to print per window size.
    """
    def _report(header, vectors):
        # Header first, so any "not found" message appears underneath it
        print(header)
        for neighbor, similarity in get_nearest_neighbors(word, vectors, vocab, k):
            print(f"{neighbor}: {similarity:.4f}")

    _report(f"Nearest neighbors for '{word}' (w = 1):", word_vectors_w1)
    _report(f"\nNearest neighbors for '{word}' (w = 6):", word_vectors_w6)

# Example query: how do the PMI neighbors of "judges" differ between w = 1 and w = 6?
word = "judges"
compare_nearest_neighbors(
    word,
    word_vectors_w1=pmi_word_vectors_w1,
    word_vectors_w6=pmi_word_vectors_w6,
    vocab=vocab_V,
)


Nearest neighbors for 'judges' (w = 1):
judge: 0.2225
players: 0.2115
appeals: 0.1893
officials: 0.1817
ministers: 0.1786
justices: 0.1784
leaders: 0.1729
members: 0.1729
unanimously: 0.1700
contestants: 0.1657

Nearest neighbors for 'judges' (w = 6):
judge: 0.3054
jury: 0.2880
appeals: 0.2774
courts: 0.2740
panel: 0.2740
supreme: 0.2688
justice: 0.2566
contestants: 0.2565
candidates: 0.2488
appeal: 0.2488


5.2

In [14]:
# Function to compare nearest neighbors for a list of words with two different window sizes
def compare_neighbors_by_pos(words, word_vectors_w1, word_vectors_w6, vocab, k=10):
    """Print the w = 1 versus w = 6 neighbor comparison for each word in ``words``.

    The function itself does not look at parts of speech; the caller decides
    which words to pass.

    Args:
        words: Iterable of query words.
        word_vectors_w1: Word vectors built with window size 1.
        word_vectors_w6: Word vectors built with window size 6.
        vocab: Iterable of candidate neighbor words.
        k: Number of neighbors to print per window size.
    """
    for query in words:
        print(f"\nComparing nearest neighbors for '{query}':")
        compare_nearest_neighbors(query, word_vectors_w1, word_vectors_w6, vocab, k=k)

# One query word per part of speech
query_words = dict(
    Noun='judge',
    Verb='run',
    Adjective='happy',
    Preposition='above',
)

# Print the w = 1 versus w = 6 PMI neighbors under a heading for each part of speech
for pos, word in query_words.items():
    print(f"\n=== Part of Speech: {pos} ===")
    compare_neighbors_by_pos(
        [word],
        word_vectors_w1=pmi_word_vectors_w1,
        word_vectors_w6=pmi_word_vectors_w6,
        vocab=vocab_V,
    )



=== Part of Speech: Noun ===

Comparing nearest neighbors for 'judge':
Nearest neighbors for 'judge' (w = 1):
captain: 0.2652
justice: 0.2505
lieutenant: 0.2360
smith: 0.2338
professor: 0.2287
sir: 0.2276
court: 0.2266
king: 0.2238
henry: 0.2236
george: 0.2227

Nearest neighbors for 'judge' (w = 6):
justice: 0.4014
supreme: 0.4009
attorney: 0.3780
court: 0.3560
governor: 0.3530
appeals: 0.3246
mayor: 0.3242
lawyer: 0.3181
deputy: 0.3166
secretary: 0.3158

=== Part of Speech: Verb ===

Comparing nearest neighbors for 'run':
Nearest neighbors for 'run' (w = 1):
runs: 0.2770
running: 0.2723
operate: 0.2373
pass: 0.2269
go: 0.2195
operated: 0.2184
ran: 0.2081
come: 0.2012
held: 0.1946
move: 0.1926

Nearest neighbors for 'run' (w = 6):
running: 0.3625
runs: 0.3173
ran: 0.2953
start: 0.2534
pass: 0.2519
race: 0.2436
away: 0.2394
drive: 0.2336
car: 0.2335
play: 0.2273

=== Part of Speech: Adjective ===

Comparing nearest neighbors for 'happy':
Nearest neighbors for 'happy' (w = 1):
pleased: 

**5.3**

In [15]:
import numpy as np

# Function to compute cosine similarity between two word vectors
def cosine_similarity(vec1, vec2):
    """Cosine similarity of two sparse vectors stored as dicts.

    Keys are dimensions and values are weights; a key that is absent from
    ``vec2`` contributes 0 to the dot product.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either
        vector has zero norm (which includes empty vectors).
    """
    # NOTE(review): this re-defines the cosine_similarity from the earlier
    # cell of section 1.3; this later definition is the one in effect once
    # this cell has run, so the two must be kept in sync (or this one removed).
    def _norm(vec):
        return np.sqrt(sum(weight ** 2 for weight in vec.values()))

    norm1 = _norm(vec1)
    norm2 = _norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0

    # Only dimensions present in vec1 can contribute to the dot product
    dot_product = 0
    for key, weight in vec1.items():
        dot_product += weight * vec2.get(key, 0)
    return dot_product / (norm1 * norm2)

# Function to get the k nearest neighbors of a word
def get_nearest_neighbors(word, word_vectors, vocab, k=10):
    if word not in word_vectors:
        print(f"Word '{word}' not found in word vectors.")
        return []

    similarities = []
    for other_word in vocab:
        if other_word != word and other_word in word_vectors:
            similarity = cosine_similarity(word_vectors[word], word_vectors[other_word])
            similarities.append((other_word, similarity))

    # Sort by similarity and return the top k
    nearest_neighbors = sorted(similarities, key=lambda x: x[1], reverse=True)[:k]
    return nearest_neighbors

# Function to print the nearest neighbors for a word with two different window sizes
def compare_nearest_neighbors(word, word_vectors_w1, word_vectors_w6, vocab, k=10):
    print(f"Nearest neighbors for '{word}' (w = 1):")
    neighbors_w1 = get_nearest_neighbors(word, word_vectors_w1, vocab, k)
    for neighbor, similarity in neighbors_w1:
        print(f"{neighbor}: {similarity:.4f}")

    print(f"\nNearest neighbors for '{word}' (w = 6):")
    neighbors_w6 = get_nearest_neighbors(word, word_vectors_w6, vocab, k)
    for neighbor, similarity in neighbors_w6:
        print(f"{neighbor}: {similarity:.4f}")

# Example query with several senses: PMI neighbors of "bank" for w = 1 versus w = 6
multisense_word = "bank"
compare_nearest_neighbors(
    multisense_word,
    word_vectors_w1=pmi_word_vectors_w1,
    word_vectors_w6=pmi_word_vectors_w6,
    vocab=vocab_V,
)


Nearest neighbors for 'bank' (w = 1):
side: 0.2065
coast: 0.2063
railway: 0.2063
park: 0.2022
africa: 0.1993
banks: 0.1965
corporation: 0.1927
property: 0.1870
railroad: 0.1847
province: 0.1836

Nearest neighbors for 'bank' (w = 6):
capital: 0.3709
corporation: 0.3707
railway: 0.3417
northern: 0.3262
branch: 0.3194
southern: 0.3179
valley: 0.3177
lake: 0.3144
banks: 0.3144
centre: 0.3120
