# Task_4_Evaluation

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import plotly.graph_objects as go

Loading the Precomputed Matrices

In [3]:
def load_saved_objects(folder_path="."):
    """Load the saved embedding matrices and vocabulary from `folder_path`.

    Reads, in this order, `ppmi_matrix.npz`, `svd_matrix.npy`,
    `word2vec_embeddings.npy` and `id_to_word.pkl`, then builds the reverse
    vocabulary mapping by swapping the keys and values of `id_to_word`.

    Args:
        folder_path: Directory that holds the four files (defaults to the
            current working directory).

    Returns:
        The tuple `(ppmi_matrix, svd_matrix, word2vec_embeddings, word_to_id,
        id_to_word)`, or `None` when any of the files is missing (an error
        message is printed in that case).
    """
    print("Loading pre-computed model files")

    def artifact(filename):
        # Resolve a file name relative to the requested folder.
        return os.path.join(folder_path, filename)

    try:
        ppmi_matrix = load_npz(artifact('ppmi_matrix.npz'))
        svd_matrix = np.load(artifact('svd_matrix.npy'))
        word2vec_embeddings = np.load(artifact('word2vec_embeddings.npy'))
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at vocabulary files produced by a trusted pipeline.
        with open(artifact('id_to_word.pkl'), 'rb') as handle:
            id_to_word = pickle.load(handle)
    except FileNotFoundError as e:
        print(f"ERROR: A required file was not found: {e}")
        print("Please make sure all .npz, .npy, and .pkl files are in the same directory as this script.")
        return None
    else:
        # Invert id -> word into word -> id (later ids win on duplicate words).
        word_to_id = dict(zip(id_to_word.values(), id_to_word.keys()))
        print("All files loaded successfully.")
        return ppmi_matrix, svd_matrix, word2vec_embeddings, word_to_id, id_to_word

Similarity Search and PCA Plots

In [4]:
def find_most_similar(query_word, matrix, word_to_id, id_to_word, top_n=20):
    """Return the words whose vectors are most cosine-similar to `query_word`.

    Args:
        query_word: Word to look up.
        matrix: Embedding matrix indexed by word id; row `word_to_id[w]` is
            used as the vector of `w`.
        word_to_id: Mapping from word to row index in `matrix`.
        id_to_word: Mapping from row index back to the word.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of up to `top_n` words ordered from most to least similar. The
        query word itself is never included. When `query_word` is not in
        `word_to_id`, a list of `top_n` copies of the placeholder string
        "Word not in vocabulary" is returned instead.
    """
    if query_word not in word_to_id:
        return ["Word not in vocabulary"] * top_n

    query_id = word_to_id[query_word]
    query_vector = matrix[query_id].reshape(1, -1)

    sim_scores = cosine_similarity(query_vector, matrix).flatten()

    # Rank every row from most to least similar, then drop the query's own row
    # explicitly. Skipping "the first entry" is not safe: a row that ties with
    # the query (e.g. a duplicate vector) can be ranked ahead of it, which
    # would return the query as its own neighbour.
    ranked = np.argsort(sim_scores)[::-1]
    neighbour_ids = ranked[ranked != query_id][:top_n]

    return [id_to_word[i] for i in neighbour_ids]

def visualize_word_neighborhood(query_word, similar_words, matrix, word_to_id, model_name):
    """Plot a 2-D PCA projection of a query word and its neighbours.

    Args:
        query_word: Word drawn as the highlighted (red star) point.
        similar_words: Neighbour words drawn as blue points. Entries that are
            not keys of `word_to_id` (for example the placeholder strings
            `find_most_similar` returns for an unknown query) are dropped so
            that every plotted point keeps its own label.
        matrix: Embedding matrix indexed by word id; row `word_to_id[w]` is
            used as the vector of `w`. If the selected rows are not a NumPy
            array they are densified with `.toarray()`.
        word_to_id: Mapping from word to row index in `matrix`.
        model_name: Label used in the progress message and the plot title.

    Returns:
        None. The figure is displayed with `fig.show()`. Nothing is drawn (a
        message is printed instead) when `query_word` is not in `word_to_id`
        or when none of `similar_words` is.
    """
    print(f"Generating PCA plot for '{query_word}' using {model_name}...")

    # Without a vector for the query there is no point to highlight; plotting
    # anyway would label the first neighbour as the query word.
    if query_word not in word_to_id:
        print(f"Skipping plot: '{query_word}' is not in the vocabulary.")
        return

    # Filter once and derive both the labels and the row indices from the same
    # list so that points and labels cannot drift out of alignment.
    neighbor_words = [w for w in similar_words if w in word_to_id]
    if not neighbor_words:
        # PCA with two components needs at least two samples.
        print(f"Skipping plot: no in-vocabulary neighbors for '{query_word}'.")
        return

    words_to_plot = [query_word] + neighbor_words
    word_indices = [word_to_id[w] for w in words_to_plot]

    # Get the corresponding vectors
    vectors = matrix[word_indices]
    if not isinstance(vectors, np.ndarray):  # Handle sparse matrices
        vectors = vectors.toarray()

    # Apply PCA to reduce to 2D
    pca = PCA(n_components=2, random_state=42)
    vectors_2d = pca.fit_transform(vectors)

    # Create the plot
    fig = go.Figure()

    # Add similar words (blue points); row 0 is the query, rows 1.. the neighbours
    fig.add_trace(go.Scatter(
        x=vectors_2d[1:, 0],
        y=vectors_2d[1:, 1],
        mode='markers+text',
        text=neighbor_words,
        textposition="top center",
        marker=dict(color='blue', size=8),
        name='Similar Words'
    ))

    # Add the query word (red, larger point)
    fig.add_trace(go.Scatter(
        x=[vectors_2d[0, 0]],
        y=[vectors_2d[0, 1]],
        mode='markers+text',
        text=[query_word],
        textposition="bottom center",
        marker=dict(color='red', size=12, symbol='star'),
        name='Query Word'
    ))

    fig.update_layout(
        title=f"PCA Visualization for '{query_word}' and its Neighbors ({model_name})",
        xaxis_title="Principal Component 1",
        yaxis_title="Principal Component 2",
        showlegend=True
    )

    fig.show()

Main Execution

In [5]:
def main():
    """Run the similarity evaluation for one keyword per topic.

    Loads the saved matrices, and for every (topic, keyword) pair looks up the
    most similar words under each of the three models, records them as one
    table row, and draws one PCA neighbourhood plot per model. Finally prints
    the collected rows as a table. Returns early (printing nothing further)
    when the saved objects cannot be loaded.
    """
    artifacts = load_saved_objects()
    if artifacts is None:
        return

    ppmi_matrix, svd_matrix, w2v_matrix, word_to_id, id_to_word = artifacts

    # One entry per model: (results-table column, plot label, embedding matrix).
    models = (
        ("Top 20 Similar Words (VSM)", "VSM (PPMI)", ppmi_matrix),
        ("Top 20 Similar Words (SVD)", "SVD", svd_matrix),
        ("Top 20 Similar Words (W2V Skipgram)", "Word2Vec", w2v_matrix),
    )

    # One representative query keyword per topic, processed in this order.
    topic_keywords = (
        ('tech', 'software'),
        ('sport', 'game'),
        ('politics', 'election'),
        ('entertainment', 'film'),
        ('business', 'market'),
    )

    table_rows = []

    for topic, keyword in topic_keywords:
        print(f"--- Processing keyword for '{topic}': '{keyword}' ---")

        # Neighbour lists for all models are computed before any plotting.
        neighbour_lists = [
            find_most_similar(keyword, model_matrix, word_to_id, id_to_word)
            for _, _, model_matrix in models
        ]

        row = {"Query Word": f"{keyword} ({topic})"}
        for (column, _, _), words in zip(models, neighbour_lists):
            row[column] = ", ".join(words)
        table_rows.append(row)

        for (_, plot_label, model_matrix), words in zip(models, neighbour_lists):
            visualize_word_neighborhood(keyword, words, model_matrix, word_to_id, plot_label)

    # Display the final results table
    results_df = pd.DataFrame(table_rows)
    print("\n\n FINAL SIMILARITY RESULTS TABLE ")
    print(results_df.to_string())

# Entry point: run the evaluation only when this code is executed directly
# (its module name is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Loading pre-computed model files
All files loaded successfully.
--- Processing keyword for 'tech': 'software' ---
Generating PCA plot for 'software' using VSM (PPMI)...


Generating PCA plot for 'software' using SVD...


Generating PCA plot for 'software' using Word2Vec...


--- Processing keyword for 'sport': 'game' ---
Generating PCA plot for 'game' using VSM (PPMI)...


Generating PCA plot for 'game' using SVD...


Generating PCA plot for 'game' using Word2Vec...


--- Processing keyword for 'politics': 'election' ---
Generating PCA plot for 'election' using VSM (PPMI)...


Generating PCA plot for 'election' using SVD...


Generating PCA plot for 'election' using Word2Vec...


--- Processing keyword for 'entertainment': 'film' ---
Generating PCA plot for 'film' using VSM (PPMI)...


Generating PCA plot for 'film' using SVD...


Generating PCA plot for 'film' using Word2Vec...


--- Processing keyword for 'business': 'market' ---
Generating PCA plot for 'market' using VSM (PPMI)...


Generating PCA plot for 'market' using SVD...


Generating PCA plot for 'market' using Word2Vec...




 FINAL SIMILARITY RESULTS TABLE 
             Query Word                                                                                                                                                    Top 20 Similar Words (VSM)                                                                                                                                                                Top 20 Similar Words (SVD)                                                                                                                                                                        Top 20 Similar Words (W2V Skipgram)
0       software (tech)  microsoft, programs, users, antivirus, windows, program, computer, spyware, security, pcs, system, microsofts, use, internet, web, files, people, patents, tools, technology  programs, microsoft, windows, users, program, microsofts, linux, computer, pcs, antivirus, security, firewall, tools, harbouring, system, precursor, spyware, update, manually, int