In [2]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# --- Step 1: Load the pre-trained Word2Vec model ---
# This will download the model (approx. 1.6 GB) on the first run.
# Subsequent runs will load it from the local cache.
print("Loading the 'word2vec-google-news-300' model... (This may take a while)")
try:
    # Keep the try body minimal so only the load itself is guarded.
    model = api.load('word2vec-google-news-300')
except Exception as e:
    print(f"Failed to load model. Error: {e}")
    print("Please check your internet connection or try again later.")
    # Re-raise rather than calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas re-raising stops this cell (and "Run All") with the
    # original traceback and leaves the kernel usable for a retry.
    raise
else:
    print("Model loaded successfully!")

# The loaded model contains word vectors, but not the full training model.
# For our purposes, this is exactly what we need.

# Section banner: blank line, rule, title, rule.
print("", "=" * 50, "Part 1: Finding Similar Words", "=" * 50, sep="\n")

# --- Step 2: Pick 5 words and find their most similar words ---
words_to_check = ['galaxy', 'keyboard', 'music', 'running', 'happy']

for word in words_to_check:
    print(f"\n--- Words most similar to '{word}' ---")
    try:
        # most_similar() returns the top-N entries as (word, similarity_score) tuples.
        neighbors = model.most_similar(word, topn=5)
    except KeyError:
        # Raised when the query word is missing from the model's vocabulary.
        print(f"Sorry, the word '{word}' is not in the vocabulary.")
        continue

    # One row per neighbor; the word is left-aligned in a 20-character column.
    for neighbor, similarity in neighbors:
        print(f"{neighbor:<20} Similarity: {similarity:.4f}")

print("\n" + "="*50)
print("Part 2: Word Analogy Experiments")
print("="*50)

# --- Step 3: Perform word analogy tests ---
# An analogy "B is to A as C is to D" is solved as
#     vector(D) ~= vector(A) - vector(B) + vector(C)
# most_similar() expresses this through its `positive` and `negative` parameters:
#     D = most_similar(positive=[A, C], negative=[B])


def run_analogy(vectors, label, positive, negative, expected, success_note):
    """Run one analogy query, print the top match, and check it against `expected`.

    Args:
        vectors: Object exposing most_similar(positive=..., negative=..., topn=...);
            here, the loaded word-vector model.
        label: Heading text printed above the result.
        positive: Words passed as the `positive` terms (A and C above).
        negative: Words passed as the `negative` terms (B above).
        expected: The word the analogy is expected to produce.
        success_note: Observation printed only when the top match equals `expected`.

    Returns:
        The list returned by most_similar(), or None if a query word is not in
        the vocabulary.
    """
    print(f"\n--- {label} ---")
    try:
        matches = vectors.most_similar(positive=positive, negative=negative, topn=1)
    except KeyError as e:
        print(f"A word in the analogy was not found in the vocabulary: {e}")
        return None

    top_word, top_score = matches[0]
    print(f"Result: {top_word} (Similarity: {top_score:.4f})")
    # Only claim success when the model actually produced the expected word;
    # otherwise report the mismatch instead of printing a false observation.
    if top_word == expected:
        print(f"Observation: {success_note}")
    else:
        print(f"Observation: Expected '{expected}', but the model returned '{top_word}'.")
    return matches


analogies = [
    # Analogy 1: the classic "man is to king as woman is to ?" -> queen
    {
        "label": "Analogy 1: king - man + woman ~= ?",
        "positive": ['king', 'woman'],
        "negative": ['man'],
        "expected": 'queen',
        "success_note": "The model correctly identifies 'queen' as the answer.",
    },
    # Analogy 2: country-capital, "Paris is to France as Berlin is to ?" -> Germany
    {
        "label": "Analogy 2: France - Paris + Berlin ~= ?",
        "positive": ['France', 'Berlin'],
        "negative": ['Paris'],
        "expected": 'Germany',
        "success_note": "The model successfully finds 'Germany', capturing the country-capital relationship.",
    },
    # Analogy 3: verb form, "walk is to walking as swim is to ?" -> swimming
    {
        "label": "Analogy 3: walking - walk + swim ~= ?",
        "positive": ['walking', 'swim'],
        "negative": ['walk'],
        "expected": 'swimming',
        "success_note": "The model understands grammatical relationships, correctly identifying 'swimming'.",
    },
]

for analogy in analogies:
    result = run_analogy(model, **analogy)

Loading the 'word2vec-google-news-300' model... (This may take a while)
Model loaded successfully!

Part 1: Finding Similar Words

--- Words most similar to 'galaxy' ---
galaxies             Similarity: 0.7880
Milky_Way_galaxy     Similarity: 0.7715
Milky_Way_Galaxy     Similarity: 0.7398
Milky_Way            Similarity: 0.7311
distant_galaxy       Similarity: 0.7264

--- Words most similar to 'keyboard' ---
keyboards            Similarity: 0.7883
Keyboard             Similarity: 0.7165
touchpad             Similarity: 0.7083
trackpad             Similarity: 0.7040
keypad               Similarity: 0.6944

--- Words most similar to 'music' ---
classical_music      Similarity: 0.7198
jazz                 Similarity: 0.6835
Music                Similarity: 0.6596
Without_Donny_Kirshner Similarity: 0.6416
songs                Similarity: 0.6396

--- Words most similar to 'running' ---
Running              Similarity: 0.6979
ran                  Similarity: 0.6085
run                  Simil

In [1]:
!pip install gensim


