In [None]:
pip install gensim

#### Q. WAP using Pre-trained Word2Vec embedding to solve the analogy ['Man':'Women'::'King':'Queen']

In [None]:
# Q. WAP using Pre-trained Word2Vec embedding to solve the analogy ['Man':'Women'::'King':'_____']

import gensim.downloader as api

# Load the pre-trained Word2Vec model (Google News vectors)
print("Loading Word2Vec model...")
model = api.load("word2vec-google-news-300")  # ~1.6GB download on first run
print("Model loaded successfully!")

# Analogy: Man : Woman :: King : ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

print("Analogy Result:")
print("'Man' : 'Woman' :: 'King' :", result[0][0])


#### Word2Vec Without using pre-trained model

In [2]:
# Word2Vec Without using pre-trained model
from gensim.models import Word2Vec

# 1. Sample training corpus (you can expand this)
corpus = [
    ['man', 'woman', 'king', 'queen', 'boy', 'girl'],
    ['man', 'is', 'to', 'woman'],
    ['king', 'is', 'to', 'queen'],
    ['prince', 'and', 'princess', 'are', 'royal'],
    ['uncle', 'is', 'to', 'aunt'],
    ['father', 'and', 'mother', 'are', 'parents'],
    ['brother', 'sister', 'siblings'],
    ['husband', 'wife', 'married']
]

# 2. Train a Word2Vec model on the above corpus
model = Word2Vec(sentences=corpus, vector_size=100, window=2, min_count=1, sg=1)

# 3. Perform the analogy: man : woman :: king : ?
result = model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

# 4. Display the result
print("'man' : 'woman' :: 'king' :", result[0][0])


'man' : 'woman' :: 'king' : princess


#### Train a Word2Vec model on a given dataset and display the most similar words to a given word

In [4]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download tokenizer resources
nltk.download('punkt')

# Example dataset (you can replace this with your own text data)
corpus = [
    "Heart disease is one of the leading causes of death.",
    "Exercise and a healthy diet can reduce the risk of heart problems.",
    "Blood pressure and cholesterol are important health indicators.",
    "Early detection of heart disease can save lives.",
    "Lifestyle changes such as quitting smoking can help prevent heart disease."
]

# Tokenize the sentences into words
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Save model (optional)
model.save("word2vec_heart.model")

# Load model (optional)
# model = Word2Vec.load("word2vec_heart.model")

# Find most similar words to a given word
word = "heart"
if word in model.wv:
    print(f"Most similar words to '{word}':")
    similar_words = model.wv.most_similar(word, topn=5)
    for w, score in similar_words:
        print(f"{w}: {score:.4f}")
else:
    print(f"'{word}' not found in vocabulary.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Most similar words to 'heart':
lifestyle: 0.1889
smoking: 0.1886
causes: 0.1611
risk: 0.1599
help: 0.1374


#### Q. Write a function that encodes an input using an autoencoder and then reconstructs it.

In [None]:
"""
    Encodes the input using the encoder of the autoencoder, then reconstructs it using the decoder.

    Parameters:
    - autoencoder: a trained autoencoder model with encoder and decoder attributes.
    - input_tensor: a torch tensor representing the input (e.g., image or feature vector).

    Returns:
    - encoded: the latent representation.
    - reconstructed: the reconstructed input.
    """

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Step 1: Input training sentences
sentences = [
    "Machine learning is amazing.",
    "Natural language processing is a part of machine learning.",
    "TF-IDF is used in text analysis."
]

# Step 2: Create and fit TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentences)
input_data = tfidf_matrix.toarray()
input_dim = input_data.shape[1]

# Step 3: Build and train the autoencoder
def build_autoencoder(input_dim):
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(32, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    encoder = Model(inputs=input_layer, outputs=encoded)
    return autoencoder, encoder

def encode_and_reconstruct(autoencoder, encoder, data):
    encoded_data = encoder.predict(data)
    reconstructed_data = autoencoder.predict(data)
    return encoded_data, reconstructed_data

autoencoder, encoder = build_autoencoder(input_dim)
autoencoder.fit(input_data, input_data, epochs=50, verbose=0)

# Step 4: Encode and reconstruct original data
encoded_data, reconstructed_data = encode_and_reconstruct(autoencoder, encoder, input_data)

# Step 5: Take a new input sentence
new_sentence = ["Machine learning and text analysis are related."]
new_tfidf = vectorizer.transform(new_sentence).toarray()  # Use same vectorizer

# Step 6: Encode and reconstruct the new sentence
new_encoded, new_reconstructed = encode_and_reconstruct(autoencoder, encoder, new_tfidf)

print("\nNew Sentence:")
print(new_sentence[0])
print("\nEncoded Representation:")
print(new_encoded[0])
print("\nReconstructed TF-IDF Vector:")
print(new_reconstructed[0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step

New Sentence:
Machine learning and text analysis are related.

Encoded Representation:
[0.26930112 0.         0.02905112 0.12192447 0.         0.31787345
 0.19716169 0.         0.         0.14121023 0.         0.
 0.02682747 0.         0.38624427 0.         0.01065023 0.2929125
 0.         0.         0.26583052 0.         0.15532437 0.
 0.4641149  0.33174947 0.1414612  0.1612137  0.2896174  0.21501599
 0.19792387 0.        ]

Reconstructed TF-IDF Vector:
[0.32993898 0.3596124  0.4444794  0.47059548 0.46390933 0.41327438
 0.34619245 0.51106167 0.4292871  0.3561594  0.4550676  0.42527857
 0.42811215 0.43288884 0.43990406]


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Step 1: Input sentences
sentences = [
    "Machine learning is amazing.",
    "Natural language processing is a part of machine learning.",
    "TF-IDF is used in text analysis."
]

# Step 2: Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer()

# Step 3: Fit and transform the sentences into TF-IDF vectors
tfidf_matrix = vectorizer.fit_transform(sentences)

# Step 4: Convert to a DataFrame for readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Step 5: Display the TF-IDF DataFrame
print("TF-IDF Matrix:")
print(tfidf_df)


TF-IDF Matrix:
    amazing  analysis       idf        in        is  language  learning  \
0  0.631745  0.000000  0.000000  0.000000  0.373119  0.000000  0.480458   
1  0.000000  0.000000  0.000000  0.000000  0.231559  0.392063  0.298174   
2  0.000000  0.396875  0.396875  0.396875  0.234400  0.000000  0.000000   

    machine   natural        of      part  processing      text        tf  \
0  0.480458  0.000000  0.000000  0.000000    0.000000  0.000000  0.000000   
1  0.298174  0.392063  0.392063  0.392063    0.392063  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000    0.000000  0.396875  0.396875   

       used  
0  0.000000  
1  0.000000  
2  0.396875  
