In [None]:
# How a word is stored as a vector

import spacy

# Load the pre-trained spaCy pipeline that provides the word embeddings
nlp = spacy.load("en_core_web_md")

# Run the pipeline on the single word, then read the vector off the result
doc = nlp("king")
vector = doc.vector

# Show the numeric representation
print(vector)


In [None]:
#Count Based Models
from sklearn.feature_extraction.text import CountVectorizer

# Define a small corpus: one raw string per document
corpus = [
    "I am doing my work with apple",
    "I ate an apple"
]

# Initialize a count vectorizer (bag-of-words model).
# NOTE: with its default settings CountVectorizer lowercases the text and
# keeps only tokens of two or more word characters, so "I" is not expected
# to appear in the vocabulary.
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and build the document-term matrix
X = vectorizer.fit_transform(corpus)

# Print the vocabulary (one entry per matrix column) and the per-document counts.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
print(vectorizer.get_feature_names_out())
print(X.toarray())


In [None]:
#Prediction Based Model - Word2Vec
import gensim
from gensim.models import Word2Vec

# Define a corpus of text: each sentence is a list of tokens
corpus = [
    ["I", "am", "doing", "my", "work", "with", "apple"],
    ["I", "ate", "an", "apple"]
]

# Train a Word2Vec model.
# gensim 4.0 renamed the `size` argument to `vector_size`; passing `size`
# raises a TypeError on current gensim releases.
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)

# Get the vector representation for a word
vector = model.wv["apple"]

# Print the vector representation
print(vector)


In [None]:
#Model with and without Word Embedding
from sklearn.metrics import classification_report
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Hyperparameters for this experiment. They were previously referenced without
# being defined anywhere in the notebook, so the cell failed with NameError on
# a fresh kernel. These are example values; tune them as needed.
vocab_size = 10000      # keep only the most frequent words
max_length = 200        # every review is padded/truncated to this many tokens
embedding_size = 16     # dimensionality of the learned word vectors
num_epochs = 5
batch_size = 32


def predict_labels(model, data):
    """Return hard 0/1 class labels from a model with a single sigmoid output.

    `Sequential.predict_classes` was removed in TensorFlow 2.6; thresholding
    the predicted probabilities at 0.5 is the documented replacement for
    binary classifiers. The result is flattened to 1-D so it lines up with
    the label vector passed to `classification_report`.
    """
    probabilities = model.predict(data)
    return (probabilities > 0.5).astype("int32").ravel()


# Load the IMDB dataset, restricted to the `vocab_size` most frequent words
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

# Pad the sequences to a fixed length
train_data = pad_sequences(train_data, maxlen=max_length)
test_data = pad_sequences(test_data, maxlen=max_length)

# Train a classification model without embeddings:
# the raw integer word indices are fed straight into a single dense unit
model_without_embeddings = Sequential([
    Flatten(input_shape=(max_length,)),
    Dense(units=1, activation="sigmoid")
])
model_without_embeddings.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model_without_embeddings.fit(train_data, train_labels, epochs=num_epochs, batch_size=batch_size, validation_split=0.2)

# Train a classification model with embeddings:
# each word index is first mapped to a learned `embedding_size`-dim vector
model_with_embeddings = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_length),
    Flatten(),
    Dense(units=1, activation="sigmoid")
])
model_with_embeddings.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model_with_embeddings.fit(train_data, train_labels, epochs=num_epochs, batch_size=batch_size, validation_split=0.2)

# Evaluate the models on the held-out test set
print("Model without embeddings:")
print(classification_report(test_labels, predict_labels(model_without_embeddings, test_data)))

print("Model with embeddings:")
print(classification_report(test_labels, predict_labels(model_with_embeddings, test_data)))


In [None]:
#CBOW Implementation
from keras.models import Sequential
from keras.layers import Embedding, Dense, Lambda, GlobalAveragePooling1D
# `keras.utils.np_utils` is no longer importable in current Keras releases;
# `to_categorical` is available directly from `keras.utils`.
from keras.utils import to_categorical
import numpy as np

# Generate some example data
texts = ["I am doing my work with apple and ate an apple"]
words = [t.lower().split() for t in texts]
vocab = sorted(set(word for sentence in words for word in sentence))

# Create word to index and index to word mappings.
# Index 0 is reserved for padding, so real words start at 1. Padding is needed
# because words near the sentence edges have fewer than 2*window_size
# neighbours, and the model input must be a rectangular array.
PAD_IDX = 0
word2idx = {w: i + 1 for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
num_tokens = len(vocab) + 1  # vocabulary plus the padding slot

# Set up the CBOW model
window_size = 2
vector_size = 100
context_len = window_size * 2
model = Sequential()
# mask_zero=True marks padded positions so the pooling layer can skip them
model.add(Embedding(input_dim=num_tokens, output_dim=vector_size, mask_zero=True))
# Average the context embeddings with a real Keras layer. The previous
# Lambda(np.mean) applied a NumPy function to a symbolic tensor.
model.add(GlobalAveragePooling1D())
model.add(Dense(num_tokens, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Generate training data: (padded context indices) -> (target word index)
contexts = []
targets = []
for sentence in words:
    for i, word in enumerate(sentence):
        context_words = sentence[max(0, i - window_size):i] + sentence[i + 1:i + window_size + 1]
        if not context_words:
            # A one-word sentence has no context; a fully masked row would
            # make the masked average undefined, so skip it.
            continue
        context_idxs = [word2idx[w] for w in context_words]
        context_idxs += [PAD_IDX] * (context_len - len(context_idxs))
        contexts.append(context_idxs)
        targets.append(word2idx[word])

X_train = np.array(contexts, dtype="int32")
y_train = to_categorical(targets, num_classes=num_tokens)

# Train the model
model.fit(X_train, y_train, epochs=100, verbose=0)

# Get the learned embeddings (first weight matrix = embedding table)
embeddings = model.get_weights()[0]

# Print the embeddings for the word 'apple'
print(embeddings[word2idx['apple']])



In [None]:
#Word2Vec CBOW Implementation
from gensim.models import Word2Vec

# Define a list of tokenized sentences to train the model
sentences = [["I", "am", "doing", "my", "work", "with", "apple", "and", "ate", "an", "apple"]]

# Set the window size and vector dimension
window_size = 2
vector_size = 100

# Train the Word2Vec CBOW model (sg=0 selects CBOW rather than skip-gram).
# gensim 4.0 renamed the `size` argument to `vector_size`; passing `size`
# raises a TypeError on current gensim releases.
model = Word2Vec(sentences, vector_size=vector_size, window=window_size, min_count=1, workers=4, sg=0)

# Get the learned vector representation for the word 'apple'
print(model.wv['apple'])



In [None]:
#FastText
from gensim.models import FastText

# Define a corpus of tokenized sentences
corpus = [["the", "cat", "is", "on", "the", "mat"],
          ["the", "dog", "is", "in", "the", "yard"],
          ["the", "bird", "is", "flying", "in", "the", "sky"]]

# Train a FastText model on the corpus.
# gensim 4.0 renamed the `size` argument to `vector_size`; passing `size`
# raises a TypeError on current gensim releases.
model = FastText(corpus, vector_size=100, window=5, min_count=1, workers=4)

# Get the embedding vector for a word
vector = model.wv['cat']
print(vector)
