In [1]:
pip install numpy pandas scikit-learn gensim


Collecting numpy
Note: you may need to restart the kernel to use updated packages.

  Downloading numpy-2.2.5-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 131.3 kB/s eta 0:00:01
     ------------ ------------------------- 20.5/60.8 kB 131.3 kB/s eta 0:00:01
     ------------ ------------------------- 20.5/60.8 kB 131.3 kB/s eta 0:00:01
     ------------ ------------------------- 20.5/60.8 kB 131.3 kB/s eta 0:00:01
     ------------ ------------------------- 20.5/60.8 kB 131.3 kB/s eta 0:00:01
     ------------ ------------------------- 20.5/60.8 kB 131.3 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/60.8 kB 100.9 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 119.9 kB/s et


[notice] A new release of pip is available: 24.0 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

# Function to get user input
def get_user_input():
    """Collect sentences typed by the user until the sentinel 'exit' is entered.

    Reads one line at a time with ``input()``. Reading stops when a line,
    ignoring case and surrounding whitespace, equals ``exit``, or when the
    input stream ends (``EOFError``). Lines that are empty or contain only
    whitespace are skipped, so the returned list never holds an empty document.

    Returns:
        list[str]: The accepted sentences, in the order entered and exactly
        as typed.
    """
    print("Enter your sentences (type 'exit' to finish):")
    documents = []
    while True:
        try:
            sentence = input()
        except EOFError:
            # Input ran out (e.g. piped stdin was exhausted): treat it like the
            # sentinel instead of failing with a traceback.
            break
        stripped = sentence.strip()
        # Compare the trimmed text so " exit " or "Exit\t" also ends the loop.
        if stripped.lower() == 'exit':
            break
        if not stripped:
            # A blank line carries no tokens; do not record it as a document.
            continue
        documents.append(sentence)
    return documents

# Get user input
documents = get_user_input()

# Check if any documents were entered
if not documents:
    print("No documents were entered. Exiting.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # the `site` module provides for interactive shells, and inside a notebook
    # it typically asks the kernel itself to shut down rather than just
    # stopping this cell. Exit status 0 matches the original exit() call.
    raise SystemExit(0)

# Bag-of-Words Approach
# Count Occurrence: one row per document, one column per vocabulary term.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)
count_array = count_matrix.toarray()
count_df = pd.DataFrame(count_array, columns=count_vectorizer.get_feature_names_out())
print("\nCount Occurrence:\n", count_df)

# Normalized Count Occurrence: each count divided by its document's total.
# A document can end up with no counted tokens at all (the vectorizer's default
# token pattern only keeps tokens of two or more characters), which makes its
# total 0. Dividing by 0 would fill that row with NaN, so zero totals are
# replaced by 1 and the row stays all zeros.
row_totals = count_df.sum(axis=1)
normalized_count_df = count_df.div(row_totals.where(row_totals > 0, 1), axis=0)
print("\nNormalized Count Occurrence:\n", normalized_count_df)

# TF-IDF
# Fit a fresh vectorizer on the entered documents and lay the resulting
# weights out as a table: one row per document, one column per term.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_array = tfidf_matrix.toarray()
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_array, columns=tfidf_terms)
print("\nTF-IDF:\n", tfidf_df)

# Word2Vec
# Preprocessing the documents into a list of lists (tokenized).
# Tokens are lower-cased and split on whitespace only.
tokenized_docs = [doc.lower().split() for doc in documents]

# Create Word2Vec model.
# A single worker thread is used: gensim's documentation notes that
# multi-threaded training introduces ordering jitter, and on a corpus this
# small extra threads buy nothing. NOTE(review): per the same docs, repeatable
# results across interpreter launches may also need a fixed PYTHONHASHSEED.
word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
)

# Example: Get the vector for a specific word.
# The vocabulary only holds lower-cased, whitespace-free tokens, so the query
# is normalised the same way; otherwise "Nice" or "nice " could never match.
word_to_check = input("\nEnter a word to get its vector: ").strip().lower()
if word_to_check in word2vec_model.wv:
    word_vector = word2vec_model.wv[word_to_check]
    print(f"\nWord2Vec - Vector for '{word_to_check}':\n", word_vector)

    # Example: Find similar words (only meaningful for an in-vocabulary word).
    similar_words = word2vec_model.wv.most_similar(word_to_check, topn=5)
else:
    print(f"The word '{word_to_check}' is not in the vocabulary.")
    similar_words = []

# The list can still be empty for a known word, e.g. when the vocabulary has
# no other entries to compare against.
if similar_words:
    print(f"\nWords similar to '{word_to_check}':\n", similar_words)

Enter your sentences (type 'exit' to finish):

Count Occurrence:
    are  lookiing  lov  nice  you
0    0         0    1     0    1
1    1         1    0     1    0

Normalized Count Occurrence:
         are  lookiing  lov      nice  you
0  0.000000  0.000000  0.5  0.000000  0.5
1  0.333333  0.333333  0.0  0.333333  0.0

TF-IDF:
        are  lookiing       lov     nice       you
0  0.00000   0.00000  0.707107  0.00000  0.707107
1  0.57735   0.57735  0.000000  0.57735  0.000000
The word '' is not in the vocabulary.
