In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
import logging
warnings.filterwarnings('ignore')

from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn.manifold import TSNE
import tqdm
import nltk
from nltk.tokenize import sent_tokenize
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec

In [2]:
df = pd.read_csv('pre_processed_data.csv')

In [3]:
# Summary of the title column: a `unique` count below `count` means some titles repeat.
df['Book_title'].describe(include='all') # I think we have some duplicate titles

count                             43055
unique                            35443
top       Neural Information Processing
freq                                 71
Name: Book_title, dtype: object

In [4]:
corpus = df['processed_TSD'].tolist()

In [5]:
tokens = [doc.split() for doc in corpus]

In [6]:
# Train the Word2Vec model; gensim reports its progress through the logging module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
model = Word2Vec(
    sentences=tokens,
    vector_size=150,  # dimensionality of each word vector
    window=10,        # context window size
    min_count=3,      # minimum word count kept in the vocabulary
    epochs=50,        # training passes over the corpus
)

2024-04-12 16:26:05,150 : INFO : collecting all words and their counts
2024-04-12 16:26:05,153 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-04-12 16:26:05,206 : INFO : PROGRESS: at sentence #10000, processed 320611 words, keeping 17658 word types
2024-04-12 16:26:05,256 : INFO : PROGRESS: at sentence #20000, processed 610484 words, keeping 28542 word types
2024-04-12 16:26:05,318 : INFO : PROGRESS: at sentence #30000, processed 927250 words, keeping 39541 word types
2024-04-12 16:26:05,393 : INFO : PROGRESS: at sentence #40000, processed 1251312 words, keeping 52677 word types
2024-04-12 16:26:05,417 : INFO : collected 60463 word types from a corpus of 1363742 raw words and 43055 sentences
2024-04-12 16:26:05,418 : INFO : Creating a fresh vocabulary
2024-04-12 16:26:05,501 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=3 retains 21996 unique words (36.38% of original 60463, drops 38467)', 'datetime': '2024-04-12T16:26:05.501596', 'gens

2024-04-12 16:26:34,174 : INFO : EPOCH 26: training on 1363742 raw words (1238975 effective words) took 1.0s, 1282665 effective words/s
2024-04-12 16:26:35,115 : INFO : EPOCH 27: training on 1363742 raw words (1238915 effective words) took 0.9s, 1323032 effective words/s
2024-04-12 16:26:36,070 : INFO : EPOCH 28: training on 1363742 raw words (1239155 effective words) took 1.0s, 1303221 effective words/s
2024-04-12 16:26:37,035 : INFO : EPOCH 29: training on 1363742 raw words (1239125 effective words) took 1.0s, 1289551 effective words/s
2024-04-12 16:26:38,014 : INFO : EPOCH 30: training on 1363742 raw words (1239072 effective words) took 1.0s, 1270509 effective words/s
2024-04-12 16:26:38,959 : INFO : EPOCH 31: training on 1363742 raw words (1239126 effective words) took 0.9s, 1317523 effective words/s
2024-04-12 16:26:39,948 : INFO : EPOCH 32: training on 1363742 raw words (1238676 effective words) took 1.0s, 1257539 effective words/s
2024-04-12 16:26:40,914 : INFO : EPOCH 33: train

In [7]:
# testing the vectorization
model.wv['python']

array([-1.0463874e+00,  2.3745739e-01,  1.1524779e+00, -3.2335134e+00,
        1.8317308e+00,  1.8986974e+00, -2.6661327e-01, -5.6392711e-01,
       -3.5114014e+00, -1.4658606e+00, -4.9498877e-01,  5.9730238e-01,
        1.4803756e+00,  8.3106005e-01,  1.2430992e+00, -1.0062078e+00,
        2.3347123e+00, -1.6758838e+00,  7.1546173e-01,  1.6513917e+00,
        3.5663350e+00, -3.0902916e-01, -6.7405634e+00, -3.3622959e-01,
       -3.5857004e-01,  2.2856688e+00, -2.7633035e-03, -1.8508492e+00,
        1.7088227e+00,  2.6661983e-01,  7.3444605e-01,  1.7038507e+00,
       -1.3743967e+00,  7.9821593e-01, -2.8037221e+00, -1.0213603e+00,
        1.5203537e-01,  1.2047384e+00, -1.3095447e+00,  2.7986652e-01,
        1.3065265e-01, -2.3555694e+00, -3.3793046e+00, -1.0187489e+00,
       -1.3476597e+00,  1.3745914e+00, -2.9639947e+00,  4.6438134e-01,
        5.1639595e+00,  3.6093204e+00,  7.0380390e-01, -6.6726321e-01,
        1.9238764e-01, -9.1863817e-01,  4.4579484e-02, -1.2994635e+00,
      

In [8]:
model.wv.most_similar('python') # Checking the most similar words

[('panda', 0.43121689558029175),
 ('handson', 0.4162122905254364),
 ('numpy', 0.3979624807834625),
 ('r', 0.39123639464378357),
 ('chatgpt', 0.39102572202682495),
 ('pythonbased', 0.38491642475128174),
 ('linux', 0.3808121085166931),
 ('basic', 0.3770609498023987),
 ('jupyter', 0.37458059191703796),
 ('learn', 0.37349462509155273)]

In [9]:
# list of keys (words) that have been vectorized and stored in the model
model.wv.index_to_key 

['unk',
 'book',
 'international',
 'conference',
 'proceeding',
 'th',
 'application',
 'guide',
 'system',
 'paper',
 'held',
 's',
 'data',
 'new',
 'constitutes',
 'life',
 'programming',
 'learning',
 'development',
 'revised',
 'network',
 'intelligence',
 'using',
 'refereed',
 'business',
 'world',
 'technology',
 'design',
 'full',
 'computer',
 'computing',
 'practical',
 'management',
 'part',
 'workshop',
 'learn',
 'nbsp',
 'artificial',
 'one',
 'language',
 'information',
 'selected',
 'presented',
 'edition',
 'work',
 'technique',
 'time',
 'volume',
 'web',
 'need',
 'first',
 'research',
 'practice',
 'september',
 'game',
 'approach',
 'c',
 'use',
 'science',
 'machine',
 'year',
 'way',
 'theory',
 'author',
 'provides',
 'help',
 'build',
 'power',
 'tool',
 'set',
 'analysis',
 'python',
 'knowledge',
 'software',
 'database',
 'de',
 'algorithm',
 'many',
 'people',
 'java',
 'make',
 'present',
 'get',
 'introduction',
 'experience',
 'july',
 'ai',
 'method',

In [10]:
# Build a word -> vector dictionary covering every word in the model vocabulary.
word_embeddings = {word: model.wv.get_vector(word) for word in model.wv.index_to_key}

In [11]:
# dictionary of word->vector format
# search this dictionary if you want to inspect any words ---> remove this comment later.
word_embeddings

{'unk': array([-0.84848166,  2.156147  ,  0.52235454,  3.8403041 , -1.1397705 ,
        -0.8691388 ,  1.7337394 , -1.3517392 , -1.8937314 ,  2.532183  ,
        -0.28390786,  1.6236775 ,  0.04818699, -0.48920718, -0.23926413,
        -0.6692589 ,  1.5104338 , -0.1391903 , -0.66974497,  0.69472325,
         0.36247048,  1.3683316 , -2.6643777 , -1.4229707 ,  0.6086361 ,
         0.30263793,  0.2707731 ,  0.15204875, -0.46857455,  0.47297707,
        -0.3860763 ,  0.63365483, -1.2014909 , -3.1203806 ,  1.5818788 ,
        -2.0543697 ,  3.2447646 , -2.85209   ,  1.1081257 ,  0.71245164,
        -2.1663897 , -2.192301  , -1.161882  ,  1.7706459 , -0.6669843 ,
        -0.7841027 , -1.8403205 , -0.04851915,  0.30677783,  1.7072142 ,
        -2.33893   , -0.7182423 , -0.37567165, -0.6488669 , -0.46784264,
        -0.42780253,  0.93891215, -1.9702221 , -0.7478416 , -0.32571036,
         0.85910606, -0.15737486, -0.92751867, -0.84432995, -0.34428984,
         1.0654114 ,  0.37741143, -0.1932844

key points:
- We notice tokens like 's' in the embeddings. This occurs because the tokenizer can treat 's' as a token of its own: for a contraction such as "it's", tokenization produces the tokens 'it' and 's', which is why 's' shows up here.
- For words in contracted form, tokenization creates 2 separate tokens. Generally, the part after the apostrophe in a contraction has no more than 2 letters.

In [12]:
# Very short tokens (one or two characters) that were vectorized from the corpus
[token for token in word_embeddings if len(token) <= 2]

['th',
 's',
 'c',
 'de',
 'ai',
 'u',
 'la',
 'gt',
 'lt',
 'ii',
 'io',
 'nt',
 'e',
 'go',
 'd',
 'st',
 'el',
 'll',
 'x',
 'uk',
 'en',
 're',
 'da',
 'le',
 'r',
 'nd',
 'o',
 'fr',
 'rd',
 'p',
 'us',
 'g',
 'wg',
 'un',
 'se',
 'tc',
 'ca',
 'dr',
 've',
 'bi',
 'al',
 'v',
 'zu',
 'su',
 'em',
 'pi',
 'w',
 'j',
 'im',
 'db',
 'z',
 'ee',
 'ip',
 'et',
 'di',
 'f',
 'pc',
 'b',
 'iv',
 'l',
 'cs',
 'ad',
 'il',
 'm',
 'ml',
 'tv',
 'fl',
 'ui',
 'um',
 'a',
 'ci',
 'no',
 'si',
 'nv',
 'au',
 'lo',
 'ux',
 'ms',
 'hr',
 'vb',
 'ki',
 'na',
 'vi',
 'bc',
 'qu',
 'te',
 'n',
 'sa',
 'az',
 'h',
 'i',
 'tu',
 'k',
 'ie',
 'br',
 'cc',
 'in',
 'ce',
 'ny',
 'md',
 'es',
 'qt',
 'pa',
 'du',
 'xp',
 'pr',
 'mo',
 'ed',
 'ha',
 'er',
 'wo',
 'eg',
 'eu',
 'ab',
 'xi',
 'ac',
 'gi',
 'vr',
 'tx',
 'dc',
 'ia',
 'hi',
 'do',
 'mi',
 'pp',
 'so',
 'ic',
 'ar',
 'ss',
 'ho',
 'mr',
 'ao',
 'y',
 'bb',
 'qc',
 'vs',
 'wa',
 'it',
 'ou',
 'dy',
 'ro',
 't',
 'ga',
 'ob',
 'co',
 'os',
 'm

>On closer inspection, we see quite a few tokens that are just insignificant letters but are still treated as words. This may be due to the words present in the corpus. For now we keep these tokens: they exceed `min_count`, although they don't convey any crucial meaning, and training the model on them won't really be a loss.

In [48]:
# saving the word_2_vec model:
model.save('word2vec.model')

2024-04-12 16:54:41,068 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-04-12T16:54:41.068870', 'gensim': '4.3.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'saving'}
2024-04-12 16:54:41,069 : INFO : not storing attribute cum_table
2024-04-12 16:54:41,108 : INFO : saved word2vec.model


In [13]:
# reshape used here

In [14]:
# Average the word vectors of each record so that every title/description is
# represented by one fixed-length vector from the trained Word2Vec model.
def compute_vect_avg_per_title(title, keyed_vectors=None):
    """
    Convert a whitespace-separated text into the mean of its word vectors.

    title: str -> text whose words are looked up in the embeddings
    keyed_vectors: optional word-vector store exposing ``key_to_index``,
        ``vector_size`` and ``[]`` lookup; defaults to the notebook's ``model.wv``

    Returns a 1-D array of length ``vector_size``. Words missing from the
    vocabulary are skipped; if no word is known the result is all zeros.
    """
    wv = model.wv if keyed_vectors is None else keyed_vectors

    # key_to_index is a dict, so membership is O(1); testing against the
    # index_to_key list would rescan the whole vocabulary for every word.
    known_words = [word for word in title.split() if word in wv.key_to_index]
    if not known_words:
        return np.zeros(wv.vector_size)

    # Mean over the word axis -> one vector with vector_size entries.
    return np.mean([wv[word] for word in known_words], axis=0)

In [15]:
# testing
text = 'Software Engineering at Google Whatsapp'.lower() # word whatsapp is not in the word embeddings
vector_of_text = compute_vect_avg_per_title(text)
print(vector_of_text)
print(vector_of_text.shape)

[ 1.370818   -1.0694833  -1.0964926  -0.25689042 -0.9344121   0.664464
 -1.2221107  -0.84986365  0.46787512 -0.9254391  -0.4968486  -0.6944332
 -0.03208803 -0.38604578  0.72039104 -0.97543854  0.17758584 -0.6510717
  0.35143632  0.44471973  1.4220173   0.15618405 -1.2577767   0.07483812
  1.5140605  -0.28588712  0.840691    0.6038134  -1.1392772   0.3212682
 -0.18606186 -0.68729573  0.5511356   0.8037094   0.519916   -0.31383014
 -0.02471064 -0.01198322 -0.8928293   1.3433948  -0.81327623 -0.58396363
  0.22461534 -0.28625423 -0.6095779   0.05214882  0.86543405  1.8304185
 -0.12162143  0.8161264   1.0888351  -1.3377167  -0.8084633   0.2925049
  0.17323208 -1.5121115   1.8669765   1.0641829   0.8924782  -0.7277639
  0.7653433  -0.08432409 -0.2999396   0.11350508 -0.32218984  0.9651536
  0.20026053 -0.01927257  0.5469075   0.34006298 -0.08537775 -1.2293897
  0.54912424 -0.73939157 -0.14121383  0.23704648  0.38959038 -0.75292623
 -0.01316164  0.1071011  -1.1060356   0.9025285  -0.06530676 

In [16]:
df['vectorized_TSD'] = df['processed_TSD'].apply(compute_vect_avg_per_title)

In [17]:
df.head()

Unnamed: 0,Book_title,sub_title_book,description,merged_TSD,processed_TSD,vectorized_TSD
0,Software Engineering at Google,Lessons Learned from Programming Over Time,"Today, software engineers need to know not onl...",Software Engineering at Google Lessons Learned...,software engineering google lesson learned pro...,"[0.1280229, -0.07251548, -0.31772745, -0.14148..."
1,The Software Architect Elevator,Redefining the Architect's Role in the Digital...,As the digital economy changes the rules of th...,The Software Architect Elevator Redefining the...,software architect elevator redefining archite...,"[0.8097789, 0.26672482, 0.12788647, 0.32553503..."
2,Software Architecture Metrics,UNK,Software architecture metrics are key to the m...,Software Architecture Metrics UNK Software arc...,software architecture metric unk software arch...,"[0.6919463, -0.0041020163, -0.11713455, 0.4442..."
3,Designing Hexagonal Architecture with Java,Build maintainable and long-lasting applicatio...,UNK,Designing Hexagonal Architecture with Java Bui...,designing hexagonal architecture java build ma...,"[-0.00616907, 0.57498354, -0.23328787, 0.77813..."
4,OpenStack for Architects,Design production-ready private cloud infrastr...,UNK,OpenStack for Architects Design production-rea...,openstack architect design productionready pri...,"[1.6550393, -0.08286077, 0.35140345, 0.9059244..."


In [18]:
def visualize_embeddings(model, words):
    """
    Project the vectors of ``words`` to 2-D with t-SNE and show a labelled scatter plot.

    model: object returning a vector for ``model[word]``
    words: list of words to plot
    """
    vectors = np.array([model[w] for w in words])

    # Perplexity is set to one less than the number of plotted words.
    reducer = TSNE(n_components=2, random_state=0, perplexity=len(words) - 1)
    coords = reducer.fit_transform(vectors)

    plot_df = pd.DataFrame({'word': words, 'x': coords[:, 0], 'y': coords[:, 1]})

    scatter = px.scatter(
        plot_df,
        x='x',
        y='y',
        text='word',
        title='Word Embeddings Visualization',
    )
    scatter.update_traces(textposition='bottom right')
    scatter.show()


In [19]:
# Visually sanity-check the embeddings using the first 150 entries of index_to_key.
# NOTE(review): the reason for the 150-word cap is not stated here - presumably plot
# readability / t-SNE cost; confirm.
words = model.wv.index_to_key
visualize_embeddings(model.wv, words[:150]) # plot only the first 150 words

In [20]:
# df.to_csv('Vectorized.csv', index_label=False)

# Baseline Model

In [21]:
# remove this block of code as this will already exist when we merge all the files

# defining functions for each task

def lower_case(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # A translation table mapping each punctuation character to None removes it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    token_list = word_tokenize(text)
    return token_list

def remove_stopwords(text):
    """
    Drop NLTK English stopwords.

    text: iterable of tokens (despite the name, it is iterated item by item)
    Returns the tokens that are not stopwords, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in text:
        if token not in english_stopwords:
            kept.append(token)
    return kept

def lemmatize_text(tokens):
    """
    Lemmatize every token with NLTK's ``WordNetLemmatizer`` (default part of speech).

    tokens: iterable of str
    Returns a list with one lemma per input token, in the same order.
    """
    lemmer = WordNetLemmatizer()
    # lemmatize() returns the input word unchanged when WordNet has no lemma for
    # it (it never returns None), so a single call per token needs no fallback.
    return [lemmer.lemmatize(word) for word in tokens]

def remove_special_characters(tokens):
    """
    Strip every character outside A-Z / a-z from each token.

    tokens: iterable of str
    Returns the cleaned tokens in order. Tokens that end up empty (for example
    pure numbers) are dropped instead of being returned as '' entries.
    """
    stripped = (re.sub('[^A-Za-z]+', '', word) for word in tokens)
    return [word for word in stripped if word]

In [23]:
# Preprocess a user prompt and convert it to a single vector.
def get_vec(user_prompt):
    """
    Clean a free-text prompt and return the mean Word2Vec vector of its words.

    The prompt is lower-cased, stripped of punctuation, tokenized, filtered for
    stopwords, lemmatized and stripped of non-letter characters. Words that are
    not in the Word2Vec vocabulary are skipped and the vectors of the remaining
    words are averaged. If none of the words is known, a zero vector with the
    model's vector size is returned.
    """
    user_prompt = lower_case(user_prompt)            # lower the text
    clean_text = remove_punctuations(user_prompt)    # strip punctuation
    tokens = tokenize(clean_text)                    # tokenization
    clean_tokens = remove_stopwords(tokens)          # stopword removal
    lemmas = lemmatize_text(clean_tokens)            # lemmatization
    final = remove_special_characters(lemmas)        # keep letters only

    # key_to_index is a dict, so membership is O(1); testing against the
    # index_to_key list would rescan the whole vocabulary for every word.
    vocabulary = model.wv.key_to_index
    words = [word for word in final if word in vocabulary]
    if not words:
        # Size taken from the model rather than hard-coded.
        return np.zeros(model.wv.vector_size)

    # Mean over the word axis -> one vector with vector_size entries.
    return np.mean([model.wv[word] for word in words], axis=0)

# Base Model- Nearest Neighbours

In [24]:
from sklearn.neighbors import NearestNeighbors

>NearestNeighbors from scikit-learn is used to find the nearest neighbors of a given data point in an unsupervised manner.

In [25]:
nn_model = NearestNeighbors(n_neighbors=10, metric='cosine')

In [26]:
# Collect the per-record vectors into a plain list to fit the neighbour model on.
X = list(df['vectorized_TSD'])

In [27]:
# training the model 
nn_model.fit(X)

NearestNeighbors(metric='cosine', n_neighbors=10)

In [53]:
def recommend_books(user_prompt, model, df):
    """
    Recommend book titles for a free-text prompt.

    The prompt is converted to a vector with ``get_vec``; the neighbour model
    returns the positions of the closest records and their titles are looked
    up in ``df`` by position.

    Params:
    user_prompt: str -> the text the user enters
    model: fitted neighbour model exposing ``kneighbors`` (inside this function
        the name shadows the notebook-level Word2Vec ``model``)
    df: DataFrame with a ``Book_title`` column; must be in the same row order as
        the vectors the neighbour model was fitted on, since rows are selected
        by position

    Returns: list of titles, in the order returned by ``kneighbors``
    """
    # kneighbors expects a 2-D query (n_queries, n_features); -1 lets numpy infer
    # the feature count instead of hard-coding the embedding size.
    user_prompt_embedding = get_vec(user_prompt).reshape(1, -1)

    # Only the neighbour positions are needed; the distances are discarded.
    _, indices = model.kneighbors(user_prompt_embedding)

    # TODO: optionally post-filter titles by their overlap with a list of similar
    # words above a threshold (experimental idea, not implemented).
    return df['Book_title'].iloc[indices[0]].tolist()

In [46]:
# testing -1
user_prompt = 'hey i am looking for a book to learn python'
recommend_books(user_prompt, model= nn_model,df= df)

['Python for Professionals',
 'Python for Everyone',
 'Basic Python for Data Management, Finance, and Marketing',
 'Python Without Fear',
 'Introduction to Python Programming',
 'Clean Python',
 'PYTHON CODING',
 'Quick Python 3',
 'Head First Python',
 'Python All-in-One For Dummies']

In [49]:
# testing -2 
user_prompt = 'I love spirituality, can you recommend me a book for meditation'
recommend_books(user_prompt, nn_model, df)

['Aspects of Meditation Book 4',
 'The Meditation Bible',
 'Aspects of Meditation Book 2',
 'Meditations of the Heart',
 'The Mystery and Magic of Love',
 'The Way to Love',
 'Meditation Techniques For Beginners: The Daily Meditation Ritual Lifestyle: The Best Kept Secrets about Meditation Techniques, Meditation Exercises, Meditation Transcendental & Meditation Motivation',
 'The Joy of Meditating',
 'The Spiritual Guide to Attracting Love',
 'Essence of Mind']

In [51]:
# testing -3
# this prompt asks for the type of book which is not in the data we fed the model with.
user_prompt = 'I want to learn math for my exam, suggest me some good books for math'
recommend_books(user_prompt, nn_model, df)

['Math Adventures with Python',
 'Python for Teenagers',
 'Microsoft Excel 2019 Programming Pocket Primer',
 'Microsoft Excel 2019 Programming Pocket Primer',
 'Doing Math with Python',
 'Quick JavaScript',
 'Machine Learning with Python for Everyone',
 'Quick Python 3',
 'Microsoft Access 2021 Programming Pocket Primer',
 'Microsoft Access 2021 Programming Pocket Primer']

In [52]:
# testing -4
user_prompt = 'How to learn investing in market'
recommend_books(user_prompt, nn_model, df)

['Profiting With Forex',
 'The Neatest Little Guide to Stock Market Investing',
 'Volatile Markets Made Easy',
 'Building Wealth in the Stock Market',
 'Market Neutral Investing',
 'Stocks for the Long Run: The Definitive Guide to Financial Market Returns & Long-Term Investment Strategies, Sixth Edition',
 'The Daily Telegraph Guide to Investing',
 'Commodities for Every Portfolio',
 'Trading and Investing in the Forex Markets Using Chart Techniques',
 'Clever Girl Finance']