In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import sys, os
# Switch the working directory to sys.path[0] so that the relative paths used
# below ("books", "stopwords.txt") are resolved from there.
# NOTE(review): presumably sys.path[0] is the notebook's own directory; that
# depends on how the kernel was launched — confirm before relying on it.
os.chdir(sys.path[0])

## Pre-processing (punctuation, stopwords, lemmatization)

In [2]:
lemmatizer = WordNetLemmatizer()

# os.listdir() returns entries in arbitrary order; sorting keeps the
# row <-> title mapping identical from one run / machine to the next.
books = sorted(os.listdir("books"))

# Context managers guarantee the file handles are closed after reading.
with open('stopwords.txt', 'r') as stopword_file:
    stopwords = stopword_file.read().split()

# Set copy for O(1) membership tests inside the per-token loop.
stopword_set = set(stopwords)

# Characters stripped from both ends of every whitespace-separated token.
strip_chars = '.,;:?!\\|/[]_£$€%&#+*°“"├ª®º╝·=()_\ufeff'

# One cleaned string per book, in the same order as `books`.
final = []

for book in books:
    with open(os.path.join('books', book), mode='r', encoding='utf-8') as book_file:
        tokens = book_file.read().lower().split()

    kept_words = []
    for token in tokens:
        word = token.strip(strip_chars)
        if word not in stopword_set:
            kept_words.append(lemmatizer.lemmatize(word))

    # Each kept word is followed by a single space (same layout as before),
    # but joined once instead of growing a string token by token.
    final.append(''.join(word + ' ' for word in kept_words))

In [5]:
# Report how many cleaned documents ended up in the corpus.
corpus_size = len(final)
print('total corpus: ', corpus_size)

total corpus:  52


# Latent Semantic Analysis

### 1. Document-Term Matrix

In [6]:
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_df=0.8,
    min_df=2,
    stop_words="english",
)

# TF-IDF weighted document-term matrix (sparse): one row per entry of `final`.
vectors = vectorizer.fit_transform(final)

# Column index -> term. get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2; prefer the replacement and fall back on old versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Dense copies of the matrix, computed once and reused below.
dense = vectors.todense()
denselist = dense.tolist()

# For every book, the terms that carry a positive TF-IDF weight.
all_keywords = [
    [term for term, weight in zip(feature_names, row) if weight > 0]
    for row in denselist
]

df = pd.DataFrame(data=dense, columns=feature_names)

# Row position -> display title derived from the file name.
titles = {
    position: book.replace('-', ' ').replace('.txt', '').title()
    for position, book in enumerate(books)
}

# Terms as rows, books (by title) as columns.
document_term_matrix = df.T.rename(columns=titles)
print(len(document_term_matrix))

# document_term_matrix.to_csv(r'matrix_v2.csv')

33159


### 2. Singular Value Decomposition

In [7]:
from sklearn.decomposition import TruncatedSVD

# Number of latent topics; the column labels below are derived from it so the
# three DataFrames built in this cell cannot drift apart.
n_topics = 3
topic_columns = ['topic' + str(k) for k in range(1, n_topics + 1)]

# TruncatedSVD uses a randomized solver by default; pinning random_state makes
# the topic coordinates (and everything derived from them) reproducible.
svd = TruncatedSVD(n_components=n_topics, random_state=42)
lsa = svd.fit_transform(vectors)

topic_encoded_df = pd.DataFrame(lsa, columns=topic_columns)
# Materialise the dict view into a list before assigning it as a column.
topic_encoded_df['title'] = list(titles.values())
display(topic_encoded_df[['title'] + topic_columns].head(10))

# Same coordinates without the title column, used for cosine similarity.
encoded_for_cosine = pd.DataFrame(lsa, columns=topic_columns)

titles.values()

Unnamed: 0,title,topic1,topic2,topic3
0,Alice In Wonderland,0.115088,0.226298,0.078891
1,Among The Forest People,0.234347,0.296164,0.137414
2,An Introductory Course Of Quantitative Chemica...,0.138241,-0.270541,0.538068
3,Curious Myths Of The Middle Ages,0.519458,0.012637,-0.028305
4,Democracy In America,0.53975,-0.33419,-0.280155
5,Experiments And Observations,0.182559,-0.218402,0.394938
6,Formation Of The Union 1750 1829,0.414639,-0.278469,-0.341966
7,Histories Of Two Hundred And Fifty One Divisio...,0.13076,-0.073247,-0.012782
8,History Of King Charles,0.330135,-0.029929,-0.204423
9,History Of Phosphorus,0.178261,-0.258615,0.456015


dict_values(['Alice In Wonderland', 'Among The Forest People', 'An Introductory Course Of Quantitative Chemical Analysis', 'Curious Myths Of The Middle Ages', 'Democracy In America', 'Experiments And Observations', 'Formation Of The Union 1750 1829', 'Histories Of Two Hundred And Fifty One Divisions Of The German Army', 'History Of King Charles', 'History Of Phosphorus', 'How The Flag Became Old Glory', 'Little Lord Fauntleroy', 'Medieval People', 'Mother Stories', 'Narrative And Critical History Of America', 'O Pioneers', 'Peter Pan', 'Prince Prigio', 'Sandman_S Goodnight Stories', 'The Chemistry Of Cookery', 'The Complete Herbal', 'The Eighteenth Brumaire Of Louis Bonaparte', 'The Elements Of Blowpipe Analysis', 'The Fauna Of The Deep Sea', 'The Foundations Of The Origin Of Species', 'The French Revolution', 'The Gases Of The Atmosphere The History Of Their Discovery By William Ramsay', 'The Great Republic', 'The Handbook Of Soap Manufacture', 'The History Of England From The Accessi

## Top 10 Words per Topic

In [8]:
import numpy as np

# Column index -> term. get_feature_names() was removed in scikit-learn 1.2;
# use the replacement when available and fall back on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    dictionary = vectorizer.get_feature_names_out().tolist()
else:
    dictionary = vectorizer.get_feature_names()

# One label per fitted SVD component ('topic1', 'topic2', ...).
topic_labels = ['topic' + str(k + 1) for k in range(svd.components_.shape[0])]

# Rows are terms, columns are the per-topic loadings.
encoding_matrix = pd.DataFrame(svd.components_, index=topic_labels).T
encoding_matrix["terms"] = dictionary

# Absolute loadings: a term matters for a topic whatever the sign of its weight.
for label in topic_labels:
    encoding_matrix['abs_' + label] = np.abs(encoding_matrix[label])

# Print the ten terms with the largest absolute loading for each topic.
for i, label in enumerate(topic_labels, start=1):
    top10 = encoding_matrix.sort_values('abs_' + label, ascending=False).head(10)
    top = "Topic " + str(i) + ": " + ''.join(word + ' ' for word in top10['terms'])
    print(top)

Topic 1: king prince boy government american mother war sidenote law nation 
Topic 2: mother boy acid fairy princess alice prince peter king queen 
Topic 3: acid solution sidenote oil cc government temperature gram soap food 


## Books per Topic

In [9]:
tp = topic_encoded_df
bag1, bag2, bag3 = list(), list(), list()
bags = [bag1, bag2, bag3]

# Assign each book to the topic with the strictly largest coordinate.
# Iterating over the frame itself (instead of a hard-coded range(0, 52))
# keeps this correct when the number of books changes.
for title, t1, t2, t3 in zip(tp['title'], tp['topic1'], tp['topic2'], tp['topic3']):
    if t1 > t2 and t1 > t3:
        bag1.append(title)
    elif t2 > t1 and t2 > t3:
        bag2.append(title)
    else:
        # topic3 wins outright, or there is a tie for the maximum.
        bag3.append(title)

for number, bag in enumerate(bags, start=1):
    print('Topic ' + str(number))
    print(bag)
    print('\n')
    print('\n')

Topic 1
['Curious Myths Of The Middle Ages', 'Democracy In America', 'Formation Of The Union 1750 1829', 'Histories Of Two Hundred And Fifty One Divisions Of The German Army', 'History Of King Charles', 'How The Flag Became Old Glory', 'Little Lord Fauntleroy', 'Medieval People', 'Narrative And Critical History Of America', 'O Pioneers', 'Prince Prigio', 'The Complete Herbal', 'The Eighteenth Brumaire Of Louis Bonaparte', 'The Fauna Of The Deep Sea', 'The Foundations Of The Origin Of Species', 'The French Revolution', 'The Great Republic', 'The History Of England From The Accession Of James Ii', 'The History Of The Decline And Fall Of The Roman Empire', 'The Last Leaf', 'The Magic Of Oz', 'The Progress Of Invention In The Nineteenth', 'The Ruins', 'The Threefold Commonwealth', 'The United States Of America Part I', 'The White Feather', 'Tiger And Tom And Other Stories For Boys', 'Tom Sawyer Abroad', 'War And Peace']


Topic 2
['Alice In Wonderland', 'Among The Forest People', 'Mother S

## Cosine Similarity

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the topic-space rows of all books;
# entry (a, b) is the similarity between book a and book b.
similarity_values = cosine_similarity(encoded_for_cosine, dense_output=True)
cs = pd.DataFrame(similarity_values)
cs.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
0,1.0,0.981167,-0.017436,0.436656,-0.193029,0.021197,-0.262697,-0.063392,0.145785,-0.013809,...,0.928499,-0.106732,0.090945,-0.224971,0.792231,0.990414,0.999564,0.90853,0.866102,0.772131
1,0.981167,1.0,0.105543,0.581378,-0.039385,0.165445,-0.13275,0.118999,0.258194,0.125129,...,0.957935,0.077038,0.226285,-0.081919,0.880984,0.997868,0.9862,0.956663,0.931317,0.85354
2,-0.017436,0.105543,1.0,0.165327,0.033317,0.986727,-0.137269,0.33368,-0.233777,0.993635,...,0.354973,0.351269,0.988642,-0.047938,0.102738,0.046515,-0.006041,0.008676,0.046037,-0.004146
3,0.436656,0.581378,0.165327,1.0,0.786731,0.31931,0.703332,0.860544,0.872868,0.265224,...,0.47784,0.835637,0.303098,0.751787,0.894905,0.55167,0.46248,0.76919,0.826755,0.899722
4,-0.193029,-0.039385,0.033317,0.786731,1.0,0.18022,0.982464,0.945,0.908306,0.142853,...,-0.159002,0.94399,0.114406,0.996112,0.438013,-0.069109,-0.165893,0.234425,0.321204,0.474389
5,0.021197,0.165445,0.986727,0.31931,0.18022,1.0,0.004951,0.475497,-0.073463,0.998347,...,0.387631,0.48961,0.996133,0.097539,0.226382,0.103305,0.036729,0.109275,0.157003,0.125818
6,-0.262697,-0.13275,-0.137269,0.703332,0.982464,0.004951,1.0,0.867474,0.912547,-0.029869,...,-0.287718,0.866856,-0.064852,0.995065,0.34701,-0.152298,-0.238205,0.155928,0.237287,0.402256
7,-0.063392,0.118999,0.33368,0.860544,0.945,0.475497,0.867474,1.0,0.818913,0.437296,...,0.073354,0.998868,0.421801,0.912514,0.552153,0.073717,-0.033949,0.343022,0.432241,0.552012
8,0.145785,0.258194,-0.233777,0.872868,0.908306,-0.073463,0.912547,0.818913,1.0,-0.123486,...,0.05844,0.799011,-0.115457,0.915116,0.664702,0.247486,0.169285,0.527989,0.589336,0.721599
9,-0.013809,0.125129,0.993635,0.265224,0.142853,0.998347,-0.029869,0.437296,-0.123486,1.0,...,0.357047,0.453488,0.994474,0.061125,0.172319,0.0635,0.00058,0.058779,0.104653,0.070138


In [12]:
def best_recommended_books(book, nbr):
    """Return the `nbr` books most similar to `book`.

    Reads two notebook-level objects: `tp` (its 'title' column holds the book
    titles) and `cs` (square cosine-similarity DataFrame whose row/column
    labels match the index of `tp`).

    Parameters
    ----------
    book : str
        Title to look up; matched case-insensitively, ignoring surrounding
        whitespace.
    nbr : int
        Maximum number of recommendations wanted. Values <= 0 yield an empty
        list; values larger than the number of other books are capped.

    Returns
    -------
    list of (title, cosine) tuples, most similar first. Empty when the title
    is unknown.
    """
    wanted = book.strip().lower()
    titles = tp['title']

    for idx, title in titles.items():
        if title.lower() != wanted:
            continue

        # Remove the query book by label instead of assuming it sorts first:
        # another book tied at similarity 1.0 could otherwise push the query
        # itself into the recommendations. A stable sort keeps ties in a
        # deterministic order.
        ranked = (
            cs[idx]
            .drop(idx)
            .sort_values(ascending=False, kind='mergesort')
            .head(max(nbr, 0))  # head() also caps nbr at the rows available
        )
        # Only the first matching title is used.
        return [(titles[other], score) for other, score in ranked.items()]

    return []

# Ask for a title and a count, then show the matching recommendations.
book = input('Enter Book: ')
nReco = int(input('Number of recommendations: '))
recommendations = best_recommended_books(book, nReco)
display(recommendations)

Enter Book: history of phosphorus
Number of recommendations: 2


[('The Gases Of The Atmosphere The History Of Their Discovery By William Ramsay',
  0.9994705159253809),
 ('The Phase Rule And Its Applications', 0.9985759228743006)]

In [None]:
# # to print all titles

# for title in titles.values():
#     print(title)