### Stemming

In [31]:
# porter stemmer

import nltk
from nltk.stem import PorterStemmer

raw_text = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""

print(raw_text)
print('\n')

porter_stemmer = nltk.PorterStemmer()
tokens = nltk.word_tokenize(raw_text)
stemmed_words = [porter_stemmer.stem(w) for w in tokens]
print(stemmed_words)
print('\n')


stemmed_text = " ".join(stemmed_words)    
print(stemmed_text)

DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony.


['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']


denni : listen , strang women lie in pond distribut sword is no basi for a system of govern . suprem execut power deriv from a mandat from the mass , not from some farcic aquat ceremoni .


In [33]:
# Snowballstemmer

import nltk
from nltk.stem.snowball import SnowballStemmer

raw_text2 = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""

print(raw_text2)
print('\n')

#the stemmer requires a language parameter
snow_stemmer = SnowballStemmer(language='english')

tokens = nltk.word_tokenize(raw_text1)
stemmed_words = [snow_stemmer.stem(w) for w in tokens]
print(stemmed_words)
print('\n')


stemmed_text = " ".join(stemmed_words)
print(stemmed_text)

DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony.


['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']


denni : listen , strang women lie in pond distribut sword is no basi for a system of govern . suprem execut power deriv from a mandat from the mass , not from some farcic aquat ceremoni .


In [32]:
# lancasterstemmer

import nltk
from nltk.stem.lancaster import LancasterStemmer

raw_text1 = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""

print(raw_text1)
print('\n')

lancaster_stemmer = nltk.LancasterStemmer()
tokens = nltk.word_tokenize(raw_text1)
stemmed_words = [lancaster_stemmer.stem(w) for w in tokens]
print(stemmed_words)
print('\n')


stemmed_text = " ".join(stemmed_words)    
print(stemmed_text)

DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony.


['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']


den : list , strange wom lying in pond distribut sword is no bas for a system of govern . suprem execut pow der from a mand from the mass , not from som farc aqu ceremony .


### Lemmatization

In [34]:
raw_text = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""
raw_text

'DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony.'

In [35]:
import nltk
from nltk.stem import WordNetLemmatizer 
#nltk.download('wordnet')

tokens = nltk.word_tokenize(raw_text)
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in tokens]
print(lemmatized_words)
print('\n')


lemmatized_text = " ".join(lemmatized_words)
print(lemmatized_text)

['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


DENNIS : Listen , strange woman lying in pond distributing sword is no basis for a system of government . Supreme executive power derives from a mandate from the mass , not from some farcical aquatic ceremony .


In [36]:
raw_text2 = "The striped bats are hanging on their feet for best."

tokens = nltk.word_tokenize(raw_text2)
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in tokens]
print(lemmatized_words)
print('\n')


lemmatized_text = " ".join(lemmatized_words)
print(lemmatized_text)

['The', 'striped', 'bat', 'are', 'hanging', 'on', 'their', 'foot', 'for', 'best', '.']


The striped bat are hanging on their foot for best .


# Information Retrieval System

### Information retrieval system based on ranked retrieval

In [40]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from os import path
from nltk.stem import WordNetLemmatizer,PorterStemmer


porter_stemmer = nltk.PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [41]:
# reading documents
df = pd.read_csv('WordsDataset.csv')
df.head(20)

Unnamed: 0,docID,words
0,0,"Hiker, demon, creepy, scary, tunnel, stalk"
1,1,"Batman, batman beyond, who are you, narrows it..."
2,2,"Up, carl, russell, honor, award, scout badge, ..."
3,3,"Tom, jerry, sword, stab, dont care, cartoon, show"
4,4,"Wholesome, comic, dialogue bubble, dog, sleepi..."
5,5,"Doug dimmadome, chef hat, long, fast food, res..."
6,6,"Empty town, comparison, bustling city, contrad..."
7,7,"Lord of the rings, lotr, gandalf, pipip, sendi..."
8,8,"Geralt, yennefer, pointing, blame, slapstick, ..."
9,9,"Goofy, college, max ,shock, surprise, reveal, ..."


The docID column has been changed to now represent a document by 'D' followed by the number.

In [42]:
df.docID = pd.Series(["D"+str(ind) for ind in df.docID])
df.head(5)

Unnamed: 0,docID,words
0,D0,"Hiker, demon, creepy, scary, tunnel, stalk"
1,D1,"Batman, batman beyond, who are you, narrows it..."
2,D2,"Up, carl, russell, honor, award, scout badge, ..."
3,D3,"Tom, jerry, sword, stab, dont care, cartoon, show"
4,D4,"Wholesome, comic, dialogue bubble, dog, sleepi..."


Now, the words column is cleaned by the following steps:
- Remove punctuations
- Lower case
- Strip whitespaces
- Remove stopwords
- Stemming

In [43]:
df.words = df.words.str.replace(","," ")
df.words = df.words.str.replace(r'\W',' ')
df.words = df.words.str.strip().str.lower()
print(df.words)

0            hiker  demon  creepy  scary  tunnel  stalk
1     batman  batman beyond  who are you  narrows it...
2     up  carl  russell  honor  award  scout badge  ...
3     tom  jerry  sword  stab  dont care  cartoon  show
4     wholesome  comic  dialogue bubble  dog  sleepi...
5     doug dimmadome  chef hat  long  fast food  res...
6     empty town  comparison  bustling city  contrad...
7     lord of the rings  lotr  gandalf  pipip  sendi...
8     geralt  yennefer  pointing  blame  slapstick  ...
9     goofy  college  max  shock  surprise  reveal  ...
10    gordon ramsay  pepto bismol  patrick  feeding ...
11    groot  gunpoint  force  surreal  movie  despic...
12                 having enough  jump  slapstick  fall
13                                       cat  possessed
14                            hotdog  dog  many options
15    jedi  master  lightsaber  block  unexpected  m...
16    joker  you think this is funny  laugh  reactio...
17                 dark  kids  swing  police  de

  


Vocabulary (all uniqye words in the documents) are collected into a set as below. This set of vocabulary is used to match with the query.

In [44]:
# print(df.tags.values)

all_text = " ".join(df.words.values)
vocab = np.unique(word_tokenize(all_text))
vocab = [word for word in vocab if word not in stopwords.words('english')]

print(vocab)

['2', '25', '4', 'alone', 'animated', 'anime', 'announce', 'answer', 'arm', 'award', 'badge', 'batman', 'battle', 'bean', 'bear', 'bench', 'better', 'beyond', 'big', 'bigger', 'bismol', 'biting', 'black', 'blame', 'blessing', 'block', 'body', 'bounds', 'brain', 'broom', 'bubble', 'bustling', 'callmecarson', 'cardboard', 'care', 'carl', 'carrying', 'cartoon', 'cast', 'cat', 'change', 'charlie', 'chef', 'city', 'classic', 'cody', 'college', 'combine', 'comic', 'comparison', 'contradict', 'contradictory', 'creative', 'creepy', 'crossover', 'crusade', 'crying', 'cutout', 'dancing', 'dark', 'dead', 'demon', 'despicable', 'destroy', 'dialogue', 'dimmadome', 'dislike', 'diver', 'dog', 'dont', 'doug', 'dragonballz', 'draw', 'drink', 'employee', 'empty', 'enough', 'excuse', 'face', 'fall', 'fallout', 'fast', 'feeding', 'ferb', 'fight', 'fire', 'fit', 'food', 'fool', 'football', 'forbidden', 'force', 'fortress', 'freeze', 'frozone', 'funny', 'fusion', 'game', 'gandalf', 'geralt', 'girl', 'god', 

In [45]:
print(len(vocab))

272


A term-document-matrix is a mapping of every word in the vocabulary to document. Every document is converted to a vector corresponding to frequency of each word appearing in that document.

In [46]:
def term_document_matrix(data, vocab= None, document_index= 'ID', text= 'text'):
    """Calculate frequency of term in the document.
    
    parameter: 
        data: DataFrame. 
        Frequency of word calculated against the data.
        
        vocab: list of strings.
        Vocabulary of the documents    
        
        document_index: str.
        Column name for document index in DataFrame passed.
        
        text: str
        Column name containing text for all documents in DataFrame,
        
    returns:
        vocab_index: DataFrame.
        DataFrame containing term document matrix.
        """
    
    vocab_index = pd.DataFrame(columns=df[document_index], index= vocab).fillna(0)
    
    for word in vocab_index.index:
        
        for doc in data[document_index]:
            
            freq = data[data[document_index] == doc][text].values[0].count(word)
            vocab_index.loc[word,doc] = freq
    
    return vocab_index

In [47]:
similarity_index = term_document_matrix(df,vocab,'docID','words')
similarity_index

docID,D0,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D44,D45,D46,D47,D48,D49,D50,D51,D52,D53
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
animated,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
woman,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
women,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
wwe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yennefer,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


From the term-document-matrix the inverse-document-frequncy is calculated for every document. Using the term-frequencies (tf) and the inverse-document-frequency (idf) the tf-idf score for every word in every document is computed below.

In [48]:
def tf_idf_score(vocab_index, document_index, inv_df= 'inverse_document_frequency'):
    """
    Calculate tf-idf score for vocabulary in documents
    
    parameter:
        vocab_index: DataFrame.
        Term document matrix.
        
        document_index: list or tuple.
        Series containing document ids.
        
        inv_df: str.
        Name of the column with calculated inverse document frequencies.
        
    returns:
        vocab_index: DataFrame.
        DataFrame containing term document matrix and document frequencies, inverse document frequencies and tf-idf scores
    """
    total_docx = len(document_index)
    vocab_index['document_frequency'] = vocab_index.sum(axis= 1)
    vocab_index['inverse_document_frequency'] = np.log2( total_docx / vocab_index['document_frequency'])
    
    for word in vocab_index.index:
        
        for doc in document_index:
            
                tf_idf = np.log2(1 + vocab_index.loc[word,doc]) * np.log2(vocab_index.loc[word][inv_df])
                vocab_index.loc[word,'tf_idf_'+doc] = tf_idf
    
    return vocab_index

In [49]:
similarity_index = tf_idf_score(similarity_index, df.docID.values)
similarity_index

docID,D0,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,tf_idf_D44,tf_idf_D45,tf_idf_D46,tf_idf_D47,tf_idf_D48,tf_idf_D49,tf_idf_D50,tf_idf_D51,tf_idf_D52,tf_idf_D53
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
25,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
alone,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
animated,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
woman,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.524788,0.000000,0.0
women,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2.524788,0.0
wwe,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
yennefer,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0


The calculation of the above is computation intensive and is only required to do so once unless the database containing documents changes. Thus the above dataframe containing all tf, idfs and the tf-idf scores are saved.

In [50]:
similarity_index.to_csv('term_doc_matrix.csv')

In [51]:
test= pd.read_csv('term_doc_matrix.csv')
test = test.set_index('Unnamed: 0')

A huge assumption which is true is that the user will not input the query in a set format. Hence on the same lines as cleaning the tags, the query is also cleaned :

- Remove punctutations
- Lower case
- Remove whitespaces
- Remove stop words
- Stemming

In [52]:
def query_processing(query):
    """
    Pre-processing query to accomodate calculations for tf-idf score
    
    parameter:
        query: str.
        Textual query input to the system.
        
    returns:
        query: str.
        Cleaned string.
        """
    query= re.sub('\W',' ',query)
    query= query.strip().lower()
    query= " ".join([word for word in query.split() if word not in stopwords.words('english')])
    
    return query

In [53]:
query = "movie for kids"
query_processing(query)

'movie kids'

For every term in the query, if it exists in the vocabulary, then its tf-idf score is calculated and appended to the matrix.

In [54]:
def query_score(vocab_index, query):
    """
    Calculate tf-idf score for query terms
    
    parameter:
        vocab_index: DataFrame.
        Term document matrix with inverse document frequency and term frequencies calculated.
        
        query: str.
        Query submitted to the system
        
    returns:
        vocab_index: DataFrame.
        Term document matrix with tf-idf scores for terms per document and query terms.
    """
    for word in np.unique(query.split()):
        
        freq = query.count(word)
        
        if word in vocab_index.index:
            
            tf_idf = np.log2(1+freq) * np.log2(vocab_index.loc[word].inverse_document_frequency)
            vocab_index.loc[word,"query_tf_idf"] = tf_idf
            vocab_index['query_tf_idf'].fillna(0, inplace=True)
    
    return vocab_index

In [55]:
query= "movie for kids"
similarity_index = query_score(test,query)
similarity_index

Unnamed: 0_level_0,D0,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,tf_idf_D45,tf_idf_D46,tf_idf_D47,tf_idf_D48,tf_idf_D49,tf_idf_D50,tf_idf_D51,tf_idf_D52,tf_idf_D53,query_tf_idf
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
25,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
alone,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
animated,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
woman,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.524788,0.000000,0.0,0.0
women,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2.524788,0.0,0.0
wwe,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
yennefer,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


The last step is to find the cosine similarities between the query and documents. The cosine similarity determines how similar the two vectors are - Document vector and the query vector.

In [56]:
def cosine_similarity(vocab_index, document_index, query_scores):
    """
    Calculates cosine similarity between the documents and query
    
    parameter:
        
        vocab_index: DataFrame.
        DataFrame containing tf-idf score per term for every document and for the query terms.
        
        document_index: list.
        List of document ids.
        
        query_scores: str.
        Column name in DataFrame containing query term tf-idf scores.
        
    returns:
        cosine_scores: Series.
        Cosine similarity scores of every document.
    """
    cosine_scores = {}
    
    query_scalar = np.sqrt(sum(vocab_index[query_scores] ** 2))
    
    for doc in document_index:
        
        doc_scalar = np.sqrt(sum(vocab_index[doc] ** 2))
        dot_prod = sum(vocab_index[doc] * vocab_index[query_scores])
        cosine = (dot_prod / (query_scalar * doc_scalar))
        
        cosine_scores[doc] = cosine
        
    return pd.Series(cosine_scores)

In [57]:
cosines = cosine_similarity(similarity_index, df.docID.values, 'query_tf_idf')
cosines

D0     0.000000
D1     0.000000
D2     0.411276
D3     0.000000
D4     0.000000
D5     0.000000
D6     0.000000
D7     0.175137
D8     0.000000
D9     0.000000
D10    0.000000
D11    0.202231
D12    0.000000
D13    0.000000
D14    0.000000
D15    0.175137
D16    0.202231
D17    0.354640
D18    0.156647
D19    0.187230
D20    0.000000
D21    0.142999
D22    0.000000
D23    0.000000
D24    0.000000
D25    0.000000
D26    0.000000
D27    0.000000
D28    0.000000
D29    0.000000
D30    0.000000
D31    0.000000
D32    0.000000
D33    0.000000
D34    0.000000
D35    0.000000
D36    0.000000
D37    0.000000
D38    0.000000
D39    0.000000
D40    0.000000
D41    0.000000
D42    0.165121
D43    0.000000
D44    0.000000
D45    0.000000
D46    0.000000
D47    0.000000
D48    0.000000
D49    0.000000
D50    0.165121
D51    0.000000
D52    0.000000
D53    0.000000
dtype: float64

Once the cosine score for every document with the query is calculated. The documents are ranked with respect to their score. The top 'k' documents, here 10, are retrieved in the form of indices.

In [58]:
def retrieve_index(data,cosine_scores, document_index):
    """
    Retrieves indices for the corresponding document cosine scores
    
    parameters:
        data: DataFrame.
        DataFrame containing document ids and text.
        
        cosine_scores: Series.
        Series containing document cosine scores.
        
        document_index: str.
        Column name containing document ids in data.
        
    returns:
        data: DataFrame.
        Original DataFrame with cosine scores added as column.
    """
    
    data = data.set_index(document_index)
    data['scores'] = cosine_scores
    
    return data.reset_index().sort_values('scores',ascending=False).head(10).index

In [59]:
indices = retrieve_index(df, cosines, 'docID')
indices

Int64Index([2, 17, 16, 11, 19, 15, 7, 50, 42, 18], dtype='int64')

These indices are the top 10 most relevant according to the query.  
The function below, summarizes, by calling, all the above written individual functions. Only the below functions needs to be called to run the system and retrieve indices for a query.

In [60]:
def information_system(query):
    """
    Perform a retrieval from the indexes based on the query 
    and return the document ids that are similar to the query
    
    paramters:
        query: str.
        Query submitted to the system.
        
    returns:
        indices: list.
        List of document indices which are most relevant to the query.
    """
    
    df = pd.read_csv('TagsDatabase.csv')

 
    df.docID = pd.Series(["D"+str(ind) for ind in df.docID])

    df.words = df.words.str.replace(","," ")
    df.words = df.words.str.replace(r'\W',' ')
    df.words = df.words.str.strip().str.lower()
    
    if not path.exists('term_doc_matrix.csv'):    

        all_text = " ".join(df.words.values)
        vocab = np.unique(word_tokenize(all_text))
        vocab = [word for word in vocab if word not in stopwords.words('english')]

        similarity_index = term_document_matrix(df,vocab,'docID','words')
        similarity_index = tf_idf_score(similarity_index, df.docID.values)
        
    else:
        similarity_index = pd.read_csv('term_doc_matrix.csv')
        similarity_index = similarity_index.set_index('Unnamed: 0')
        
    query = query_processing(query)
    similarity_index = query_score(similarity_index,query)
    
    cosines = cosine_similarity(similarity_index, df.docID.values, 'query_tf_idf')
    indices = retrieve_index(df, cosines, 'docID')
    
    return list(indices)

In [61]:
information_system('movie for kids')

#information_system('lack of intelligence')



[2, 17, 16, 11, 19, 7, 15, 42, 50, 18]