In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
import re
from nltk.stem import PorterStemmer
import os

In [2]:
#define a stemmer
def stemming(words):
    words = re.sub('[^A-Za-z0-9]+', ' ', words)
    words = nltk.word_tokenize(words.lower())
    ps = PorterStemmer() 
    stems = str([ps.stem(term) for term in words])
    return stems

In [3]:
#read files in the folder into one dictionary
def read_folder(folder_path):
    dirs = os.listdir(folder_path)
    file_dict = {}
    for file in dirs:
        #filter out trash file 
        if 'html' in file:
            file = folder_path + file
            with open (file,'r') as md_file:
                content = md_file.read()
            file_dict[os.path.basename(file)] = content 
            
    return file_dict
    
file_dict = read_folder('/Users/mac/Documents/I427/pa2/file/')

In [4]:
len(file_dict)

50

In [5]:
for key, value in file_dict.items():
    file_dict[key] = stemming(value)

In [6]:
# fit the data to the tf-idf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english',tokenizer=stemming)
tfidf_vectorizer = TfidfVectorizer()
sparse_matrix = tfidf_vectorizer.fit_transform(file_dict.values())

# display the output in dataframe
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=tfidf_vectorizer.get_feature_names(),
                  index = file_dict.keys())
df

Unnamed: 0,000,0002,00322,00488,00488281,00488c0,005c5c,006298,0085,0125,...,zeta,zhu,zimm,znoebhff,zobaqrff,zoher,zoo,zoom,zouggla,zrzylapu
23.html,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.015764,0.0,0.002637,0.0,0.0
35.html,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.html,0.003392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19.html,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009134,0.0,0.0
39.html,0.00321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.html,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15.html,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42.html,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43.html,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14.html,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#calculate tf-idf score for each document and sort the values in descending order
tf_idf = dict(df.sum(axis = 1).sort_values(ascending=False))
tf_idf
dict(list(tf_idf.items())[0: 5])  

{'0.html': 16.021481080347808,
 '7.html': 9.67636826323058,
 '47.html': 9.655272381887698,
 '40.html': 9.272131077575324,
 '23.html': 9.168296342271427}

In [8]:
#read the dat file which contains the url name
index_dat = pd.read_csv('file/index.dat', index_col=0)
index_dat.head()

Unnamed: 0,url
0,https://www.indiana.edu
1,https://luddy.indiana.edu/about/index.html
2,https://luddy.indiana.edu/about/vision-mission...
3,https://luddy.indiana.edu/about/leadership.html
4,https://luddy.indiana.edu/about/diversity/inde...


In [10]:
#convert '0.html' to real name
mapping_dict = dict(index_dat['url'])
for key in tf_idf.keys():
    for kk2 in mapping_dict.keys():
        if str(kk2) + '.html' == key:
            tf_idf[mapping_dict[kk2]] = tf_idf.pop(key) 

In [11]:
dict(list(tf_idf.items())[0: 5])

{'https://www.indiana.edu': 16.021481080347808,
 'https://luddy.indiana.edu/about/advisory-council.html': 9.67636826323058,
 'https://luddy.indiana.edu/research/index.html': 9.655272381887698,
 'https://luddy.indiana.edu/academics/innovation-entrepreneurship/index.html': 9.272131077575324,
 'https://luddy.indiana.edu/admissions/student-ambassadors.html': 9.168296342271427}

In [16]:
#convert '0.html' to real name
for key in file_dict.keys():
    for kk2 in mapping_dict.keys():
        if str(kk2) + '.html' == key:
            file_dict[mapping_dict[kk2]] = file_dict.pop(key) 

In [18]:
#define search common words function
lst = []
sub_dict = {}
ranking_dict = {}

def search_words(file_dict, tf_idf, df):
    #get user input as a list
    user_input = list(input("Please enter the term(s) you want to search: ").strip().split())
    #check if all input terms exist in the dict
    if all(word in df.columns for word in user_input):
        for word in user_input:
            for key, value in file_dict.items():
                if word in value:
                    lst.append(key)
        #print(lst)
        #check if there is document contain all the search terms
        result = set([x for x in lst if lst.count(x) > 1])
        if result: 
            for file in result:
                sub_dict[file] = tf_idf[file]

            # display the result in descending order
            for k in sorted(sub_dict, key=sub_dict.get, reverse=True):
                print(k, sub_dict[k])
        else:
            print("Common pages don't exist.")
    else: 
        print("Invalid input! Please try again.")

In [14]:
#e.g.
search_words(file_dict, tf_idf, df)

Please enter the term(s) you want to search: cdscdc support
Invalid input! Please try again.


In [19]:
#e.g.
search_words(file_dict, tf_idf, df)

Please enter the term(s) you want to search: support sure
https://luddy.indiana.edu/academics/undergraduate-programs/index.html 8.490372600919274
https://luddy.indiana.edu/admissions/living-learning-center.html 7.640693487520203
https://luddy.indiana.edu/about/fred-luddy.html 7.285768409783447
https://luddy.indiana.edu/admissions/apply/transfer.html 6.7035379711439855
https://luddy.indiana.edu/admissions/apply/freshmen.html 6.348348557297272
