In [1]:
import fitz
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

In [2]:

def getText(filePath):
    '''Get text from pdf file'''
    doc = fitz.open(filePath)
    text = ""
    for page in doc:
        text+=page.get_text()
    return text


def remove_stopwords(text):
    '''Remove stop word'''
    stop_words = set(stopwords.words('english')) # Define the set of English stopwords
    words = nltk.word_tokenize(text) # Tokenize the input text
    filtered_words = [word for word in words if word.lower() not in stop_words] # Remove stopwords
    return ' '.join(filtered_words) # Join the filtered words into a string

def stem_words(text):
    '''convert to root word'''
    word_tokens = nltk.word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stems)

def clean_text(filePath):
    text = getText(filePath)
    text = text.lower()
    text = re.sub(r'http\S+', '', text) # remove url
    # text = ''.join([i for i in text if not i.isdigit()]) #remove number
    text = re.sub(r'\d+', '', text) #remove number
    text = re.sub(r'[^\w\s]', '', text) # remove special character, white space
    text = remove_stopwords(text) # remove stop word
    text = stem_words(text) #conver words to root words
    text = text.encode('ascii', 'ignore').decode() #remove character not ascii
    return text


In [3]:
import os

listFile = os.listdir('documents')
listFile

['1  CLASS ACTION SETTLEMENT AGREEMENT AND RELEASE 5.pdf',
 '1 Approved April 9 2010 Revised April 12 2019 AST Guidelines for.pdf',
 '100 Philosophy parapsychology and occultism psychology.pdf',
 '2009_10_13_MWC_STM_Report.pdf',
 '2023 Congressional Art Competition  Student Information.pdf',
 '900 History geography and auxiliary disciplines.pdf',
 'A CrossLingual Dictionary for English Wikipedia Concepts.pdf',
 'A Plan for a.pdf',
 'An overview of scientific and scholarly journal publishing.pdf',
 'Annex 11 Computerised Systems.pdf',
 'Annual Convension 2015 Key Issues emerging out of Panel.pdf',
 'AP World History Modern Course and Exam Description Effective.pdf',
 'Art  Finance Report 2017 5th edition.pdf',
 'ART BUILDING  ART ANNEX AB.pdf',
 'Arts Education.pdf',
 'Automatically Assessing the Quality of Wikipedia Articles.pdf',
 'Brick by Brick Wikipedia and Libraries building on each other.pdf',
 'Central bank digital currencies system design and interoperability.pdf',
 'Cold Weath

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
listText = []
for file in listFile:
    # path = os.path.join('documents', file)
    # print(path)
    text = clean_text(os.path.join('documents', file))
    listText.append(text)
    

In [6]:
query = clean_text('Multimedia50-55.pdf')

In [7]:
tfidfvec = TfidfVectorizer()
X = tfidfvec.fit_transform(listText)
X

In [8]:
queryVec = tfidfvec.transform([query])

In [9]:
similarity = cosine_similarity(queryVec, X)
similarity

array([[0.03053217, 0.02230474, 0.03087921, 0.07050303, 0.03336728,
        0.02131897, 0.03066563, 0.03130259, 0.07050303, 0.10014076,
        0.04384613, 0.06419728, 0.03519373, 0.00239299, 0.06079113,
        0.09933763, 0.03183094, 0.05503372, 0.01256906, 0.05378003,
        0.00274584, 0.03579042, 0.02520821, 0.04232854, 0.0222257 ,
        0.0222257 , 0.03482099, 0.08139815, 0.07460772, 0.03627958,
        0.00847478, 0.02544231, 0.03798847, 0.03070185, 0.04236948,
        0.10522281, 0.10522281, 0.05116392, 0.05285894, 0.03043238,
        0.08967053, 0.0346895 , 0.07594876, 0.07594876, 0.07594876,
        0.08749575, 0.0246227 , 0.03643701, 0.05278263, 0.10891718,
        0.10891718, 0.04602932, 0.05867589, 0.06187553, 0.04925892,
        0.07342382, 0.04213997, 0.01027177, 0.01440987, 0.01440987,
        0.06029825, 0.06029825, 0.05537146, 0.06706447, 0.00489827,
        0.01288668, 0.03891997, 0.01629503, 0.05058909, 0.05614433,
        0.01517361, 0.0370552 , 0.06159847, 0.01

In [10]:
similar_indices = similarity.argsort()[0][::-1][:10]
res = [{listFile[i], similarity[0][i]} for i in similar_indices]
res

[{0.9999999999999998, 'Multimedia50-55.pdf'},
 {0.6258670716936332,
  'Multimedia_Database_Management_Systems_(Artech House).pdf'},
 {0.5531382756761338,
  'Multimedia_Database_Management_Systems_(Artech House)-20-50.pdf'},
 {0.10891718314247391,
  'How to Write a Paper in Scientific Journal Style and Format.pdf'},
 {0.10891718314247391, 'How-to-Write-Guide-v10-2014.pdf'},
 {0.10688598191420823, 'Placing a text in context.pdf'},
 {0.10522281206892091,
  'Garfield E Citation Indexes for Science A New Dimension in .pdf'},
 {0.10522281206892091,
  'Garfield E Citation Indexes for Science A New Dimension in.pdf'},
 {0.10135324199444724, 'Why We Read Wikipedia.pdf'},
 {0.10014076442357916, 'Annex 11 Computerised Systems.pdf'}]

In [11]:
countVec = CountVectorizer()
X2 = countVec.fit_transform(listText)

In [12]:
queryVec2 = countVec.transform([query])

In [13]:
similarity2 = cosine_similarity(queryVec, X2)
similarity2

array([[0.05082019, 0.05084523, 0.04608237, 0.08639682, 0.0520884 ,
        0.03046109, 0.06803744, 0.05734435, 0.08639682, 0.12577217,
        0.09230597, 0.08223198, 0.04737802, 0.00543902, 0.08193817,
        0.12234327, 0.05958389, 0.08704198, 0.02545491, 0.07627758,
        0.00547452, 0.06084274, 0.03950761, 0.06193417, 0.0316777 ,
        0.0316777 , 0.06449152, 0.10245943, 0.12233114, 0.05685766,
        0.01384885, 0.04299444, 0.07696803, 0.05725396, 0.06632632,
        0.11756723, 0.11756723, 0.0670254 , 0.08170698, 0.04744863,
        0.1403118 , 0.07215952, 0.0868921 , 0.0868921 , 0.0868921 ,
        0.12183119, 0.05817276, 0.04493053, 0.08545634, 0.1327158 ,
        0.1327158 , 0.07148666, 0.08837349, 0.09028543, 0.06634609,
        0.11926215, 0.066561  , 0.02254099, 0.02670534, 0.02670534,
        0.09712853, 0.09712853, 0.08518395, 0.08883541, 0.0127747 ,
        0.02357098, 0.06421224, 0.02918967, 0.0985806 , 0.0736253 ,
        0.02588113, 0.0565113 , 0.08484843, 0.02

In [14]:
similar_indices2 = similarity2.argsort()[0][::-1][:10]
res2 = [{listFile[i], similarity2[0][i]} for i in similar_indices2]
res2

[{0.9382124299610846, 'Multimedia50-55.pdf'},
 {0.6063929709223785,
  'Multimedia_Database_Management_Systems_(Artech House)-20-50.pdf'},
 {0.5721515178839358,
  'Multimedia_Database_Management_Systems_(Artech House).pdf'},
 {0.1403118005999232, 'GPT4 System Card  OpenAI.pdf'},
 {0.13579189803604605, 'Why We Read Wikipedia.pdf'},
 {0.13271579615139978,
  'How to Write a Paper in Scientific Journal Style and Format.pdf'},
 {0.13271579615139978, 'How-to-Write-Guide-v10-2014.pdf'},
 {0.12577216670106567, 'Annex 11 Computerised Systems.pdf'},
 {0.12234326840661322,
  'Automatically Assessing the Quality of Wikipedia Articles.pdf'},
 {0.12233114387285227,
  'Evolution and Link Prediction of the Wikipedia Network.pdf'}]

In [15]:
import pickle

In [16]:
with open('dele\\listFile.pickle', 'wb') as file:
    pickle.dump(listFile, file)

with open('dele\\listText.pickle', 'wb') as file:
    pickle.dump(listText, file)

with open('dele\\tfidfvec.pickle', 'wb') as file:
    pickle.dump(tfidfvec, file)

with open('dele\\countVec.pickle', 'wb') as file:
    pickle.dump(countVec, file)

with open('dele\\tfidf-file-to-vec.pickle', 'wb') as file:
    pickle.dump(X, file)
    
with open('dele\\count-file-to-vec.pickle', 'wb') as file:
    pickle.dump(X2, file)
