In [11]:
import fitz
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance; stem_words() calls its .stem() on every token.
stemmer = PorterStemmer()

In [12]:

def getText(filePath):
    '''Get text from pdf file.

    Opens ``filePath`` with ``fitz.open`` and returns the text of every page,
    concatenated in page order, as a single string.

    The document is opened in a ``with`` block so it is closed as soon as
    extraction finishes, including when a page raises part-way through.
    Without this, each processed file would stay open until garbage
    collection.
    '''
    with fitz.open(filePath) as doc:
        # join() builds the result once instead of re-copying the growing
        # string for every page.
        return "".join(page.get_text() for page in doc)


def remove_stopwords(text):
    '''Drop English stop words from ``text``.

    The text is tokenised with ``nltk.word_tokenize``; a token is discarded
    when its lower-cased form is in NLTK's English stop-word list. The
    surviving tokens keep their original casing and are returned joined by
    single spaces.
    '''
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in nltk.word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return ' '.join(kept)

def stem_words(text):
    '''Reduce every word of ``text`` to its stem.

    Tokenises with ``nltk.word_tokenize``, passes each token through the
    module-level ``stemmer`` and returns the stems joined by single spaces.
    '''
    return ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(text))

def remove_urls(text):
    '''Delete URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character. Matching is case-sensitive.
    Surrounding whitespace is left untouched.
    '''
    return re.compile(r'https?://\S+|www\.\S+').sub('', text)

def clean_text(filePath):
    '''Read a PDF and return its text normalised for vectorisation.

    Steps, applied in this order:
      1. extract the text with getText() and lower-case it
      2. strip URLs
      3. delete runs of digits
      4. delete every character that is neither a word character nor
         whitespace
      5. drop stop words
      6. stem the remaining words
      7. delete runs of two or more underscores (underscores count as word
         characters, so step 4 leaves them in place)
      8. discard any remaining non-ASCII characters
    '''
    cleaned = getText(filePath).lower()
    cleaned = remove_urls(cleaned)
    for pattern in (r'\d+', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    cleaned = stem_words(remove_stopwords(cleaned))
    cleaned = re.sub(r'_{2,}', '', cleaned)
    return cleaned.encode('ascii', 'ignore').decode()


In [13]:
# Directory holding the documents to index; its entries are listed and each one
# is passed to clean_text(). NOTE: absolute, machine-specific Windows path —
# edit this before running the notebook on another machine.
path_to_files = 'D:\\Document-CSDLDPT'

In [14]:
import os


# os.listdir() returns entries in arbitrary order and also lists
# sub-directories. Keep regular files only (a directory cannot be opened as a
# document) and sort case-insensitively so the corpus order is the same on
# every run; the name itself breaks ties between names differing only by case.
listFile = sorted(
    (
        name for name in os.listdir(path_to_files)
        if os.path.isfile(os.path.join(path_to_files, name))
    ),
    key=lambda name: (name.lower(), name),
)
listFile

['1  CLASS ACTION SETTLEMENT AGREEMENT AND RELEASE 5.pdf',
 '1 Approved April 9 2010 Revised April 12 2019 AST Guidelines for.pdf',
 '900 History geography and auxiliary disciplines.pdf',
 'A Literature Review on Trade and Informal Labour Markets in Developing Countries.pdf',
 'A-level Philosophy Specification Specification for first teaching.PDF',
 'An overview of scientific and scholarly journal publishing.pdf',
 'AP World History Modern Course and Exam Description Effective.pdf',
 'ASEAN Model Contractual Clauses for Cross Border Data Flows.pdf',
 'Asphalt Art Safety Study.pdf',
 'ATTACK Design and Philosophy subs revision.pdf',
 'Brick by Brick Wikipedia and Libraries building on each other.pdf',
 'California Data Breach Report.pdf',
 'California Pay Data Reporting Portal.pdf',
 'CalPERS Health Program Guide.pdf',
 'Code and Data for the Social Sciences.pdf',
 'Community ART Group Toolkit.pdf',
 'convension de palermo.pdf',
 'Cost of a Data Breach Report 2022.pdf',
 'Data Protectio

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Build the cleaned corpus: listText[i] is the cleaned text of listFile[i].
listText = []
for file in listFile:
    listText.append(clean_text(os.path.join(path_to_files, file)))
    

In [17]:
# TF-IDF model over the cleaned corpus. min_df=2 makes the vectorizer ignore
# terms whose document frequency is below 2, i.e. terms found in one document only.
tfidfvec = TfidfVectorizer(min_df=2)
# Fit the vocabulary/IDF weights and vectorise the corpus in one pass; X is the
# resulting document-term matrix for listText.
X = tfidfvec.fit_transform(listText)

In [18]:
# Raw term-count model over the same corpus, created with default settings
# (no min_df argument is passed here, unlike the TF-IDF vectorizer).
countVec = CountVectorizer()
# X2 is the document-term count matrix for listText.
X2 = countVec.fit_transform(listText)

In [19]:
import pickle

In [20]:
# Persist every artifact to its own pickle file in the working directory:
# the file names, the cleaned texts, both fitted vectorizers and both
# document-term matrices. Files are written in the order listed.
for pickle_name, payload in (
    ('listFile.pickle', listFile),
    ('listText.pickle', listText),
    ('tfidfvec.pickle', tfidfvec),
    ('countVec.pickle', countVec),
    ('tfidf-file-to-vec.pickle', X),
    ('count-file-to-vec.pickle', X2),
):
    with open(pickle_name, 'wb') as file:
        pickle.dump(payload, file)
