### Web Crawler


In [1]:
import requests as req
from urllib.parse import urljoin
import bs4
import logging

# Root logger setup for the notebook: INFO and above, each line prefixed with
# a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
)

class Crawler:
    """Crawl English Wikipedia and save every article body as a text file.

    Links are handled as site-relative paths such as ``/wiki/Tea``.  ``run``
    takes paths from the front of ``urls_to_visit``, downloads the page,
    writes the article text below ``DIR/`` and queues the ``/wiki/`` links
    found on that page until ``num_of_files`` articles have been written or
    the queue is empty.
    """

    # Upper bound on the number of paths waiting in ``urls_to_visit``.
    MAX_QUEUE_SIZE = 130
    # Seconds to wait for the server; without a timeout a single stalled
    # connection would block the whole crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, urls=None, file_counter=0, num_of_files=int(5e4)):
        """Set up the crawl state.

        Args:
            urls: Site-relative paths to start from.  The sequence is copied,
                so neither the caller's list nor other instances are touched.
            file_counter: Number of articles already written.
            num_of_files: Stop once this many articles have been written.
        """
        # Used as a set of already processed paths (path -> 1).
        self.visited_urls = {}
        # A fresh list per instance: a mutable default argument would be
        # shared by every Crawler created without explicit ``urls``.
        self.urls_to_visit = list(urls) if urls is not None else []
        self.file_counter = file_counter
        self.start_of_url = 'https://en.wikipedia.org'
        self.num_of_files = num_of_files

    def download_url(self, url):
        """Return the body of ``url`` as text."""
        return req.get(url, timeout=self.REQUEST_TIMEOUT).text

    def write_text_to_file(self, article, f_name):
        """Write the text of ``article`` to ``DIR/<page name>.txt``.

        Args:
            article: The parsed article element, or ``None`` when the page
                has no article body (nothing is written in that case).
            f_name: Site-relative path of the page; the leading ``/wiki/``
                (6 characters) is stripped to obtain the file name.
        """
        if not article:
            return

        path = 'DIR/' + str(f_name[6:]) + '.txt'
        with open(path, 'w', encoding='utf-8') as f:
            f.write(article.text)

        # Count the file only after it has actually been written, so failed
        # writes do not eat into the ``num_of_files`` budget.
        self.file_counter += 1

    def get_linked_a(self, html, f_name):
        """Save the article contained in ``html`` and collect its wiki links.

        Returns:
            A dict mapping each distinct ``/wiki/...`` path found on the page
            to itself (insertion-ordered, i.e. in document order).  Links into
            the ``File:`` namespace are skipped.
        """
        soup = bs4.BeautifulSoup(html, 'html.parser')
        article = soup.find('div', {"class": "mw-parser-output"})
        self.write_text_to_file(article, f_name)

        links = {}
        for anchor in soup.find_all('a'):
            path = anchor.get('href')
            # Require the trailing slash: a bare '/wiki' prefix would also
            # accept '/wiki' itself and paths such as '/wikipedia...'.
            if path and path.startswith('/wiki/') and path[6:11] != 'File:':
                links[path] = path
        return links

    def crawl(self, url, f_name):
        """Download ``url``, store its article and queue newly seen links."""
        html = self.download_url(url)

        for link in self.get_linked_a(html, f_name).values():
            # The queue only grows inside this loop, so once it is full no
            # later link can be accepted either.
            if len(self.urls_to_visit) >= self.MAX_QUEUE_SIZE:
                break
            if link not in self.urls_to_visit and link not in self.visited_urls:
                self.urls_to_visit.append(link)

    def run(self):
        """Process queued paths until the queue is empty or the quota is met.

        A failure on one page is logged with its traceback and the crawl
        continues with the next queued path.
        """
        while self.urls_to_visit and self.file_counter < self.num_of_files:
            tmp = self.urls_to_visit.pop(0)
            url = str(self.start_of_url) + str(tmp)
            try:
                logging.info(f'Crawling: {url}')
                # Mark as visited before crawling so a failing page is not
                # queued again by pages that link to it.
                self.visited_urls[tmp] = 1
                self.crawl(url, tmp)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')

### File parser


In [2]:
from os.path import exists
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import codecs
import os


class file_parser:
    """Convert the raw article files in ``DIR`` into stemmed plain text.

    Each input file is stripped of blank lines, tokenized, filtered and
    stemmed, and the result is written to ``parsed_files/<index>@<name>``.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.directory = 'DIR'

    def stem_text(self, text):
        """Return ``text`` reduced to a space-separated string of stems.

        Only the first 10,000 tokens of the text are considered.  A token is
        kept when it
          * is longer than 2 characters,
          * consists of letters only (``str.isalpha``), and
          * is not an English stop word (compared case-insensitively, so
            capitalised forms such as "The" are dropped as well).
        Every kept token is then stemmed with the Porter stemmer.
        """
        word_limit_per_article = int(1e4)
        tokens = word_tokenize(text)[:word_limit_per_article]

        stems = [
            self.stemmer.stem(word)
            for word in tokens
            if len(word) > 2
            and word.isalpha()
            and word.lower() not in self.stop_words
        ]
        return " ".join(stems)

    def parse_file(self, file_name, new_name):
        """Read ``file_name``, stem its text and write it to ``new_name``.

        Lines consisting only of whitespace are dropped before tokenizing.
        Undecodable bytes are ignored on read and unencodable characters on
        write (``errors="ignore"``).
        """
        with codecs.open(file_name, 'r', errors="ignore", encoding="utf-8") as f:
            # join() instead of repeated += keeps this linear in file size.
            text = "".join(line for line in f.readlines() if not line.isspace())

        stemmed = self.stem_text(text)

        with codecs.open(new_name, 'w+', errors="ignore", encoding="utf-8") as f:
            f.write(stemmed)

    def run(self):
        """Parse every regular file in ``self.directory``.

        Output files are named ``parsed_files/<i>@<original name>`` where
        ``i`` is the position of the entry in the directory listing.
        """
        parsed_files = 'parsed_files/'
        # Make sure the target directory exists before the first write.
        os.makedirs(parsed_files, exist_ok=True)

        for i, filename in enumerate(os.listdir(self.directory)):
            file_name = os.path.join(self.directory, filename)
            # Skip sub-directories and other non-file entries; they cannot
            # be opened as text.
            if not os.path.isfile(file_name):
                continue
            self.parse_file(file_name, parsed_files + str(i) + "@" + filename)



### Search Engine

In [42]:
from os.path import exists
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import glob
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
import pickle
import numpy as np
from sklearn.preprocessing import normalize


class SearchEngine:
    """Rank the parsed articles against a free-text query.

    The engine works on artifacts stored in ``matrixes/``: the TF-IDF matrix
    of all files in ``parsed_files/``, the vocabulary (term -> row index) and,
    for SVD search, the reduced document matrix and the SVD components.
    Document ids in the results are positions in ``self.all_files``.
    """

    def __init__(self,number_of_results=10, svd_k = 100,search_svd=True, prepare_necc_files=False):
        """Load the stored matrices and select the search strategy.

        Args:
            number_of_results: Maximum number of hits returned per query.
            svd_k: Number of SVD components used when artifacts are built.
            search_svd: Search in the SVD-reduced space when true, otherwise
                use cosine similarity on the full TF-IDF matrix.
            prepare_necc_files: Build missing artifacts before loading them.
        """
        # These must be set first: prepare_files() reads self.all_files.
        # NOTE(review): document ids rely on glob returning the files in the
        # same order as when the matrices were built - confirm.
        self.directory = 'parsed_files'
        self.all_files = glob.glob(f"{self.directory}/*")
        self.stemmer = PorterStemmer()
        self.num_of_results = number_of_results

        if prepare_necc_files:
            self.prepare_files(svd_k)

        # Stored as documents x terms; transposed to terms x documents.
        self.tfidf_matrix = scipy.sparse.load_npz("matrixes/tfidf_matrix.npz").T
        self.vocab = self.load_vocab()

        if search_svd:
            self.svd_matrix = np.load("matrixes/svd_matrix.npz")['svd_matrix']
            self.svd_components = np.load("matrixes/svd_comps.npz")['svd_comps']
            self.search_function = self.search_with_svd
        else:
            self.search_function = self.search_with_no_svd

    def load_vocab(self):
        """Return the term -> index mapping stored in ``matrixes/union.pkl``."""
        # SECURITY: unpickling can execute arbitrary code; only load a file
        # that was written by prepare_files() on a trusted machine.
        with open("matrixes/union.pkl", "rb") as a_file:
            return pickle.load(a_file)

    def handle_input(self, inp):
        """Turn the query string into a term-count vector and run the search.

        The query is split on whitespace and every word is stemmed; words
        that are not part of the vocabulary are ignored.
        """
        stemmed_words = [self.stemmer.stem(word) for word in inp.split()]
        input_vector = np.zeros(shape=self.tfidf_matrix.shape[0])

        for word in stemmed_words:
            # .get() avoids a KeyError for unknown words, and the explicit
            # None test keeps the term whose index is 0.
            index = self.vocab.get(word)
            if index is not None:
                input_vector[index] += 1
        return self.search_function(input_vector)

    def search(self, input):
        """Return the best matches for the query string ``input``."""
        return self.handle_input(input)

    def prepare_files(self, svd_k):
        """Build and store the search artifacts unless all of them exist.

        The four files are derived from one fit and must stay consistent, so
        if any of them is missing the whole set is rebuilt and overwritten.
        """
        artifacts = (
            "matrixes/tfidf_matrix.npz",
            "matrixes/svd_matrix.npz",
            "matrixes/union.pkl",
            "matrixes/svd_comps.npz",
        )
        if all(exists(path) for path in artifacts):
            return

        tfidf_vec = TfidfVectorizer(input ='filename')
        matrix = tfidf_vec.fit_transform(self.all_files)

        svd = TruncatedSVD(n_components=svd_k).fit(matrix)
        svd_matrix = svd.transform(matrix)
        svd_components = svd.components_

        # save_npz / savez_compressed append the ".npz" suffix themselves.
        scipy.sparse.save_npz("matrixes/tfidf_matrix", matrix, compressed=True)
        with open("matrixes/union.pkl", "wb") as a_file:
            pickle.dump(tfidf_vec.vocabulary_, a_file)
        np.savez_compressed("matrixes/svd_matrix", svd_matrix = svd_matrix)
        np.savez_compressed("matrixes/svd_comps", svd_comps=svd_components)

    def search_with_svd(self, input_vector):
        """Rank documents by dot product with the query in the SVD space.

        Prints the file names of the best matches and returns them as a list
        of ``(document_id, score)`` tuples, best first, holding at most
        ``num_of_results`` entries.
        """
        # Project the query into the reduced space, then score every document.
        svd_inp = self.svd_components @ input_vector
        svd_q = self.svd_matrix @ svd_inp

        res = [(document_id, svd_q[document_id]) for document_id in range(len(self.all_files))]
        res.sort(key=lambda x:x[1], reverse=True)

        # Slice first so that a corpus smaller than num_of_results cannot
        # cause an IndexError while printing.
        top = res[:self.num_of_results]
        print("with svd")
        for document_id, _ in top:
            print(self.all_files[document_id])

        return top

    def search_with_no_svd(self, input_vector):
        """Rank documents by cosine similarity on the full TF-IDF matrix.

        Documents whose similarity is zero are left out.  Prints the file
        names of the best matches and returns them as a list of
        ``(document_id, score)`` tuples, best first, holding at most
        ``num_of_results`` entries.
        """
        # Unit-length query row and unit-length document columns, so their
        # product is the cosine of the angle between query and document.
        sparse_vec = normalize(scipy.sparse.csr_matrix(input_vector))
        matrix = normalize(self.tfidf_matrix, axis=0)

        # The result is a single row with one score per document; reading it
        # as a dense vector avoids slicing out every column separately.
        scores = (sparse_vec @ matrix).toarray().ravel()
        x = [(i, score) for i, score in enumerate(scores) if score != 0]

        x.sort(key=lambda x: x[1], reverse=True)
        top = x[:self.num_of_results]
        print("without svd")
        for document_id, _ in top:
            print(self.all_files[document_id])

        return top
        


In [43]:
# Create the engine with its default arguments (10 results, SVD search);
# the constructor loads the stored matrices from "matrixes/".
se = SearchEngine()
# se1 = SearchEngine()

In [44]:
if __name__ == '__main__':
    # Data preparation steps, left disabled here because they only need to
    # run once (crawl the pages, then parse the downloaded files):
    #   Crawler(urls=['/wiki/Tea']).run()
    #   file_parser().run()

    # Run one sample query; the engine prints the matching file names.
    query = 'god is love god is life'
    se.search(query)
    # Drop the reference to the engine once the demo query is done.
    del se

with svd
parsed_files\6323@Christian_theology.txt
parsed_files\15310@God_in_Christianity.txt
parsed_files\15319@God_the_Father#Christianity.txt
parsed_files\15320@God_the_Father.txt
parsed_files\37858@Son_of_God_(Christianity).txt
parsed_files\28950@Nontrinitarianism.txt
parsed_files\6283@Christianity.txt
parsed_files\15309@God_in_Abrahamic_religions.txt
parsed_files\17517@Holy_Trinity.txt
parsed_files\41775@Trinity.txt


In [72]:
from http.server import HTTPServer, BaseHTTPRequestHandler, SimpleHTTPRequestHandler
import socketserver
import json
import urllib

class CORSHTTPRequestHandler(SimpleHTTPRequestHandler):
    """Request handler that answers every GET with a fixed, CORS-enabled body."""

    def do_GET(self):
        """Log the parsed request and reply ``200`` with a plain-text greeting.

        The request path and query parameters are printed for debugging; they
        do not influence the response.
        """
        parsed_url = urllib.parse.urlparse(self.path)
        request_uri = parsed_url.path
        request_params = urllib.parse.parse_qs(parsed_url.query)
        print(parsed_url)
        print(request_uri)
        print(request_params)

        body = b'hello czesc'

        # HTTP requires this order: status line, headers, ONE blank line
        # (end_headers), then the body.
        self.send_response(200)
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', 'GET')
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()

        self.wfile.write(body)

Handler = CORSHTTPRequestHandler
# NOTE(review): binding to port 80 usually needs elevated privileges -
# confirm this is intended for local development.
httpd = socketserver.TCPServer(('localhost',80),Handler)
try:
    # Blocks until the cell/process is interrupted.
    httpd.serve_forever()
except KeyboardInterrupt:
    # Interrupting is the normal way to stop the server; not an error.
    pass
finally:
    # Release the listening socket on every exit path, not only on Ctrl+C.
    httpd.server_close()

ParseResult(scheme='', netloc='', path='/', params='', query='data=iufhbqwif+wf+qf+qw', fragment='')
/
{'data': ['iufhbqwif wf qf qw']}


127.0.0.1 - - [18/May/2022 00:00:03] "GET /?data=iufhbqwif+wf+qf+qw HTTP/1.1" 200 -


In [63]:
# Remove the notebook's reference to the server object.
del httpd