In [None]:

from os.path import exists
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import glob
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
import pickle
import numpy as np
from sklearn.preprocessing import normalize


In [None]:
class SearchEngine:
    """Rank the files under ``parsed_files`` against a free-text query.

    Documents are represented by a TF-IDF matrix and queries by a term-count
    vector over the same vocabulary. Depending on ``search_svd`` the ranking
    uses either cosine similarity on the TF-IDF vectors or a dot product in
    the ``TruncatedSVD``-reduced space.

    The matrices and the vocabulary are read from four files in the current
    working directory: ``tfidf_matrix.npz``, ``svd_matrix.npz``,
    ``svd_comps.npz`` and ``union.pkl``.
    """

    # Cached artifacts, relative to the current working directory.
    _TFIDF_FILE = "tfidf_matrix.npz"
    _SVD_MATRIX_FILE = "svd_matrix.npz"
    _SVD_COMPS_FILE = "svd_comps.npz"
    _VOCAB_FILE = "union.pkl"

    def __init__(self, number_of_results=10, svd_k=100, search_svd=True, prepare_necc_files=False):
        """Load the cached matrices and select the search strategy.

        Args:
            number_of_results: Maximum number of file paths printed per query.
            svd_k: Number of SVD components; only used when the artifacts are
                (re)built via ``prepare_necc_files``.
            search_svd: If true, queries are ranked with ``search_with_svd``,
                otherwise with ``search_with_no_svd``.
            prepare_necc_files: If true, build any missing artifact files
                before loading them.

        Raises:
            OSError: If an artifact file is missing and was not prepared.
        """
        # The file list must be set before prepare_files() runs, because that
        # method reads self.all_files.
        self.directory = 'parsed_files'
        # NOTE(review): document ids are positions in this list, so its order
        # has to match the order used when the matrices were built. glob does
        # not promise a stable order - consider sorting and rebuilding.
        self.all_files = glob.glob(f"{self.directory}/*")
        self.stemmer = PorterStemmer()
        self.num_of_results = number_of_results

        if prepare_necc_files:
            self.prepare_files(svd_k)

        # Saved as documents x terms; transposed so that rows are terms and
        # columns are documents.
        self.tfidf_matrix = scipy.sparse.load_npz(self._TFIDF_FILE).T
        self.vocab = self.load_vocab()
        # np.load returns an archive object that keeps the file open; the
        # context manager closes it once the array has been extracted.
        with np.load(self._SVD_MATRIX_FILE) as archive:
            self.svd_matrix = archive['svd_matrix']
        with np.load(self._SVD_COMPS_FILE) as archive:
            self.svd_components = archive['svd_comps']

        if search_svd:
            self.search_function = self.search_with_svd
        else:
            self.search_function = self.search_with_no_svd

    def load_vocab(self):
        """Return the term -> row-index mapping stored in ``union.pkl``."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load a vocabulary file that this notebook produced itself.
        with open(self._VOCAB_FILE, "rb") as vocab_file:
            return pickle.load(vocab_file)

    def handle_input(self, inp):
        """Turn a query string into a term-count vector and run the search.

        The query is split on whitespace and each token is stemmed. Tokens
        that are not in the vocabulary are ignored. The results are printed
        by the configured search function; nothing is returned.

        Args:
            inp: The raw query string.
        """
        input_vector = np.zeros(shape=self.tfidf_matrix.shape[0])

        for word in inp.split():
            # .get() avoids a KeyError on unknown words, and the explicit
            # None check keeps the term whose index is 0 (which is falsy).
            index = self.vocab.get(self.stemmer.stem(word))
            if index is not None:
                input_vector[index] += 1

        self.search_function(input_vector)

    def prepare_files(self, svd_k):
        """Build the TF-IDF/SVD artifacts unless all of them already exist.

        If any of the four files is missing, everything is recomputed and all
        four are rewritten, so that the matrices and the vocabulary always
        come from the same fit.

        Args:
            svd_k: Number of components for ``TruncatedSVD``.
        """
        artifacts = (
            self._TFIDF_FILE,
            self._SVD_MATRIX_FILE,
            self._SVD_COMPS_FILE,
            self._VOCAB_FILE,
        )
        if all(exists(path) for path in artifacts):
            return

        tfidf_vec = TfidfVectorizer(input='filename')
        matrix = tfidf_vec.fit_transform(self.all_files)

        svd = TruncatedSVD(n_components=svd_k).fit(matrix)
        svd_matrix = svd.transform(matrix)
        svd_components = svd.components_

        scipy.sparse.save_npz(self._TFIDF_FILE, matrix, compressed=True)
        with open(self._VOCAB_FILE, "wb") as vocab_file:
            pickle.dump(tfidf_vec.vocabulary_, vocab_file)
        np.savez_compressed(self._SVD_MATRIX_FILE, svd_matrix=svd_matrix)
        np.savez_compressed(self._SVD_COMPS_FILE, svd_comps=svd_components)

    def search_with_svd(self, input_vector):
        """Rank documents by dot product with the query in the SVD space.

        Prints up to ``num_of_results`` file paths, best match first.

        Args:
            input_vector: Term-count vector of the query.

        Returns:
            The printed file paths, in ranking order.
        """
        query_latent = self.svd_components @ input_vector
        scores = np.asarray(self.svd_matrix @ query_latent).ravel()

        # Only rank ids that have both a score and a file name.
        scores = scores[:min(len(scores), len(self.all_files))]

        # A stable sort on the negated scores keeps ties in document order.
        order = np.argsort(-scores, kind="stable")[:self.num_of_results]

        results = [self.all_files[int(doc_id)] for doc_id in order]
        for path in results:
            print(path)
        return results

    def search_with_no_svd(self, input_vector):
        """Rank documents by cosine similarity on the TF-IDF vectors.

        Documents that share no term with the query (similarity 0) are left
        out. Prints up to ``num_of_results`` file paths, best match first.

        Args:
            input_vector: Term-count vector of the query.

        Returns:
            The printed file paths, in ranking order; empty when the query
            contains no known term.
        """
        query = np.asarray(input_vector, dtype=float).ravel()
        query_norm = np.linalg.norm(query)
        if query_norm == 0:
            # Nothing to match against, and the cosine would divide by zero.
            return []

        # Rows of the transposed matrix are documents, so this yields one dot
        # product per document in a single sparse multiplication.
        dots = np.asarray(self.tfidf_matrix.T @ query).ravel()
        squared = self.tfidf_matrix.multiply(self.tfidf_matrix)
        doc_norms = np.sqrt(np.asarray(squared.sum(axis=0)).ravel())

        scores = np.zeros_like(dots, dtype=float)
        has_terms = doc_norms > 0
        scores[has_terms] = dots[has_terms] / (doc_norms[has_terms] * query_norm)

        # A stable sort on the negated scores keeps ties in document order.
        order = np.argsort(-scores, kind="stable")
        matches = [int(doc_id) for doc_id in order if scores[doc_id] > 0]

        results = [self.all_files[doc_id] for doc_id in matches[:self.num_of_results]]
        for path in results:
            print(path)
        return results


In [None]:
# Build the engine in plain TF-IDF mode: search_svd=False makes handle_input
# dispatch to search_with_no_svd. The artifact files are loaded as-is
# (prepare_necc_files defaults to False), so they must already exist.
se = SearchEngine(search_svd=False)

In [None]:
# Example query; the matching file paths are printed by the engine.
inp = 'united states election'
se.handle_input(inp)

In [None]:

#server
from http.server import HTTPServer, BaseHTTPRequestHandler, SimpleHTTPRequestHandler

class Serv(BaseHTTPRequestHandler):
    """HTTP handler that answers every GET and POST with a header-only 200 response."""

    def _set_response(self):
        """Send a 200 status line with the CORS and content-type headers."""
        print('set response')
        self.send_response(200)
        # BaseHTTPRequestHandler provides send_header; there is no set_header,
        # so the previous call raised AttributeError on every request.
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header('Content-type', 'text/html')
        self.end_headers()

    def do_GET(self):
        """Handle a GET request: headers only, no body is written."""
        print('do get')
        self._set_response()
        # self.wfile.write("GET request for {}".format(self.path).encode('utf-8'))

    def do_POST(self):
        """Handle a POST request: headers only, the request body is not read."""
        print('do post')
        self._set_response()




# Serve on localhost:8042 until the cell is interrupted. serve_forever() blocks,
# so the close has to sit in a finally clause: otherwise an interrupt skips it
# and the port stays bound when the cell is run again.
httpd = HTTPServer(('localhost', 8042), Serv)
try:
    httpd.serve_forever()
finally:
    httpd.server_close()