In [153]:
import nltk
from collections import Counter
import numpy as np
import json
from pathlib import Path
import pickle
from sklearn.decomposition import TruncatedSVD

from scipy import sparse

In [130]:
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.porter.PorterStemmer()

In [131]:
def read_file(filepath):
    with open(filepath) as f:
        data = json.load(f)
        return data['text'], data['id']

In [132]:
def process_text(text: str) -> dict:
    """For a given text returns bag of words (a dict)"""

    text = text.lower()
    words = nltk.tokenize.word_tokenize(text)
    words = [w for w in words if w not in stopwords]
    words = [stemmer.stem(w) for w in words]
    words = [w for w in words if len(w)>2]

    return dict(Counter(words))

In [150]:
def build_tbd_matrix(docs: list[dict]) -> sparse.coo_array: 
    """Given bags of words (dicts) return term by document matrix."""
    terms = {}
    row  = []
    col = []
    data = []
    for doc_index, doc in enumerate(docs):
        for term, count in doc.items():
            term_index = terms.setdefault(term, len(terms))
            row.append(term_index)
            col.append(doc_index)
            data.append(count)

    return sparse.coo_array((data, (row, col)), shape=(len(terms), len(docs))), terms

In [139]:
def mul_by_idf(matrix) -> sparse.coo_array:
    """Multiplies given matrix by idf vector."""
    idf = np.log(matrix.shape[1]/(matrix>0).sum(axis=1))
    return matrix.multiply(idf)

In [143]:
def normalize_col(matrix) -> sparse.coo_array:
    """Normalizes columns so that each feature col has equal l2 norm."""
    return matrix.multiply(1/sparse.linalg.norm(matrix, axis=0))

In [144]:
def save(**kwargs):
    """Saves things using pickle."""
    
    if "tbd_matrix" in kwargs:
        matrix = kwargs["tbd_matrix"].tocsr()
        sparse.save_npz('tbd_matrix.npz', matrix)
    for name, thing in kwargs.items():
        with open(f'{name}.pickle', "wb") as f:
            pickle.dump(thing, f)

In [169]:
def load(*args):
    """Loads things from pickle files."""
    loaded = []
    for name in args:
        with open(f'{name}.pickle', "rb") as f:
            loaded.append( pickle.load(f))
        
    return loaded

Processing

In [44]:
def processed_docs(n):
    filenames = Path("PlainTextWikipedia\data").glob('*')
    i = 0
    for filename in filenames:
        try:
            txt = read_file(filename)
        except Exception as e:
            print("Problem with reading file: ", e)
            continue
        try:
            file = process_text(txt)
        except Exception as e:
            print("Problem with reading file: ", e)
            continue
        yield file
        i+=1
        if i>=n:
            break 

In [45]:
docs = list(processed_docs(20))

In [232]:
tbd_matrix, terms = build_tbd_matrix(docs)
tbd_matrix = mul_by_idf(tbd_matrix)
tbd_matrix = normalize_col(tbd_matrix)
save(tbd_matrix, terms)

ValueError: inconsistent shapes

In [71]:
m, t = load()

In [192]:
def lower_matrix_approx(matrix, k):
    """svd"""
    svd = TruncatedSVD(n_components=k).fit(matrix.T)
    low_rank_matrix = svd.transform(matrix.T)
    return low_rank_matrix.T, svd

Querying

In [147]:
def query_text_to_vector(txt, terms) -> sparse.coo_array:
    words = process_text(txt).keys()
    cords = [terms[w] for w in words if w in terms] 
    query_vector = sparse.coo_array( 
            ([1]*len(cords), (cords, [0]*len(cords))), 
            shape=(len(terms), 1)
        )
    query_vector = query_vector/sparse.linalg.norm(query_vector)
    return query_vector

In [210]:
def get_similar_documents(matrix, query_vector, svd=None):
    query_vector = query_vector.todense()
    if svd:
        query_vector = svd.transform(query_vector.T).T
    similarities = query_vector.T @ matrix
    k
    return similarities

In [148]:
v = query_text_to_vector("hi this is unfortunate air force one", t)

In [248]:
def svd(matrix, k):
    """svd"""
    return sparse.linalg.svds(matrix.T, k=k)


def lower_rank(svd, k):
    """Lower rank matrix approximation."""
    u, s, vt = svd
    return u[:, :k] @ np.diag(s[:k]) @ vt[:k, :]


def svd_transform_vector(vector, svd, k):
    """Svd transform vector."""
    u, s, vt = svd
    return vt[:k, :] @ vector

In [249]:
svd = svd(m, 10)

In [260]:
lower_rank(svd,10).shape

(20, 2285)