In [258]:
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds

from IPython.display import clear_output
from tqdm.notebook import tqdm

In [280]:
class Vectorizer():
    def __init__(self, corp_path):
        self.corp_path = corp_path
    
    def upload_corp(self):
        with open(self.corp_path, 'r') as f:
            self.corp = list(set(f.read().split('\n')[:-1]))
        
        self.corp = [text.replace(',', ' ') for text in self.corp]
    
    
    def log(self, part):
        clear_output(wait=True)
        print(f'{part} is processing')
        
    def make_tf_idf_matrix(self, token_pattern=None):
        if token_pattern:
            self.tfidf = TfidfVectorizer(token_pattern=token_pattern)
        else:
            self.tfidf = TfidfVectorizer()
            
        self.A = self.tfidf.fit_transform(self.corp)
        self.feature_list = self.tfidf.get_feature_names_out()
        
    def make_svd(self, n=30):
        self.u, self.sigma, self.vT = svds(self.A, n)
        self.singular_indicies = np.argsort(-self.sigma)
        
        self.u = self.u[:, self.singular_indicies]
        self.sigma = np.diag(self.sigma[self.singular_indicies])
        self.vT = self.vT[self.singular_indicies, :]
        
        self.embedded_matrix = self.sigma@self.vT
        
        self.words_embedding_dict = dict(zip(self.feature_list, self.embedded_matrix.T))
    
    def get_emb_dict(self):
        
        self.log('Upload')
        self.upload_corp()
        self.log('TfIdf')
        self.make_tf_idf_matrix()
        self.log('SVD')
        self.make_svd(n=300)
        
        return self.words_embedding_dict

In [281]:
vect = Vectorizer('./clean_corp_wiki_pa.txt')
emb_dict = vect.get_emb_dict()

SVD is processing


In [282]:
emb_dict

{'ਅਅਤ': array([-1.27313924e-03,  6.94473008e-05, -6.88329240e-05,  1.53903319e-03,
        -1.09801517e-04, -7.01990841e-04, -1.24940636e-03, -1.38337382e-03,
         3.26092601e-04, -6.97109821e-05, -3.33869171e-04,  5.93370080e-05,
        -7.47430395e-04, -1.19843663e-03, -2.67774430e-04,  3.42030678e-04,
         2.55820935e-04,  5.64238543e-05,  4.58379614e-04, -1.43771291e-04,
        -7.51744925e-04,  1.27251229e-04, -7.51203466e-05,  7.74242319e-04,
        -1.86991027e-05,  3.09159447e-04, -3.14522628e-04, -9.99091002e-04,
        -4.65541263e-04, -6.60721798e-04,  1.22788889e-03,  6.63089760e-05,
        -8.88293316e-04, -2.26689617e-03,  8.74498571e-04, -2.28299950e-03,
        -5.11137239e-04, -1.14079618e-03, -9.41114514e-04, -5.19480768e-04,
         4.17173834e-04, -2.86342408e-04,  1.14997963e-03, -4.98283038e-04,
        -4.59070185e-04,  2.56992572e-04, -2.41617944e-04, -4.81294053e-05,
        -8.68258934e-04,  4.50553501e-04,  1.10334993e-03,  5.37321963e-04,
     

In [286]:
len(vect.vT[1,:])

23117