In [1]:
import numpy as np
import pandas as pd
import multiprocessing as mp

# Seed NumPy's legacy global random generator so that any code drawing from
# `np.random` in this session produces the same sequence on every run.
np.random.seed(0)

In [2]:
# Raw input table, read as-is from CSV.
df = pd.read_csv('data/medium.csv')

# Preprocessed table with every cell passed through `str`, so any non-string
# cell (for example NaN read back from an empty CSV field, which becomes the
# text 'nan') is a plain Python string afterwards.
#
# `DataFrame.applymap` was deprecated in pandas 2.1 in favour of
# `DataFrame.map`, so use whichever this pandas version provides. The method is
# looked up on the class rather than the instance so that a column that happens
# to be named "map" cannot shadow it.
_cellwise = pd.DataFrame.map if hasattr(pd.DataFrame, 'map') else pd.DataFrame.applymap
prep_df = _cellwise(pd.read_csv('data/prep_df.csv'), str)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

# One TF-IDF vectorizer per text field. `min_df` is the minimum number of
# documents a term must occur in to be kept: 2 for titles, 3 for subtitles,
# 5 for the body text.
for_title, for_subTitle, for_text = (
    TfidfVectorizer(min_df=threshold) for threshold in (2, 3, 5)
)

# Fit each vectorizer on its own column (title, then subTitle, then text),
# place the three sparse matrices side by side in that order, and convert the
# stacked result to CSR format.
tfidf_df = scipy.sparse.hstack([
    vectorizer.fit_transform(prep_df[column])
    for vectorizer, column in (
        (for_title, 'title'),
        (for_subTitle, 'subTitle'),
        (for_text, 'text'),
    )
]).tocsr()

In [4]:
import pickle
from pathlib import Path
# Create the `models` directory (relative to the current working directory)
# for the pickles written below; `exist_ok=True` means no error is raised if
# the directory is already there, so the cell can be re-run.
Path("models").mkdir(exist_ok=True)

def dump(model, in_str):
    """Pickle ``model`` to ``models/<in_str>.pickle``.

    Parameters
    ----------
    model : object
        Any picklable object.
    in_str : str
        File stem (without extension) used for the output file name.

    The ``models`` directory must already exist; an existing file with the
    same name is overwritten. The highest pickle protocol supported by the
    running interpreter is used.
    """
    target_path = ''.join(('models/', in_str, '.pickle'))
    with open(target_path, 'wb') as out_file:
        pickle.dump(model, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the three TF-IDF vectorizers, each under its own variable name:
# models/for_title.pickle, models/for_subTitle.pickle, models/for_text.pickle.
dump(for_title, 'for_title')
dump(for_subTitle, 'for_subTitle')
dump(for_text, 'for_text')

In [6]:
from encoders import text_to_vec

def app_text_to_vec(X, col_names=('title', 'subTitle', 'text'), dim=300, processes=6):
    """Apply ``text_to_vec`` to text columns of ``X`` in parallel.

    Each selected column is mapped through ``text_to_vec`` using a process
    pool, and the per-row results are laid out as ``dim`` numeric columns
    named ``<column>_0`` ... ``<column>_<dim-1>``. The per-column frames are
    then concatenated side by side in the order given by ``col_names``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the columns listed in ``col_names``. It is only read,
        never modified, so no defensive copy is made.
    col_names : sequence of str, optional
        Columns to vectorise. Defaults to ``('title', 'subTitle', 'text')``.
    dim : int, optional
        Number of output columns created per input column (default 300).
        ``text_to_vec`` is expected to return a sequence of exactly this
        length for every value; pandas raises ``ValueError`` otherwise.
    processes : int, optional
        Number of worker processes in the pool (default 6).

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``X`` with ``len(col_names) * dim`` columns. When
        ``col_names`` is empty, an empty frame with ``X``'s index is returned.
    """
    frames = []
    # A single pool is shared by all columns so workers are started only once.
    with mp.Pool(processes) as pool:
        for col in col_names:
            vectors = pool.map(text_to_vec, X[col])
            frames.append(pd.DataFrame(
                vectors,
                index=X.index,
                columns=[f'{col}_{i}' for i in range(dim)],
            ))

    if not frames:
        # pd.concat raises on an empty list; return an empty, aligned frame.
        return pd.DataFrame(index=X.index)
    return pd.concat(frames, axis=1)

In [7]:
# Run `text_to_vec` over the title, subTitle and text columns of `prep_df`
# (in parallel worker processes) and collect the results as one wide frame.
w2v_df = app_text_to_vec(prep_df)

In [8]:
from scipy.sparse import save_npz
# Write both feature sets to disk: the sparse TF-IDF matrix in SciPy's .npz
# format, and the `text_to_vec` features as CSV. `index=False` leaves the row
# index out of the CSV file.
save_npz('data/tfidf_df.npz', tfidf_df)
w2v_df.to_csv('data/w2v_df.csv', index=False)