In [6]:
import csv

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Module-level file locations for the toy dataset: the CSV that holds the
# review text and the CSV intended for the embedded reviews.
# TF_IDF_vectorization receives its paths as arguments rather than reading
# these names directly.
input_path = 'dataset/toy_3-core_80_20_with_text.csv'
output_path = 'dataset/toy_embdded_review.csv'

def load_text(path):
    """Load review records from a CSV file, skipping its header row.

    Each data row is expected to hold ``reviewerID, asin, rating`` followed
    by the review text. The file is parsed with the ``csv`` module so quoted
    fields (including commas and line breaks inside quotes) are read
    correctly. If the review text was written unquoted and therefore split
    into several fields, the pieces are re-joined with ``,`` so no text is
    lost.

    Args:
        path: Location of the CSV file (read as UTF-8).

    Returns:
        A list of dicts with the string-valued keys ``'reviewerID'``,
        ``'asin'``, ``'rating'`` and ``'review'``, one per non-empty data
        row, in file order. An empty or header-only file yields ``[]``.

    Raises:
        IndexError: If a non-empty row has fewer than three fields.
    """
    data = []
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that span several lines.
    with open(path, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header; tolerate a completely empty file
        for row in reader:
            if not row:
                # Blank line in the file: nothing to record.
                continue
            data.append({
                'reviewerID': row[0],
                'asin': row[1],
                'rating': row[2],
                # Unquoted commas in the text produce extra fields; put the
                # commas back instead of gluing the neighbouring words together.
                'review': ','.join(row[3:]),
            })
    return data

def get_pure_text(review):
    """Return the value stored under the ``'review'`` key of a record."""
    text = review['review']
    return text

def get_review_info(review):
    """Return a copy of a record without its ``'review'`` entry.

    The record passed in is left unmodified; a missing ``'review'`` key
    raises ``KeyError``.
    """
    info = review.copy()
    del info['review']
    return info

def TF_IDF_vectorization(input_path, output_path):
    """Embed each review as a TF-IDF vector and save it with its metadata.

    Records are read with ``load_text(input_path)``. A ``TfidfVectorizer``
    capped at 20 features is fitted on the review texts, and the result is
    written to ``output_path`` as CSV: the non-text fields of every record
    followed by its TF-IDF columns.
    """
    records = load_text(input_path)

    # Split every record into its text and its remaining metadata fields.
    texts = [get_pure_text(record) for record in records]
    info_frame = pd.DataFrame([get_review_info(record) for record in records])
    assert info_frame.shape[0] == len(texts)

    tfidf_matrix = TfidfVectorizer(max_features=20).fit_transform(texts)
    tfidf_frame = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix)

    # Place metadata and embedding side by side, row for row.
    combined = pd.concat([info_frame, tfidf_frame], axis=1)
    combined.to_csv(output_path)

In [None]:
# Vectorize every dataset split. The paths must be f-strings: without the
# prefix the literal text "{t}" ends up in the file name, so every iteration
# would target the same (non-existent) file instead of its own split.
for t in ['train', 'val', 'test', 'all']:
    input_path = f'dataset/processed_3-core_80_20_{t}_with_text.csv'
    output_path = f'dataset/{t}_embdded_review.csv'
    TF_IDF_vectorization(input_path, output_path)
    print(f'{t} dataset processed')