## Feature Engineering - Venue

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from joblib import Parallel, delayed

In [2]:
data_tran = pd.read_json('data2/data_tran.json', orient='records', lines=True)
data_test = pd.read_json('data2/data_test.json', orient='records', lines=True)

n_tran = data_tran.shape[0]
n_test = data_test.shape[0]

In [3]:
def get_venue_dict(data):
    
    num_venues=466
    vector_size=21246

    venue_dict = {venue: np.zeros(vector_size, dtype=int) for venue in range(num_venues)}

    for _, row in tqdm(data.iterrows(), total=len(data)):
        venue = row['venue'] 
        authors = row['authors'] 

        for author_id in authors:
            if author_id < 21246: 
                venue_dict[venue][author_id] += 1

    return venue_dict

venue_dict = get_venue_dict(data_tran)

100%|██████████| 8460/8460 [00:00<00:00, 19711.09it/s]


In [4]:
def get_venue_array(venue, venue_dict):
    return np.array(venue_dict[venue][:100]).reshape(1, 100)

In [5]:
def get_venue_matrix(data, venue_dict):
    vectors_list = Parallel(n_jobs=-1)(delayed(get_venue_array)(row['venue'], venue_dict) for _, row in tqdm(data.iterrows(), total=len(data)))
    return np.concatenate(vectors_list, axis=0)

In [6]:
x_tran_venue = get_venue_matrix(data_tran, venue_dict)
x_test_venue = get_venue_matrix(data_test, venue_dict)

np.save('data2/x_tran_venue_a.npy', x_tran_venue)
np.save('data2/x_test_venue_a.npy', x_test_venue)

100%|██████████| 8460/8460 [31:09<00:00,  4.52it/s]
100%|██████████| 800/800 [02:49<00:00,  4.72it/s]
