In [1]:
from pymongo import MongoClient
from tqdm.notebook import tqdm
import numpy as np
import pickle
import os

# Open a client with the driver's default connection settings.
conn = MongoClient()

# Handles for the database and the two collections the export cells read from.
process_fibvid = conn['process_fibvid']
tweets_st = process_fibvid['tweets_st']
tweets_bert = process_fibvid['tweets_bert']

In [2]:
def process(coll, **extractors):
    """Scan every document of ``coll`` and collect per-document features.

    Parameters
    ----------
    coll
        Collection to scan. Every document must carry a ``tweetId`` field.
    **extractors
        Named callables. Each one is called with every document, and its
        results are stacked (in scan order) into one array stored under the
        extractor's keyword name.

    Returns
    -------
    tweetIds : dict
        Maps each ``tweetId`` to the row position of that document in the
        returned arrays. If a ``tweetId`` occurs more than once, the last
        position wins.
    embs : dict
        One ``numpy`` array per extractor, keyed by the extractor's name.

    Raises
    ------
    KeyError
        If a document has no ``tweetId`` field. Exceptions raised by the
        extractors propagate unchanged.
    """
    # Only used to size the progress bar. The method takes no filter
    # argument, so nothing is passed positionally.
    total = coll.estimated_document_count()
    tweetIds = []
    embs = {k: [] for k in extractors}
    with conn.start_session() as session:
        # A no_cursor_timeout cursor is not cleaned up after idling, so it
        # is closed explicitly here, including when an extractor raises.
        with coll.find(no_cursor_timeout=True, session=session) as cursor:
            for tweet in tqdm(cursor, total=total):
                tweetIds.append(tweet['tweetId'])
                for k, e in extractors.items():
                    embs[k].append(e(tweet))
    embs = {k: np.asarray(v) for k, v in embs.items()}
    tweetIds = {v: i for i, v in enumerate(tweetIds)}
    return tweetIds, embs

In [3]:
# Export the `embs` field of every document in the tweets_st collection.
tweetId, embs = process(
    tweets_st,
    embs=lambda tweet: np.asarray(tweet['embs']),
)

  0%|          | 0/299118 [00:00<?, ?it/s]

In [4]:
# Create the output directory if it is missing. exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('st_embs', exist_ok=True)

# Every key of `embs` becomes a named array inside the compressed archive.
np.savez_compressed('st_embs/embs.npz', **embs)
# tweetId -> row position, so rows of the saved arrays can be found by tweet.
with open('st_embs/tweetId_pos.pickle', 'wb') as f:
    pickle.dump(tweetId, f)
# Drop the references so the memory can be reclaimed before the next export.
del tweetId
del embs

In [5]:
# Export two features per document of the tweets_bert collection.
# NOTE(review): [-1, :] keeps the last row along the first axis of
# `last_hidden_state`; if that axis indexes tokens this is the final token,
# not the leading one — confirm this is the intended choice.
tweetId, embs = process(
    tweets_bert,
    pooler_output=lambda tweet: np.asarray(tweet['pooler_output']),
    # np.copy detaches the selected row from the full array, so the full
    # array is not kept alive by a view.
    last_hidden_state=lambda tweet: np.copy(
        np.asarray(tweet['last_hidden_state'])[-1, :]
    ),
)

  0%|          | 0/299118 [00:00<?, ?it/s]

In [6]:
# Create the output directory if it is missing. exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('bert_embs', exist_ok=True)

# Each feature goes to its own archive, stored under the array name `embs`.
np.savez_compressed('bert_embs/pooler_output.npz', embs=embs['pooler_output'])
np.savez_compressed('bert_embs/last_hidden_state.npz', embs=embs['last_hidden_state'])
# tweetId -> row position, so rows of the saved arrays can be found by tweet.
with open('bert_embs/tweetId_pos.pickle', 'wb') as f:
    pickle.dump(tweetId, f)
# Drop the references so the memory can be reclaimed.
del tweetId
del embs

In [7]:
# Completion marker: this line is only reached if every cell above ran through.
print("Ok.")

Ok.
