In [1]:
import pandas as pd
from joblib import Parallel, delayed, cpu_count
import _pickle
import json
import gc
from os.path import join
from os import listdir
from tqdm import tqdm

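This notebook converts the output of the previous step (pickled lists of JSON strings) into one pandas dataframe per split and pickles each dataframe separately. Working split by split keeps memory usage manageable; the split dataframes are cleaned and merged in a later step.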

In [12]:
# 1. Set variables
# -------------------

# Folder the pickled review lists are read from
input_folder = join('pickles', '1.split_reviews')
# Folder the pickled dataframes are written to
save_folder = join('pickles', '2.split_review_dfs')


# 2. Define functions
# -------------------

def create_row(item):
    """
    Builds a pandas dataframe from a single JSON object given as a string.

    item: one JSON object formatted as a string; newline characters are
          stripped from it before parsing.

    The parsed object is loaded with its keys as the index and then
    transposed, so the keys become the columns. For a flat object of scalar
    values this yields a single row.
    """
    cleaned = item.replace('\n', '')
    record = json.loads(cleaned)
    keyed_by_field = pd.DataFrame.from_dict(record, orient='index')
    return keyed_by_field.T

def create_frames(review_list,savepath, cpus=cpu_count()-1):
    """
    Concatenates list of reviews into a pandas dataframe and saves a pickled object to savepath

    review_list: a list of JSON objects formatted as strings
    savepath: path of the pickle file to write; it is opened in 'wb' mode,
              so an existing file at that path is overwritten
    cpus: Number of processes to run, defaults to number of cores minus 1.
          A value of 0 (what the default evaluates to on a single-core
          machine) is replaced by 1, because joblib rejects n_jobs=0.

    Returns None; the dataframe is only written to disk.
    """
    # joblib raises an error for n_jobs == 0; any other value is passed through unchanged
    n_jobs = cpus if cpus != 0 else 1

    # Creates a list of dataframe rows to be concatenated
    frames = Parallel(n_jobs=n_jobs)(delayed(create_row)(review) for review in tqdm(review_list))

    print("Concatenating frames to a pandas dataframe")
    df = pd.concat(frames)

    print("Pickling dataframe to "+ savepath)
    # Context manager closes the file even if pickling fails
    with open(savepath,'wb') as outfile:
        _pickle.dump(df,outfile)

    print("Clearing memory")
    # Drops this function's references only; a reference to review_list held
    # by the caller keeps that list alive.
    del frames,review_list, df
    gc.collect()

    print('Review dataframe part created')


# 3. Main script
# -----------------------

# A. List the input files
# Sorted because listdir returns entries in arbitrary order; sorting keeps the
# numbering of the output files reproducible between runs.
input_filelist = sorted(listdir(input_folder))
num_splits = len(input_filelist)

# B. Pickle dataframes to disk
# Each split is loaded only when it is about to be converted, so a single list
# of reviews is held in memory at a time.
# NOTE: unpickling can execute arbitrary code; only run this on files written
# by the previous step of this pipeline.
for i, filename in enumerate(input_filelist):
    with open(join(input_folder,filename),'rb') as infile:
        lor = _pickle.load(infile)
    create_frames(lor,join(save_folder,'reviewdf{}_{}.pkl'.format(str(i+1),str(num_splits))))
    # Release this split before the next one is loaded
    del lor