In [3]:
import _pickle
import gc
from os import listdir
from os import makedirs
from os.path import join

import pandas as pd
from tqdm import tqdm

This notebook loads the pickled review dataframes produced by the previous steps, flattens the `votes` column, and converts column dtypes to reduce memory usage, so that the combined review dataframe can be pickled to disk.

In [25]:
# Folder that receives the aggregated dataframe pickled at the end of the notebook
save_folder = join('pickles', '3.agg_dfs')

# Register tqdm's pandas integration: this adds `.progress_apply()` to pandas
# objects; a plain `.apply()` call does not display a bar.
tqdm.pandas(desc='progress bar')

In [None]:
# Helper functions used by the cells below

def flatten_votes(df):
    """Expand the dict-valued ``votes`` column into one column per vote type.

    Parameters
    ----------
    df : pandas.DataFrame
        Review dataframe with a ``votes`` column whose entries are dicts
        (e.g. ``{'cool': 0, 'funny': 1, 'useful': 2}``).

    Returns
    -------
    pandas.DataFrame
        A new dataframe (``df`` itself is not modified) in which ``votes`` is
        removed and one column per vote key is appended. The ``cool``,
        ``funny`` and ``useful`` keys are renamed to ``review_cool``,
        ``review_funny`` and ``review_useful``; any other key keeps its name.

    Notes
    -----
    Entries that are not dicts (e.g. NaN) contribute no keys, so the row gets
    NaN in every vote column.
    """
    # Build all vote columns in one pass from the list of dicts.
    # ``df.votes.apply(pd.Series)`` builds a separate Series for every row,
    # which is very slow on frames with many reviews.
    vote_records = [entry if isinstance(entry, dict) else {} for entry in df['votes']]
    votes = pd.DataFrame(vote_records, index=df.index)

    # Same column order as before: the remaining review columns, then the votes
    flattened = pd.concat([df.drop('votes', axis=1), votes], axis=1)
    return flattened.rename(columns={'cool': 'review_cool',
                                     'funny': 'review_funny',
                                     'useful': 'review_useful'})


def col_to_small_int(df, columns):
    """Downcast the selected columns to a small unsigned integer dtype to reduce memory usage.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to modify. The columns are replaced in place on this object.
    columns : iterable of str
        Names of the columns to downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, for convenient chaining.

    Notes
    -----
    Columns whose values all fit in 0..255 become ``uint8``. Columns with
    larger values get the smallest unsigned dtype that still holds them, so
    no value is ever changed by the conversion.
    """
    for col in columns:
        # A plain ``astype('uint8')`` silently wraps values above 255
        # (300 becomes 44), which corrupts large vote counts. ``to_numeric``
        # with ``downcast='unsigned'`` only narrows the dtype when every value
        # survives the cast unchanged.
        df[col] = pd.to_numeric(df[col], downcast='unsigned')
    return df


def clean_df(df):
    """Amend column datatypes and clean the review dataframe to reduce memory usage.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Review dataframe containing the columns ``review_cool``,
        ``review_funny``, ``review_useful``, ``stars``, ``date`` and ``type``.

    Returns
    -------
    pandas.DataFrame
        The same dataframe with small-integer vote/star columns, a datetime
        ``date`` column, no ``type`` column and a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    # Change small int columns to a compact unsigned dtype to save memory
    df = col_to_small_int(df, ['review_cool', 'review_funny', 'review_useful', 'stars'])

    # Change date to datetime dtype
    df['date'] = pd.to_datetime(df['date'])

    # Drop type column as no variance
    df.drop('type', axis=1, inplace=True)

    # Reset the index for easier reference. ``drop=True`` discards the old
    # index directly instead of materialising it as a column and dropping it
    # afterwards; the two-step form fails when the index has a name (the new
    # column is then not called 'index') or when an 'index' column already exists.
    df.reset_index(drop=True, inplace=True)

    return df

In [6]:
# Load relevant files
# NOTE(review): unpickling can execute arbitrary code - only load pickles that
# were produced by the earlier steps of this pipeline.
# Sorted because listdir returns names in arbitrary order; sorting keeps the
# row order of review_data identical from run to run.
files_to_load = sorted(file for file in listdir('pickles') if 'reviewdf' in file)

# Load and flatten one file at a time: the handle is closed as soon as the
# frame is read, and the raw frame can be released before the next file is loaded.
reviewdf_list = []
for file in tqdm(files_to_load):
    with open(join('pickles', file), 'rb') as handle:
        reviewdf_list.append(flatten_votes(_pickle.load(handle)))

# Create review_data dataframe
review_data = pd.concat(reviewdf_list)

# Clean the dataframe
review_data = clean_df(review_data)

# Clean memory
del reviewdf_list, files_to_load
gc.collect()

In [24]:
# Pickle the resulting dataframe
# Make sure the output folder exists; open() does not create missing directories
makedirs(save_folder, exist_ok=True)

# The context manager flushes and closes the file once the dump has finished,
# instead of leaving the handle open for the garbage collector.
with open(join(save_folder, 'review_data.pkl'), 'wb') as handle:
    _pickle.dump(review_data, handle)