In [4]:
import _pickle
import gc
import os
from os.path import join

import pandas as pd
from tqdm import tqdm, tqdm_notebook

In [5]:
# Register `progress_apply` on pandas objects (used by the amend_* functions below).
# `tqdm.pandas` takes keyword options for the progress bar, not a bar class as a
# positional argument, so it is called without arguments (text progress bar).
tqdm.pandas()

In [6]:
# 1. Variables
# ------------

# Folder holding the raw dataset files, and folder receiving the pickled dataframes
input_folder = 'raw_data'
save_folder = join('pickles', '3.agg_dfs')

# Full path of every raw input file
business_data_inputpath = join(input_folder, 'yelp_academic_dataset_business.json')
checkin_data_inputpath = join(input_folder, 'yelp_academic_dataset_checkin.json')
tip_data_inputpath = join(input_folder, 'yelp_academic_dataset_tip.json')
user_data_inputpath = join(input_folder, 'yelp_academic_dataset_user.json')
photo_data_inputpath = join(input_folder, 'photo_id_to_business_id.json')


# 2. Load dataframes
# ------------------

def _read_json_lines(path):
    """Read one input file into a dataframe with `pd.read_json(..., lines=True)`."""
    return pd.read_json(path, lines=True)


business_data = _read_json_lines(business_data_inputpath)
checkin_data = _read_json_lines(checkin_data_inputpath)
tip_data = _read_json_lines(tip_data_inputpath)
user_data = _read_json_lines(user_data_inputpath)
photo_data = _read_json_lines(photo_data_inputpath)

In [135]:
# 3. Define functions
# -------------------


def convert_to_small_int(df,columns):
    """
    Downcast dataframe columns to the smallest unsigned integer type that fits.

    A fixed cast to uint8 silently wraps every value above 255 (300 becomes 44),
    so the target type is chosen from the data instead: uint8 when all values
    fit, otherwise the next larger unsigned type. Columns that cannot be held
    in an unsigned integer type (negative or missing values) keep their dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to amend; it is modified in place.
    columns : iterable of str
        Names of the numeric columns to downcast.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object, for call chaining.
    """
    for col in columns:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')
    return df


def amend_business_data(df):
    """
    Shrink the column types of the business dataset and flatten its nested columns.

    Steps:
      * drop the `type` column;
      * downcast `review_count` to the smallest unsigned integer type that holds
        every value, cast `stars` to float32 and `state` to category;
      * explode the `attributes` dictionaries into one column per key;
      * explode the `hours` dictionaries into one column per key, renaming the
        weekday columns to `hours_<Day>`.

    The original `attributes` and `hours` columns are kept next to the exploded
    ones. `categories` is left as is because there are over 1,000 unique categories.

    `progress_apply` must have been registered on pandas (via `tqdm.pandas`)
    before calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw business data with `review_count`, `stars`, `type`, `state`,
        `attributes` and `hours` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the amended types and the exploded columns appended.
    """
    # Change types to reduce memory usage
    # -----------------------------------

    # Dropping without `inplace` yields a new frame, so none of the column
    # assignments below leak into the caller's dataframe.
    df = df.drop('type', axis=1)

    # A fixed uint8 cast would silently wrap review counts above 255, so the
    # unsigned type is picked from the data instead.
    df['review_count'] = pd.to_numeric(df['review_count'], downcast='unsigned')
    df['stars'] = df['stars'].astype('float32')
    df['state'] = df['state'].astype('category')


    # Explode attributes
    # -------------------

    attributes = df['attributes'].progress_apply(pd.Series)
    df = pd.concat([df, attributes], axis=1)


    # Explode hours
    # --------------

    hours = df['hours'].progress_apply(pd.Series)
    # Prefix the weekday columns so they are recognisable as opening hours
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday')
    hours = hours.rename(columns={day: 'hours_{}'.format(day) for day in day_names})
    df = pd.concat([df, hours], axis=1)

    # Note - Leaving categories as is because there are over 1,000 unique categories
    return df


def amend_checkin_data(df):
    """
    Explode the `checkin_info` dictionaries into one column per key.

    The exploded columns are appended to the right of the existing ones and
    the `type` column is removed from the result.
    """
    exploded = df['checkin_info'].progress_apply(pd.Series)
    combined = pd.concat([df, exploded], axis=1)
    return combined.drop('type', axis=1)


def amend_tip_data(df):
    """
    Reduce the memory footprint of the tip dataset.

    `likes` is downcast to the smallest unsigned integer type that holds every
    value (uint8 when all values fit). A fixed uint8 cast would silently wrap
    any like count above 255. The `type` column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Tip data with `likes` and `type` columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object.
    """
    df['likes'] = pd.to_numeric(df['likes'], downcast='unsigned')
    df.drop('type',axis=1,inplace=True)
    return df


def amend_photo_data(df):
    """
    Reshape the photo dataset into one row per record and shrink its memory use.

    The dataframe is transposed and column 0 of the transpose is taken; each of
    its cells is then exploded into a row of columns. `label` is stored as a
    category.
    """
    records = df.T[0]
    photos = records.progress_apply(pd.Series)
    photos['label'] = photos['label'].astype('category')
    return photos


def amend_user_data(df):
    """
    Amend the user dataframe to extract features and reduce memory usage.

    Steps:
      * explode the `compliments` dictionaries into `compliment_<key>` columns;
      * cast `average_stars` to float32, `fans` and `review_count` to uint16,
        and parse `yelping_since` into datetimes;
      * add `num_friends`, the length of each `friends` entry, as uint16;
      * drop the `type` column;
      * explode the `votes` dictionaries into `votes_<key>` columns as uint32;
      * add `yelp_tenure`, the gap between the latest `yelping_since` in the
        data and each user's own `yelping_since`.

    `progress_apply` must have been registered on pandas (via `tqdm.pandas`)
    before calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw user data with `compliments`, `average_stars`, `fans`,
        `review_count`, `yelping_since`, `friends`, `type` and `votes` columns.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the amended types and the extra columns appended.
    """

    # Explode compliments
    # --------------------

    compliments = df['compliments'].progress_apply(pd.Series)

    # Add prefix to compliments column name
    compliments = compliments.add_prefix('compliment_')

    # `concat` builds a new frame, so the assignments below never touch the
    # caller's dataframe.
    df = pd.concat([df, compliments], axis=1)

    # Release the intermediate frame before the next large allocation
    del compliments
    gc.collect()


    # Amend types
    # ------------

    # Bracket assignment always targets the column; attribute assignment only
    # does so when the column already exists.
    df['average_stars'] = df['average_stars'].astype('float32')
    df['fans'] = df['fans'].astype('uint16')
    df['review_count'] = df['review_count'].astype('uint16')
    # `astype('datetime64')` is a unit-less cast that pandas 2 rejects;
    # `to_datetime` parses the values on old and new releases alike.
    df['yelping_since'] = pd.to_datetime(df['yelping_since'])


    # TODO - If important at some point, explode elite and extract features


    # Extract number of friends
    # -------------------------

    df['num_friends'] = df['friends'].apply(len).astype('uint16')


    # Drop type column
    # ----------------

    df = df.drop('type', axis=1)


    # Explode votes
    # --------------

    votes = df['votes'].progress_apply(pd.Series)
    # Add prefix to column name to identify votes
    votes = votes.add_prefix('votes_').astype('uint32')

    df = pd.concat([df, votes], axis=1)


    # Extracting Yelp tenure
    # ----------------------

    # Tenure is measured against the most recent join date found in the data
    end_date = df['yelping_since'].max()
    df['yelp_tenure'] = end_date - df['yelping_since']

    return df

In [6]:
# 4. Run functions to amend data
# ---------------------------
# Each dataframe name is rebound to its amended version, so this cell is not
# idempotent: running it a second time feeds already-amended frames back into
# the amend_* functions. Reload the raw data (section 2) before re-running it.
# Several amend_* functions call `progress_apply`, which has to be registered
# on pandas through `tqdm.pandas(...)` before this cell runs.

business_data = amend_business_data(business_data)
checkin_data = amend_checkin_data(checkin_data)
tip_data = amend_tip_data(tip_data)
photo_data = amend_photo_data(photo_data)
user_data = amend_user_data(user_data)

100%|██████████| 85901/85901 [00:32<00:00, 2653.34it/s]
100%|██████████| 85901/85901 [00:29<00:00, 2924.74it/s]


In [136]:
# 5. Pickle dataframes
# ---------------------

# Create the target folder up front so a missing directory does not abort the
# save after all the processing above has already run.
os.makedirs(save_folder, exist_ok=True)

for filename, dataframe in (
    ('business_data.pkl', business_data),
    ('checkin_data.pkl', checkin_data),
    ('tip_data.pkl', tip_data),
    ('photo_data.pkl', photo_data),
    ('user_data.pkl', user_data),
):
    # The context manager flushes and closes each file as soon as its dump is
    # done, instead of leaving five open handles to the garbage collector.
    with open(join(save_folder, filename), 'wb') as pickle_file:
        _pickle.dump(dataframe, pickle_file)