# Importing Yelp Dataset

Author(s): Brian Lin, Yuqi Jiao (Anthony)

This notebook combines all of the preprocessed data so that we obtain
the original 11 features of Lee et al.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Directory layout, resolved relative to wherever the kernel was started.
CWD = Path.cwd()
ROOT = CWD.parent
# Directory holding the processed parquet files (a sub-folder of CWD).
PROC_PATH = CWD.joinpath("processed_data")
# Destination of the final combined table written by this notebook.
OUTPUT = PROC_PATH.joinpath("preprocessed.parquet.snappy")

In [3]:
# Load the three preprocessed tables from PROC_PATH.
main_df = pd.read_parquet(PROC_PATH/"joined.parquet.snappy")
# Only the review id and the language flag are needed from this table.
is_english = pd.read_parquet(PROC_PATH/"joined_data_lang_detected.parquet", columns=["r_id", "is_english"])
# Cast r_id to int (presumably so it compares equal to main_df.r_id — confirm).
is_english = is_english.astype({"r_id": int})
linguistic = pd.read_parquet(PROC_PATH/"joined_linguistic_extra.parquet.snappy")

# we assume that the latest review date is the last date of data collection.
# this assumption is supported by the fact that the newest user is also on this
# date. it seems that no more new data is obtained after this date.

# needs to be obtained before excluding invalid rows.
last_record_date = main_df.r_date.max()
# Series.argmax() returns an integer *position*, not an index label, so it must
# be paired with .iloc. Plain `r_id[pos]` is a label lookup and only gives the
# right row when the index happens to be a default 0..n-1 RangeIndex.
last_record_date_id = main_df.r_id.iloc[main_df.r_date.argmax()]

In [4]:
# Gather the r_id of every review that has to be dropped, one rule at a time.

# 1) rows whose language flag is non-zero.
#    NOTE(review): the earlier comment said non-English rows carry the value 1;
#    the filter drops every non-zero value — confirm the flag's encoding.
flagged_language_ids = is_english.loc[is_english["is_english"] != 0, "r_id"]

# 2) reviews with fewer than one useful vote (i.e. zero or negative counts).
low_vote_ids = main_df.loc[main_df["r_useful"] < 1, "r_id"]

# 3) reviews dated on or before the author's account-creation date.
predates_account_ids = main_df.loc[
    main_df["r_date"] <= main_df["u_yelping_since"], "r_id"]

to_exclude = set(flagged_language_ids) | set(low_vote_ids) | set(predates_account_ids)

print(f"excluding: {len(to_exclude)} records")
keep_mask = ~main_df["r_id"].isin(to_exclude)
main_df = main_df[keep_mask]


excluding: 872 records


In [5]:
# Attach the linguistic features. pd.merge defaults to an inner join, and
# validate="1:1" raises if r_id is duplicated on either side.
main_df = pd.merge(main_df, linguistic, on="r_id", validate="1:1")
# Sanity check: surviving rows plus excluded ids should account for every
# row of the linguistic table.
rows_add_up = len(main_df) + len(to_exclude) == len(linguistic)
print(f"correct n rows: {rows_add_up}")

correct n rows: True


In [6]:
# Elapsed months between each review and the last data-collection date.
# Month length is the average Gregorian month: 365.2425 days / 12 =
# 2,629,746 seconds. An explicit pd.Timedelta is used instead of
# np.timedelta64(1, 'M') because recent pandas versions reject the ambiguous
# 'M' unit when dividing timedelta data.
one_month = pd.Timedelta(seconds=2_629_746)
month_diff = (last_record_date - main_df.r_date) / one_month

# Any review posted exactly on the last record date has an elapsed time of 0,
# which would make the vote scaling below divide by zero. Replace every such
# zero (there may be more than one review on that timestamp) with the smallest
# non-zero elapsed time found among the remaining reviews.
no_time_elapsed = month_diff == 0
month_diff.loc[no_time_elapsed] = month_diff[~no_time_elapsed].min()

# scale votes by elapsed months
for vote_col in ("r_useful", "r_funny", "r_cool"):
    main_df[vote_col] = main_df[vote_col] / month_diff

In [7]:
# Account age in months at the moment each review was posted
# (review date minus the user's yelping-since date).
# The divisor is the average Gregorian month (365.2425 days / 12 =
# 2,629,746 s), written as a pd.Timedelta because recent pandas versions
# reject the ambiguous np.timedelta64(1, 'M') unit.
u_month_age = (main_df.r_date - main_df.u_yelping_since) / pd.Timedelta(seconds=2_629_746)

main_df["u_month_age"] = u_month_age

In [8]:
# Drop identifiers, raw text, raw dates and the vote counts that are not
# part of the final feature set.
main_df = main_df.drop(
    columns=[
        "b_id",
        "r_funny",
        "r_cool",
        "u_id",
        "r_date",
        "r_text",
        "u_yelping_since",
    ]
)

In [9]:
# Arrange the columns in the same order as Lee et al.
col_order = [
    "r_id",
    # review features
    "r_stars", "r_stars_square", "r_length",
    # user features
    "u_friends_count", "u_review_count", "u_month_age",
    # business features
    "b_stars", "b_review_count",
    # linguistic features, then the (scaled) useful-vote column
    "r_sen", "r_sub", "r_rea", "r_useful",
]
main_df = main_df.loc[:, col_order]
main_df.head()

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,b_stars,b_review_count,r_sen,r_sub,r_rea,r_useful
0,4,5,25,43,1,9,11.538455,4.0,181,0.305871,0.775379,81.59,0.01183
1,9,3,9,103,1,24,5.092347,4.5,13,0.027222,0.558611,76.15,0.014351
2,11,5,25,179,7,39,76.740056,2.5,8,0.280583,0.48997,70.43,0.02532
3,18,4,16,109,112,74,60.558653,4.0,398,0.130754,0.289683,78.59,0.011194
4,19,5,25,78,8,27,48.32233,4.0,55,0.236861,0.368667,77.06,0.028382


In [10]:
# Persist the combined feature table as parquet at the OUTPUT path.
main_df.to_parquet(path=OUTPUT)