# Importing Yelp Dataset

Author(s): Brian Lin, Yuqi Jiao (Anthony)

This notebook combines all of the preprocessed data so that we obtain
the original 11 features of Lee et al.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Directory layout, all resolved relative to where the notebook is launched.
CWD = Path.cwd()
ROOT = CWD.parent

# Inputs: intermediate files written by the preprocessing steps.
PROC_PATH = CWD / "processed_data"
# Outputs: experiment-ready splits, kept one level above the notebook directory.
READY_DATA_DIR = ROOT / "ready_data"

# Fixed seed so the train/val/test splits are reproducible.
RANDOM_SEED = 2

In [3]:
# Joined review/user/business table.
main_df = pd.read_parquet(PROC_PATH / "joined.parquet.snappy")

# Language-detection flags; only the key and the flag are needed.
# r_id is cast to int so it can be matched against main_df.r_id.
is_english = pd.read_parquet(
    PROC_PATH / "joined_data_lang_detected.parquet",
    columns=["r_id", "is_english"],
).astype({"r_id": int})

# Extra linguistic features, keyed by r_id.
linguistic = pd.read_parquet(PROC_PATH / "joined_linguistic_extra.parquet.snappy")

In [4]:
# Collect the r_ids of every review that must be dropped, one rule at a time.

# Rule 1: language flag is non-zero. The original note says these are the
# non-English reviews (flag value 1).
# NOTE(review): the column name `is_english` suggests the opposite polarity - confirm.
flagged_language_ids = is_english.loc[is_english.is_english != 0, "r_id"]

# Rule 2: fewer than one useful vote (this drops zero as well as negative counts).
low_useful_ids = main_df.loc[main_df.r_useful < 1, "r_id"]

# Rule 3: review dated on or before the day the user's account was created.
predates_account_ids = main_df.loc[main_df.r_date <= main_df.u_yelping_since, "r_id"]

to_exclude = set(flagged_language_ids) | set(low_useful_ids) | set(predates_account_ids)

print(f"excluding: {len(to_exclude)} records")
keep_mask = ~main_df.r_id.isin(to_exclude)
main_df = main_df[keep_mask]

excluding: 872 records


In [5]:
# Attach the linguistic features: inner join on r_id, and fail if r_id is
# duplicated on either side.
main_df = pd.merge(main_df, linguistic, how="inner", on="r_id", validate="one_to_one")

# Sanity check: surviving rows plus excluded rows should equal the row count
# of the linguistic table.
row_count_matches = len(main_df) + len(to_exclude) == len(linguistic)
print(f"correct n rows: {row_count_matches}")

correct n rows: True


In [6]:
# Account age in months at the time each review was posted.
#
# A "month" is not a fixed duration, so the divisor is spelled out explicitly
# instead of using np.timedelta64(1, 'M'): newer pandas releases reject the
# ambiguous 'M' unit in timedelta arithmetic. The value below is the average
# Gregorian month, 365.2425 / 12 = 30.436875 days = 2,629,746 seconds, which is
# the length numpy used for that conversion, so results are unchanged.
AVERAGE_MONTH = pd.Timedelta(seconds=2_629_746)

# Timedelta / Timedelta yields a plain float number of months.
u_month_age = (main_df.r_date - main_df.u_yelping_since) / AVERAGE_MONTH

main_df["u_month_age"] = u_month_age

In [7]:
# Keep the raw review text in its own frame, keyed by r_id.
text_df = main_df.loc[:, ["r_id", "r_text"]]

# Drop identifiers, unused vote counts, the raw text and the raw dates from the
# feature table.
unneeded_cols = [
    "b_id",
    "r_funny",
    "r_cool",
    "u_id",
    "r_date",
    "r_text",
    "u_yelping_since",
]
main_df = main_df.drop(columns=unneeded_cols)

In [8]:
# Arrange the columns in the same order as Lee et al.: key, review features,
# user features, business features, text-derived scores, then the target.
col_order = [
    "r_id",
    "r_stars", "r_stars_square", "r_length",
    "u_friends_count", "u_review_count", "u_month_age",
    "b_stars", "b_review_count",
    "r_sen", "r_sub", "r_rea",
    "r_useful",
]
main_df = main_df.loc[:, col_order]
main_df.head(5)

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,b_stars,b_review_count,r_sen,r_sub,r_rea,r_useful
0,4,5,25,43,1,9,11.538455,4.0,181,0.305871,0.775379,81.59,1
1,9,3,9,103,1,24,5.092347,4.5,13,0.027222,0.558611,76.15,1
2,11,5,25,179,7,39,76.740056,2.5,8,0.280583,0.48997,70.43,2
3,18,4,16,109,112,74,60.558653,4.0,398,0.130754,0.289683,78.59,1
4,19,5,25,78,8,27,48.32233,4.0,55,0.236861,0.368667,77.06,2


In [9]:
text_df.head()

Unnamed: 0,r_id,r_text
0,4,"Wow! Yummy, different, delicious. Our favo..."
1,9,This easter instead of going to Lopez Lake we ...
2,11,My experience with Shalimar was nothing but wo...
3,18,The hubby and I have been here on multiple occ...
4,19,I go to blow bar to get my brows done by natal...


In [10]:
# First cut: 80% train, 20% remainder. Rows are shuffled before splitting, and
# the feature and text frames are split together so they stay row-aligned.
(
    train_main_df,
    remainder_main_df,
    train_text_df,
    remainder_text_df,
) = train_test_split(
    main_df,
    text_df,
    train_size=0.8,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [11]:
# Second cut: halve the remainder into validation and test. Rows are shuffled
# before splitting, and features and text are again split together.
(
    val_main_df,
    test_main_df,
    val_text_df,
    test_text_df,
) = train_test_split(
    remainder_main_df,
    remainder_text_df,
    train_size=0.5,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [12]:
# Validate the splits before anything is written to disk. Each check is printed
# (as before) and also asserted, so a failure stops a Run All before the save
# step can write misaligned or overlapping data.

# Split proportions relative to the full dataset; should be about 0.8, 0.1, 0.1.
n_total = len(main_df)
print(len(train_main_df) / n_total, len(val_main_df) / n_total, len(test_main_df) / n_total)
print(len(train_text_df) / n_total, len(val_text_df) / n_total, len(test_text_df) / n_total)
assert len(train_main_df) + len(val_main_df) + len(test_main_df) == n_total, \
    "train/val/test sizes do not add up to the full dataset"

# Records must align between the feature frame and the text frame of each split.
for split_name, feature_part, text_part in (
    ("train", train_main_df, train_text_df),
    ("val", val_main_df, val_text_df),
    ("test", test_main_df, test_text_df),
):
    aligned = np.all(feature_part.index == text_part.index)
    print(aligned)  # want: True
    assert aligned, f"{split_name}: main and text rows are not aligned"

# No record may appear in more than one split.
for pair_name, left_part, right_part in (
    ("train/val", train_main_df, val_main_df),
    ("train/test", train_main_df, test_main_df),
    ("val/test", val_main_df, test_main_df),
):
    disjoint = len(np.intersect1d(left_part.index, right_part.index)) == 0
    print(disjoint)  # want: True
    assert disjoint, f"{pair_name}: splits share records"

0.7999998447074151 0.09999988353056138 0.10000027176202343
0.7999998447074151 0.09999988353056138 0.10000027176202343
True
True
True
True
True
True


In [13]:
# Save every split as <split>_<kind>.parquet.snappy under READY_DATA_DIR.

# to_parquet does not create missing directories, so make sure the target
# exists; this is a no-op when it is already there.
READY_DATA_DIR.mkdir(parents=True, exist_ok=True)

# File stem -> frame, in the order the files are written.
split_outputs = {
    "train_main": train_main_df,
    "train_text": train_text_df,
    "val_main": val_main_df,
    "val_text": val_text_df,
    "test_main": test_main_df,
    "test_text": test_text_df,
}

for file_stem, split_frame in split_outputs.items():
    # index=False: the shuffled row index is not stored; r_id is the key
    # column present in both the main and text frames.
    split_frame.to_parquet(READY_DATA_DIR / f"{file_stem}.parquet.snappy", index=False)