# Importing Yelp Dataset

Author(s): Brian Lin, Yuqi Jiao (Anthony)

This notebook combines all preprocessed data so that we obtain
the original 11 features of Lee et al.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Directory layout, resolved relative to where the notebook is executed.
CWD = Path.cwd()
ROOT = CWD.parent

# Preprocessed inputs are read from a folder next to this notebook.
PROC_PATH = CWD.joinpath("processed_data")
# Experiment-ready outputs are written one level up.
READY_DATA_DIR = ROOT.joinpath("ready_data")

# Fixed seed shared by the sampling and the train/test split below (reproducibility).
RANDOM_SEED = 2

In [3]:
# Main table of preprocessed review records.
main_df = pd.read_parquet(PROC_PATH.joinpath("joined.parquet.snappy"))

# Language-detection result: only the key and the flag columns are loaded.
# r_id is cast to int -- presumably so it compares equal to main_df.r_id; verify dtypes.
is_english = pd.read_parquet(
    PROC_PATH.joinpath("joined_data_lang_detected.parquet"),
    columns=["r_id", "is_english"],
).astype({"r_id": int})

# Extra linguistic features, merged into main_df on r_id later in the notebook.
linguistic = pd.read_parquet(PROC_PATH.joinpath("joined_linguistic_extra.parquet.snappy"))

In [4]:
# Gather the r_id of every record to drop, then filter main_df once.
to_exclude = set().union(
    # 1) rows whose is_english flag is anything other than 0
    #    NOTE(review): the original comment says these are the non-English rows
    #    (flag value 1) -- confirm the flag's encoding against the detection step.
    is_english.loc[is_english.is_english != 0, "r_id"],
    # 2) reviews with fewer than one useful vote (zero or negative)
    main_df.loc[main_df.r_useful < 1, "r_id"],
    # 3) reviews dated on or before the user's yelping_since date
    main_df.loc[main_df.r_date <= main_df.u_yelping_since, "r_id"],
)

print(f"excluding: {len(to_exclude)} records")
keep_mask = ~main_df.r_id.isin(to_exclude)
main_df = main_df[keep_mask]

excluding: 872 records


In [5]:
# Attach the linguistic features: inner join (pandas default) keyed on r_id,
# with pandas verifying that the key is unique on both sides.
main_df = pd.merge(main_df, linguistic, on="r_id", validate="one_to_one")
# Sanity check: surviving rows plus excluded ids should add up to the
# number of rows in the linguistic table.
print(f"correct n rows: {len(main_df) + len(to_exclude) == len(linguistic)}")

correct n rows: True


In [6]:
# Months elapsed between account creation (u_yelping_since) and the review's
# post date (r_date).
#
# A calendar month is not a fixed duration, and pandas >= 2.0 refuses to divide
# by np.timedelta64(1, 'M'). The mean Gregorian month is therefore spelled out
# explicitly: 365.2425 days / 12 = 2,629,746 seconds, which is the length older
# pandas versions substituted for the 'M' unit, so results are unchanged.
MEAN_MONTH = pd.Timedelta(seconds=2_629_746)

u_month_age = (main_df.r_date - main_df.u_yelping_since) / MEAN_MONTH

main_df["u_month_age"] = u_month_age

In [7]:
# Shrink the dataset to a seeded random subset of 500,000 rows;
# ignore_index renumbers the result 0..n-1.
main_df = main_df.sample(
    n=500_000,
    ignore_index=True,
    random_state=RANDOM_SEED,
)

In [8]:
# Keep the review text, keyed by r_id, in its own frame ...
text_df = main_df.loc[:, ["r_id", "r_text"]]
# ... then remove the columns that are not needed in the feature table.
main_df = main_df.drop(
    ["b_id", "r_funny", "r_cool", "u_id", "r_date", "r_text", "u_yelping_since"],
    axis="columns",
)

In [9]:
# Arrange the columns in the same order as Lee et al.
col_order = [
    "r_id",
    # r_* columns: per-review values
    "r_stars",
    "r_stars_square",
    "r_length",
    # u_* columns: per-user values
    "u_friends_count",
    "u_review_count",
    "u_month_age",
    # b_* columns: per-business values
    "b_stars",
    "b_review_count",
    # linguistic columns (presumably sentiment / subjectivity / readability -- confirm)
    "r_sen",
    "r_sub",
    "r_rea",
    # useful-vote count
    "r_useful",
]
main_df = main_df.loc[:, col_order]
main_df.head()

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,b_stars,b_review_count,r_sen,r_sub,r_rea,r_useful
0,5010335,5,25,34,413,16,33.555332,4.5,530,0.0625,0.1,97.03,1
1,751939,5,25,359,59,74,40.736506,4.5,181,0.190288,0.310182,90.39,3
2,5790894,5,25,121,300,30,26.827553,4.5,3999,0.215486,0.295833,94.15,1
3,2845635,5,25,52,62,65,27.535094,4.5,282,0.472222,0.493056,94.76,1
4,3033355,1,1,47,3,12,15.703912,3.5,46,-0.079861,0.485417,89.38,3


In [10]:
# Preview the first five rows of the text frame.
text_df.head(n=5)

Unnamed: 0,r_id,r_text
0,5010335,"Yes: Fried Egg Rice with tri-tip, Pork Bahn Mi..."
1,751939,So i'm new to the area since my new position i...
2,5790894,It was my first time trying St. Louis BBQ and ...
3,2845635,"More than Q, is just as advertised. More than ..."
4,3033355,I have to agree with the others. I purchased t...


In [11]:
# Shuffle, then split both frames in a single call so that the feature table
# and the text table are partitioned together: 80% train, the rest test.
(
    train_main_df,
    test_main_df,
    train_text_df,
    test_text_df,
) = train_test_split(
    main_df,
    text_df,
    train_size=0.8,
    random_state=RANDOM_SEED,
)

In [12]:
# Share of the full frame in each split; should be about 0.8 and 0.2.
for train_part, test_part in (
    (train_main_df, test_main_df),
    (train_text_df, test_text_df),
):
    print(len(train_part) / len(main_df), len(test_part) / len(main_df))

# The feature and text frames must hold the same rows in the same order.
for feature_part, text_part in (
    (train_main_df, train_text_df),
    (test_main_df, test_text_df),
):
    print((feature_part.index == text_part.index).all())  # want: True

# No row may appear in both train and test.
print(np.intersect1d(train_main_df.index, test_main_df.index).size == 0)  # want: True

0.8 0.2
0.8 0.2
True
True
True


In [13]:
# Save the four splits as parquet files (index dropped) under READY_DATA_DIR.
# The directory is created first: to_parquet does not create missing parent
# folders, so a fresh checkout would otherwise fail at this final step.
READY_DATA_DIR.mkdir(parents=True, exist_ok=True)

outputs = {
    "train_main.parquet.snappy": train_main_df,
    "train_text.parquet.snappy": train_text_df,
    "test_main.parquet.snappy": test_main_df,
    "test_text.parquet.snappy": test_text_df,
}
for file_name, frame in outputs.items():
    frame.to_parquet(READY_DATA_DIR/file_name, index=False)