# Importing Yelp Dataset

Author(s): Brian Lin, Yuqi Jiao (Anthony)

This notebook builds a reduced dataset of 100,000 records with 35 features.

In [None]:
from pathlib import Path
from datetime import time, datetime, date
import json
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Directory layout, resolved relative to where the notebook is run from.
CWD = Path.cwd()
ROOT = CWD.parent

# processed (intermediate) data files, located under the working directory
PROC_PATH = CWD / "processed_data"
# experiment-ready output files, located next to the working directory
READY_DATA_DIR = ROOT / "ready_data"
# SQLite database file
DB_PATH = ROOT / "database/YelpData.db"

# fixed seed so that sampling and splitting are reproducible
RANDOM_SEED = 760

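Extract the raw columns for the additional features from the database (four derived features: average compliments per review, elite years, days open and hours open per week; plus the user's fans, average stars and useful votes)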

In [None]:
# Pull, for every review, the user and business columns that the additional
# features are derived from.
# The WHERE clause removes reviews of closed businesses and reviews with no
# useful votes. Because it filters on b.is_open, reviews without a matching
# business row (NULL after the LEFT JOIN) are removed as well.
joinStr = '''
SELECT
    r.id AS r_id,
    u.review_count AS u_review_count,
    u.compliment_hot AS u_comp_hot,
    u.compliment_more AS u_comp_more,
    u.compliment_profile AS u_comp_profile,
    u.compliment_cute AS u_comp_cute,
    u.compliment_list AS u_comp_list,
    u.compliment_note AS u_comp_note,
    u.compliment_plain AS u_comp_plain,
    u.compliment_cool AS u_comp_cool,
    u.compliment_funny AS u_comp_funny,
    u.compliment_writer AS u_comp_writer,
    u.compliment_photos AS u_comp_photos,
    u.elite AS u_elite,
    b.hours AS b_hours,
    u.fans AS u_fans,
    u.average_stars as u_avg_stars,
    u.useful as u_give_useful
FROM review AS r
LEFT JOIN business AS b
ON r.business_id=b.business_id
LEFT JOIN user AS u
ON r.user_id=u.user_id
WHERE b.is_open<>0 AND r.useful<>0
'''
conn = sqlite3.connect(DB_PATH)
try:
    # No parse_dates argument: the query does not select any date column
    # (the previously listed r_date / u_yelping_since are not in the result).
    data = pd.read_sql(joinStr, conn)
finally:
    # always release the database handle, even when the query fails
    conn.close()

Obtain user's average compliments per review

In [None]:
# Average number of compliments received per review written, for each row's user.
compliment_cols = [
    "u_comp_hot", "u_comp_more", "u_comp_profile", "u_comp_cute",
    "u_comp_list", "u_comp_note", "u_comp_plain", "u_comp_cool",
    "u_comp_funny", "u_comp_writer", "u_comp_photos",
]

# Exclude records with a zero review count (the divisor below).
# .copy() makes the filtered frame independent of the original, so adding
# columns to it does not raise pandas' SettingWithCopyWarning.
data = data[data["u_review_count"] != 0].copy()

# skipna=False matches a chain of "+": a missing count in any column gives NaN.
data["u_comp_avg"] = (
    data[compliment_cols].sum(axis=1, skipna=False) / data["u_review_count"]
)

Obtain number of years user was elite

In [None]:
# Error in data: "2020" has been split into "20,20" throughout. Rectify this.
# First pattern: a split 2020 that is followed by another year.
data["u_elite"] = data["u_elite"].str.replace(pat="20,20,", repl="2020,", regex=True)
# Second pattern: a split 2020 that is the only year or the last year.
data["u_elite"] = data["u_elite"].str.replace(pat="20,20$", repl="2020", regex=True)

# Years are comma separated, so a list of k years contains k - 1 commas.
# str.count returns NaN for missing (None) rows.
n_commas = data["u_elite"].str.count(",")
u_n_elite_yrs = n_commas + 1

# An empty (or whitespace-only) string lists no years at all; without this
# step it would be counted as a single year.
has_no_years = data["u_elite"].str.strip().eq("")
u_n_elite_yrs = u_n_elite_yrs.mask(has_no_years, 0)

# Missing value means no elite years; then convert from float to int.
u_n_elite_yrs = u_n_elite_yrs.fillna(0).astype(int)
data["u_n_elite_yrs"] = u_n_elite_yrs

Obtain number of days open per week

In [None]:
def count_days_wk(hours_str):
    """Return the number of days per week for which opening hours are listed.

    Parameters
    ----------
    hours_str : str or None
        JSON object mapping a day name to an "open-close" string,
        e.g. '{"Monday": "8:30-17:30", "Sunday": "0:0-0:0"}'.

    Returns
    -------
    int or float
        Number of days in the object, or ``np.nan`` when the hours are
        missing (None, NaN, empty string) or the JSON is not an object.
    """
    # NaN is truthy, so check the type as well as emptiness before parsing.
    if not isinstance(hours_str, (str, bytes, bytearray)) or not hours_str:
        return np.nan
    hours = json.loads(hours_str)
    # e.g. a JSON "null" carries no opening hours
    if not isinstance(hours, dict):
        return np.nan
    return len(hours)

# one value per record: days per week with listed opening hours (NaN if unknown)
data["b_days_open_wk"] = data["b_hours"].apply(count_days_wk)

Obtain number of hours open per week

In [None]:
def count_hours_wk(hours_str):
    """Return the total number of hours per week a business is open.

    Parameters
    ----------
    hours_str : str or None
        JSON object mapping a day name to an "open-close" string with
        "hour:minute" times, e.g.
        '{"Monday": "8:30-17:30", "Tuesday": "17:0-2:0", "Sunday": "0:0-0:0"}'
        which yields 9 + 9 + 24 = 42.0 hours.

    Returns
    -------
    int or float
        Sum of the daily opening spans in hours, or ``np.nan`` when the hours
        are missing (None, NaN, empty string) or the JSON is not an object.
    """
    # NaN is truthy, so check the type as well as emptiness before parsing.
    if not isinstance(hours_str, (str, bytes, bytearray)) or not hours_str:
        return np.nan
    hours = json.loads(hours_str)
    # e.g. a JSON "null" carries no opening hours
    if not isinstance(hours, dict):
        return np.nan

    total_hrs = 0
    for day_hrs in hours.values():
        begin, end = day_hrs.split("-")
        # identical open and close strings (e.g. 0:0-0:0) indicate 24 hours
        if begin == end:
            total_hrs += 24
            continue

        b_hr, b_m = begin.split(":")
        e_hr, e_m = end.split(":")
        opens = datetime.combine(date.min, time(hour=int(b_hr), minute=int(b_m)))
        closes = datetime.combine(date.min, time(hour=int(e_hr), minute=int(e_m)))

        # A span that crosses midnight is a negative timedelta, normalised to
        # days=-1 plus a positive .seconds, which is the correct opening span.
        total_hrs += (closes - opens).seconds / 3600

    return total_hrs

# one value per record: total listed opening hours per week (NaN if unknown)
data["b_hours_open_wk"] = data["b_hours"].apply(count_hours_wk)

Load in other data files for joining

In [None]:
# Feature tables stored as parquet files in the processed-data directory.
main_df = pd.read_parquet(PROC_PATH / "joined.parquet.snappy")
linguistic = pd.read_parquet(PROC_PATH / "joined_linguistic_extra.parquet.snappy")
new_nlp = pd.read_parquet(PROC_PATH / "newnlp.parquet.snappy")

# Language-detection result: only the id and the flag are needed.
# r_id is cast to int, presumably to match the r_id dtype of main_df (verify).
is_english = pd.read_parquet(
    PROC_PATH / "joined_data_lang_detected.parquet",
    columns=["r_id", "is_english"],
).astype({"r_id": int})

Exclude invalid data

In [None]:
# Reviews flagged by language detection: every non-zero is_english value is excluded.
# NOTE(review): the original comment says non-English reviews carry the value 1,
# despite the column name - confirm the encoding of this flag.
flagged_language = is_english.loc[is_english["is_english"] != 0, "r_id"]

# Reviews with fewer than one useful vote (zero or negative).
too_few_useful = main_df.loc[main_df["r_useful"] < 1, "r_id"]

# Reviews dated on or before the date the user joined.
predates_account = main_df.loc[main_df["r_date"] <= main_df["u_yelping_since"], "r_id"]

to_exclude = set(flagged_language) | set(too_few_useful) | set(predates_account)

print(f"excluding: {len(to_exclude)} records")
main_df = main_df[~main_df["r_id"].isin(to_exclude)]

Join data files

In [None]:
# Join in sentiment, subjectivity, readability. join on r_id
main_df = main_df.merge(linguistic, on="r_id", validate="1:1")
print(f"correct n rows: {main_df.shape[0] + len(to_exclude) == linguistic.shape[0]}")

# Join in other NLP numeric features
new_nlp = new_nlp.drop(columns=["r_useful", "r_text"]) # remove redundant
main_df = main_df.merge(new_nlp, on="r_id", validate="1:1")

In [None]:
# Remove unnecessary columns
data = data[["r_id", "u_comp_avg", "u_n_elite_yrs", "b_days_open_wk", "b_hours_open_wk", "u_fans", "u_avg_stars", "u_give_useful"]]
main_df = main_df.merge(data, on="r_id", validate="1:1")

In [None]:
# calculate elapsed month since creating account, relative to review post date
u_month_age = (main_df.r_date - main_df.u_yelping_since) / np.timedelta64(1, 'M')

main_df["u_month_age"] = u_month_age

In [None]:
# Reduce dataset size
main_df = main_df.sample(n=100000, random_state=RANDOM_SEED, ignore_index=True)

In [None]:
# remove unneeded cols
text_df = main_df[["r_id", "r_useful", "r_text"]]
main_df = main_df.drop(columns=["b_id", "r_funny", "r_cool", "u_id", "r_date", "r_text", "u_yelping_since"])

In [None]:
# Order features by category. r_id first, r_useful last.
col_order = ["r_id", "r_stars", "r_stars_square", "r_length",
    "u_friends_count", "u_review_count", "u_month_age", "u_comp_avg", "u_n_elite_yrs", "u_fans", "u_avg_stars", "u_give_useful",
    "b_stars", "b_review_count", "b_days_open_wk", "b_hours_open_wk",
    "r_sen", "r_sub", "r_rea",
    'r_word_cnt', 'r_character_cnt', 'r_sent_cnt', 'r_unique_word_cnt',
    'r_stopword_cnt', 'r_avg_wordlength', 'r_avg_sentlength', 'r_unique/words',
    'r_stopwords/words','r_digit_cnt', 'r_noun_cnt', 'r_Adj_cnt', 'r_Adv_cnt',
    'r_capital_word_cnt', 'r_quoted_word_cnt', 'r_hashtag_cnt', 'r_exclam_cnt',
    "r_useful"]
main_df = main_df[col_order]
main_df.head()

In [None]:
# preview the first rows of the text frame (r_id, r_useful, r_text)
text_df.head()

Extract and save dataset files

In [None]:
# split df into train and remainder. Shuffles before split
train_main_df, rem_main_df, train_text_df, rem_text_df = train_test_split(
    main_df, text_df, train_size=0.8, random_state=RANDOM_SEED)

# split remainder into val and test. Shuffles before split
val_main_df, test_main_df, val_text_df, test_text_df = train_test_split(
    rem_main_df, rem_text_df, train_size=0.5, random_state=RANDOM_SEED)

In [None]:
# validate created splits proportions. should be about 0.8, 0.1, 0.1
print(len(train_main_df) / len(main_df), len(val_main_df) / len(main_df), len(test_main_df) / len(main_df))
print(len(train_text_df) / len(main_df), len(val_text_df) / len(main_df), len(test_text_df) / len(main_df))
# check records align in main df and text
print(np.all(train_main_df.index == train_text_df.index)) # want: TRUE
print(np.all(test_main_df.index == test_text_df.index)) # want: TRUE
print(np.all(val_main_df.index == val_text_df.index)) # want: TRUE
# check uniqueness of records
print(len(np.intersect1d(train_main_df.index, test_main_df.index)) == 0) # want: TRUE
print(len(np.intersect1d(train_main_df.index, val_main_df.index)) == 0) # want: TRUE
print(len(np.intersect1d(val_main_df.index, test_main_df.index)) == 0) # want: TRUE

In [None]:
# save output
train_main_df.to_parquet(READY_DATA_DIR/"100K35F_train_main.parquet.snappy", index=False)
train_text_df.to_parquet(READY_DATA_DIR/"100K35F_train_text.parquet.snappy", index=False)

val_main_df.to_parquet(READY_DATA_DIR/"100K35F_val_main.parquet.snappy", index=False)
val_text_df.to_parquet(READY_DATA_DIR/"100K35F_val_text.parquet.snappy", index=False)

test_main_df.to_parquet(READY_DATA_DIR/"100K35F_test_main.parquet.snappy", index=False)
test_text_df.to_parquet(READY_DATA_DIR/"100K35F_test_text.parquet.snappy", index=False)

In [None]:
# display the training split of the feature frame
train_main_df

In [None]:
# display the validation split of the feature frame
val_main_df

In [None]:
# display the test split of the feature frame
test_main_df