# Importing Yelp Dataset

Author(s): Brian Lin, Yuqi Jiao (Anthony)

This notebook produces a reduced dataset of 100,000 records with 18 features, split into train and test files.

In [1]:
from pathlib import Path
from datetime import time, datetime, date
import json
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Directory layout, resolved relative to the directory the notebook runs from.
CWD = Path.cwd()
ROOT = CWD.parent
# intermediate (processed) data sits next to the notebook
PROC_PATH = CWD / "processed_data"
# experiment-ready output files sit one level up
READY_DATA_DIR = ROOT / "ready_data"
# SQLite file queried for the additional features
DB_PATH = ROOT / "database/YelpData.db"

# fixed seed, reused for sampling and for the train/test split
RANDOM_SEED = 2

Extract the raw columns for the additional features (4 derived, 3 taken directly) from the database

In [3]:
# One row per review, carrying the user and business columns needed to build
# the additional features.
# WHERE clause: b.is_open<>0 keeps reviews of open businesses only (reviews
# without a matching business row are dropped too, since NULL<>0 is not true);
# r.useful<>0 drops reviews with zero useful votes.
joinStr = '''
SELECT
    r.id AS r_id,
    u.review_count AS u_review_count,
    u.compliment_hot AS u_comp_hot,
    u.compliment_more AS u_comp_more,
    u.compliment_profile AS u_comp_profile,
    u.compliment_cute AS u_comp_cute,
    u.compliment_list AS u_comp_list,
    u.compliment_note AS u_comp_note,
    u.compliment_plain AS u_comp_plain,
    u.compliment_cool AS u_comp_cool,
    u.compliment_funny AS u_comp_funny,
    u.compliment_writer AS u_comp_writer,
    u.compliment_photos AS u_comp_photos,
    u.elite AS u_elite,
    b.hours AS b_hours,
    u.fans AS u_fans,
    u.average_stars as u_avg_stars,
    u.useful as u_give_useful
FROM review AS r
LEFT JOIN business AS b
ON r.business_id=b.business_id
LEFT JOIN user AS u
ON r.user_id=u.user_id
WHERE b.is_open<>0 AND r.useful<>0
'''
conn = sqlite3.connect(DB_PATH)
try:
    # No parse_dates: the query selects no date columns (the previously listed
    # 'r_date' and 'u_yelping_since' are not part of this result set).
    data = pd.read_sql(joinStr, conn)
finally:
    # always release the database handle, even if the query fails
    conn.close()

Obtain user's average compliments per review

In [4]:
# Per-type compliment counts received by the user (column aliases from the SQL extract).
COMPLIMENT_COLS = [
    "u_comp_hot", "u_comp_more", "u_comp_profile", "u_comp_cute",
    "u_comp_list", "u_comp_note", "u_comp_plain", "u_comp_cool",
    "u_comp_funny", "u_comp_writer", "u_comp_photos",
]

# Exclude zero review count records (they would divide by zero below).
# .copy() makes the filtered frame independent, so adding columns to it is
# never an ambiguous write into a slice of the original frame.
data = data[data.u_review_count != 0].copy()

# Total compliments divided by number of reviews. skipna=False keeps the
# semantics of chained "+": a missing count makes the whole total missing.
data["u_comp_avg"] = (
    data[COMPLIMENT_COLS].sum(axis=1, skipna=False) / data.u_review_count
)

Obtain number of years user was elite

In [5]:
# The source data stores the year 2020 as "20,20". Stitch it back together
# first, otherwise every 2020 would be counted as two separate years.
# Case 1: the split 2020 is followed by another year (trailing comma present).
data["u_elite"] = data["u_elite"].str.replace("20,20,", "2020,", regex=True)
# Case 2: the split 2020 is the last (or only) entry in the list.
data["u_elite"] = data["u_elite"].str.replace("20,20$", "2020", regex=True)

# Years are comma separated, so number of years = number of commas + 1.
# str.count yields NaN for missing (None) entries; NaN + 1 stays NaN and is
# then mapped to 0 elite years before converting float -> int.
# NOTE(review): an empty string (rather than None) has zero commas and would
# count as one year - confirm the database stores "never elite" as NULL.
u_n_elite_yrs = (
    data["u_elite"].str.count(",")
    .add(1)
    .fillna(0)
    .astype(int)
)
data["u_n_elite_yrs"] = u_n_elite_yrs

Obtain number of days open per week

In [6]:
def count_days_wk(hours_str):
    """Count the days per week for which a business lists opening hours.

    Parameters
    ----------
    hours_str : str or None
        JSON object mapping a day name to an hours string,
        e.g. '{"Monday": "8:30-17:30", "Sunday": "0:0-0:0"}'.

    Returns
    -------
    int or float
        Number of entries in the JSON object, or ``np.nan`` when the hours
        are missing (None, NaN/NA, or an empty string).
    """
    # Missing hours may arrive as None (SQL NULL), as float NaN / pd.NA after
    # pandas operations, or as an empty string. NaN is truthy, so a plain
    # truthiness test would pass it on to json.loads and raise TypeError.
    if pd.isna(hours_str) or not hours_str:
        return np.nan
    return len(json.loads(hours_str))

# one value per review row: number of days listed in the business's hours JSON
data["b_days_open_wk"] = data["b_hours"].apply(count_days_wk)

Obtain number of hours open per week

In [7]:
# test = '''{"Monday": "8:30-17:30", "Tuesday": "17:0-2:0", "Sunday": "0:0-0:0"}'''
def count_hours_wk(hours_str):
    """Sum the weekly opening hours listed for a business.

    Parameters
    ----------
    hours_str : str or None
        JSON object mapping a day name to "H:M-H:M", e.g.
        '{"Monday": "8:30-17:30", "Tuesday": "17:0-2:0", "Sunday": "0:0-0:0"}'
        (9 + 9 + 24 = 42 hours).

    Returns
    -------
    int or float
        Total hours over all listed days, or ``np.nan`` when the hours are
        missing (None, NaN/NA, or an empty string).

    Raises
    ------
    ValueError
        If an entry is not of the form "H:M-H:M" or a time is out of range.
    """
    # Missing hours may arrive as None (SQL NULL), as float NaN / pd.NA after
    # pandas operations, or as an empty string. NaN is truthy, so a plain
    # truthiness test would pass it on to json.loads and raise TypeError.
    if pd.isna(hours_str) or not hours_str:
        return np.nan

    total_hrs = 0
    for day_hrs in json.loads(hours_str).values():
        # opening/closing times are expected as x:0 or x:30 with x in [0, 23]
        begin, end = day_hrs.split("-")
        if begin == end:
            # identical open and close (e.g. 0:0-0:0) indicates 24hrs
            total_hrs += 24
            continue

        b_hr, b_m = begin.split(":")
        e_hr, e_m = end.split(":")
        opens = datetime.combine(date.min, time(hour=int(b_hr), minute=int(b_m)))
        closes = datetime.combine(date.min, time(hour=int(e_hr), minute=int(e_m)))

        # A span crossing midnight is negative; timedelta normalises it to
        # days=-1 plus a positive .seconds, which is the wrapped duration.
        total_hrs += (closes - opens).seconds / 3600

    return total_hrs

# one value per review row: total weekly hours from the business's hours JSON
data["b_hours_open_wk"] = data["b_hours"].apply(count_hours_wk)

Load in other data files for joining

In [8]:
# Previously processed tables; each carries an r_id column used for joining.
main_df = pd.read_parquet(PROC_PATH / "joined.parquet.snappy")
linguistic = pd.read_parquet(PROC_PATH / "joined_linguistic_extra.parquet.snappy")
# Only the language flag is needed from this file. r_id is cast to int,
# presumably so it compares cleanly with main_df.r_id - TODO confirm dtype.
is_english = pd.read_parquet(
    PROC_PATH / "joined_data_lang_detected.parquet",
    columns=["r_id", "is_english"],
).astype({"r_id": int})

Exclude invalid data

In [9]:
# Collect the r_id of every record that must be dropped.
# 1) language flag is non-zero (presumably 0 marks English - TODO confirm)
to_exclude = set(is_english.loc[is_english.is_english != 0, "r_id"])
# 2) fewer than one useful vote (zero or negative)
to_exclude |= set(main_df.loc[main_df.r_useful < 1, "r_id"])
# 3) review dated on or before the day the user joined
to_exclude |= set(main_df.loc[main_df.r_date <= main_df.u_yelping_since, "r_id"])

print(f"excluding: {len(to_exclude)} records")
main_df = main_df[~main_df.r_id.isin(to_exclude)]

excluding: 872 records


Join data files

In [10]:
# Attach the linguistic features. Inner join on r_id; validate="1:1" raises
# if a review id is duplicated on either side.
main_df = main_df.merge(
    linguistic,
    how="inner",
    on="r_id",
    validate="1:1",
)
# kept rows plus excluded ids should account for every linguistic row
print(f"correct n rows: {len(main_df) + len(to_exclude) == len(linguistic)}")

correct n rows: True


In [11]:
# Keep only the join key and the seven features built from the DB extract;
# the raw compliment / elite / hours columns are no longer needed.
data = data.loc[:, [
    "r_id",
    "u_comp_avg",
    "u_n_elite_yrs",
    "b_days_open_wk",
    "b_hours_open_wk",
    "u_fans",
    "u_avg_stars",
    "u_give_useful",
]]
# inner join on r_id; validate="1:1" raises on duplicated review ids
main_df = main_df.merge(data, how="inner", on="r_id", validate="1:1")

In [12]:
# Elapsed months between account creation and the review's post date.
# A month is not a fixed duration, and recent pandas versions refuse the
# ambiguous np.timedelta64(1, 'M') unit. Use the explicit mean Gregorian month
# that unit stood for instead: 365.2425 days / 12 = 2,629,746 seconds
# (30.436875 days), so the resulting values are unchanged.
MEAN_MONTH = pd.Timedelta(seconds=2629746)
u_month_age = (main_df.r_date - main_df.u_yelping_since) / MEAN_MONTH

main_df["u_month_age"] = u_month_age

In [13]:
# Downsample to 100,000 randomly chosen rows (seeded for reproducibility);
# ignore_index=True renumbers the result 0..n-1.
main_df = main_df.sample(
    n=100_000,
    random_state=RANDOM_SEED,
    ignore_index=True,
)

In [14]:
# Keep the raw review text (with its id and the r_useful label) in a separate frame ...
text_df = main_df.loc[:, ["r_id", "r_useful", "r_text"]]
# ... then drop ids, dates, text and the unused vote counts from the feature table.
main_df = main_df.drop(
    columns=[
        "b_id",
        "r_funny",
        "r_cool",
        "u_id",
        "r_date",
        "r_text",
        "u_yelping_since",
    ]
)

In [15]:
# Make column order same as Lee et. al: id, review basics, user features,
# business features, linguistic scores, then the r_useful label last.
col_order = [
    "r_id",
    # review
    "r_stars", "r_stars_square", "r_length",
    # user
    "u_friends_count", "u_review_count", "u_month_age", "u_comp_avg",
    "u_n_elite_yrs", "u_fans", "u_avg_stars", "u_give_useful",
    # business
    "b_stars", "b_review_count", "b_days_open_wk", "b_hours_open_wk",
    # linguistic
    "r_sen", "r_sub", "r_rea",
    # label
    "r_useful",
]
main_df = main_df[col_order]
main_df.head()

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,u_avg_stars,u_give_useful,b_stars,b_review_count,b_days_open_wk,b_hours_open_wk,r_sen,r_sub,r_rea,r_useful
0,2807297,1,1,50,3,96,33.603302,0.041667,0,0,2.47,103,2.5,15,6.0,37.5,0.026157,0.464815,86.91,6
1,4363965,1,1,34,1,13,39.026553,0.0,0,0,3.64,9,3.0,16,,,-0.216667,0.516667,76.93,2
2,758206,5,25,20,8,174,47.636438,0.063218,0,0,3.91,127,4.5,619,5.0,23.0,0.538519,0.583333,73.13,1
3,5539588,4,16,144,238,363,50.347358,0.176309,5,38,3.69,587,4.0,466,7.0,69.0,0.209673,0.300595,95.06,2
4,2720332,4,16,47,107,244,53.719413,0.147541,7,21,4.26,190,4.5,3837,7.0,77.0,0.083333,0.366667,81.33,1


In [16]:
# preview the text frame; it was cut from the same rows as main_df, so the r_id values should match
text_df.head()

Unnamed: 0,r_id,r_useful,r_text
0,2807297,6,Not impressed. This is my second time coming i...
1,4363965,2,Worst haircut I ever got from one of the styli...
2,758206,1,The food was delicious; the waitstaff was exce...
3,5539588,2,Saw the Groupon and wanted to check it out whe...
4,2720332,1,Most of the food tastes good here. Unfortunate...


Extract and save dataset files

In [17]:
# Shuffle, then split 80% train / remainder test. Both frames go through one
# call so the feature rows and the text rows receive the same partition.
(
    train_main_df,
    test_main_df,
    train_text_df,
    test_text_df,
) = train_test_split(
    main_df,
    text_df,
    train_size=0.8,
    random_state=RANDOM_SEED,
)

In [18]:
# Sanity checks on the split.
n_total = len(main_df)
# share of rows in each part; should be about 0.8, 0.2 for both frames
print(len(train_main_df) / n_total, len(test_main_df) / n_total)
print(len(train_text_df) / n_total, len(test_text_df) / n_total)
# feature rows and text rows must stay paired index-for-index
print(np.all(train_main_df.index == train_text_df.index)) # want: TRUE
print(np.all(test_main_df.index == test_text_df.index)) # want: TRUE
# no record may appear in both train and test
print(np.intersect1d(train_main_df.index, test_main_df.index).size == 0) # want: TRUE

0.8 0.2
0.8 0.2
True
True
True


In [19]:
# save output
# Create the output directory first; to_parquet does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
READY_DATA_DIR.mkdir(parents=True, exist_ok=True)

# name fragment -> frame; file names follow 100K18F_<split>_<kind>.parquet.snappy
splits_to_save = {
    "train_main": train_main_df,
    "train_text": train_text_df,
    "test_main": test_main_df,
    "test_text": test_text_df,
}
for split_name, split_df in splits_to_save.items():
    # index=False: the shuffled row index is not written to the file
    split_df.to_parquet(
        READY_DATA_DIR / f"100K18F_{split_name}.parquet.snappy", index=False
    )

In [20]:
# display the training split of the feature table (its index is the shuffled row order from the split)
train_main_df

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,u_avg_stars,u_give_useful,b_stars,b_review_count,b_days_open_wk,b_hours_open_wk,r_sen,r_sub,r_rea,r_useful
34555,2235679,4,16,67,202,21,32.238014,0.047619,0,2,3.45,14,4.5,125,7.0,94.5,0.121429,0.186508,75.00,1
4442,6902682,2,4,213,85,31,94.989029,0.000000,0,0,3.48,31,3.0,871,7.0,67.0,0.067766,0.434259,79.40,1
50811,1590330,4,16,601,148,525,53.090383,0.169524,7,45,3.76,707,4.5,577,6.0,46.5,0.148929,0.360100,87.55,1
45326,6289537,2,4,46,99,163,141.812495,0.349693,3,11,3.81,218,3.0,143,7.0,78.0,0.037500,0.195833,89.55,1
57679,2112180,5,25,27,148,74,0.002077,0.716216,5,6,4.83,119,4.5,2878,6.0,68.5,0.062500,0.111111,98.72,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31019,5600012,5,25,52,253,172,28.889350,1.151163,5,23,3.57,529,4.5,80,5.0,48.0,0.202500,0.385000,75.20,2
44566,3705959,5,25,177,1,7,59.596891,0.285714,0,0,3.29,4,4.5,14,5.0,42.0,0.133959,0.312023,85.69,1
95816,2646726,3,9,434,164,471,70.677502,0.116773,4,22,3.62,716,4.5,454,6.0,64.0,0.241433,0.523121,77.16,1
72173,5823855,3,9,255,43,106,14.543292,0.433962,4,9,3.58,157,3.5,36,7.0,119.0,0.140165,0.440087,84.10,3


In [21]:
# display the test split of the feature table (its index is the shuffled row order from the split)
test_main_df

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,u_avg_stars,u_give_useful,b_stars,b_review_count,b_days_open_wk,b_hours_open_wk,r_sen,r_sub,r_rea,r_useful
6448,3136926,5,25,46,10,15,12.419571,0.000000,0,0,4.67,10,4.5,3582,6.0,24.0,0.471354,0.647917,85.18,3
50680,3972206,5,25,41,613,1507,151.900652,1.976111,6,224,4.16,4816,4.0,403,7.0,59.5,0.227143,0.421429,88.53,2
814,897833,3,9,140,8,13,0.246227,0.692308,0,0,3.23,30,3.0,83,7.0,72.0,0.196667,0.522778,72.56,1
20354,6173514,4,16,182,1691,849,30.020638,4.203769,6,251,4.20,6045,3.5,759,7.0,94.0,0.242222,0.458333,93.03,1
49224,6437162,1,1,165,82,14,5.685124,0.000000,0,1,4.27,11,1.0,14,7.0,168.0,-0.051515,0.439394,80.41,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47661,2017374,5,25,150,71,9,33.279050,0.000000,0,1,3.67,3,4.5,2160,6.0,64.0,0.289012,0.389279,81.73,1
56256,5277542,1,1,146,1,26,94.311783,0.038462,0,0,3.59,42,2.5,787,,,-0.040741,0.345370,82.65,1
92555,5002546,3,9,46,1,33,39.691142,0.090909,0,0,2.47,26,2.5,62,7.0,70.0,0.266667,0.300000,82.48,1
36328,2865497,1,1,164,1,1,3.695351,0.000000,0,0,1.00,1,2.5,1576,7.0,168.0,-0.000389,0.187668,80.51,1
