# Importing Yelp Dataset

Author(s): Brian Lin, Yuqi Jiao (Anthony)

This notebook produces a reduced dataset with 100,000 records and 35 features.

In [1]:
from pathlib import Path
from datetime import time, datetime, date
import json
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# All locations are resolved relative to the current working directory.
CWD = Path.cwd()
ROOT = CWD.parent

# processed (intermediate) data lives next to the notebook
PROC_PATH = CWD / "processed_data"
# experiment-ready data and the SQLite database live one level up
READY_DATA_DIR = ROOT / "ready_data"
DB_PATH = ROOT / "database/YelpData.db"

# fixed seed so sampling and splitting are reproducible
RANDOM_SEED = 760

Extract additional features from database

In [3]:
# One row per review, carrying the user and business attributes needed for the
# extra features. The WHERE clause removes reviews of closed businesses
# (b.is_open = 0) and reviews with zero useful votes (r.useful = 0).
joinStr = '''
SELECT
    r.id AS r_id,
    u.review_count AS u_review_count,
    u.compliment_hot AS u_comp_hot,
    u.compliment_more AS u_comp_more,
    u.compliment_profile AS u_comp_profile,
    u.compliment_cute AS u_comp_cute,
    u.compliment_list AS u_comp_list,
    u.compliment_note AS u_comp_note,
    u.compliment_plain AS u_comp_plain,
    u.compliment_cool AS u_comp_cool,
    u.compliment_funny AS u_comp_funny,
    u.compliment_writer AS u_comp_writer,
    u.compliment_photos AS u_comp_photos,
    u.elite AS u_elite,
    b.hours AS b_hours,
    u.fans AS u_fans,
    u.average_stars as u_avg_stars,
    u.useful as u_give_useful
FROM review AS r
LEFT JOIN business AS b
ON r.business_id=b.business_id
LEFT JOIN user AS u
ON r.user_id=u.user_id
WHERE b.is_open<>0 AND r.useful<>0
'''
conn = sqlite3.connect(DB_PATH)
try:
    # No parse_dates: the query selects no date columns (the previous
    # 'r_date' / 'u_yelping_since' entries matched nothing in the result).
    data = pd.read_sql(joinStr, conn)
finally:
    # always release the database handle, even if the query fails
    conn.close()

Obtain user's average compliments per review

In [4]:
# Drop users with a zero review count so the division below never divides by zero.
# .copy() makes the filtered frame independent, so adding a column to it does not
# write into a slice of the query result (avoids SettingWithCopyWarning).
data = data[data.u_review_count != 0].copy()

compliment_cols = [
    "u_comp_hot", "u_comp_more", "u_comp_profile", "u_comp_cute",
    "u_comp_list", "u_comp_note", "u_comp_plain", "u_comp_cool",
    "u_comp_funny", "u_comp_writer", "u_comp_photos",
]
# Total compliments received divided by number of reviews written.
# skipna=False keeps the behaviour of chained `+`: a missing count yields NaN.
data["u_comp_avg"] = data[compliment_cols].sum(axis=1, skipna=False) / data.u_review_count

Obtain number of years user was elite

In [5]:
# Error in the data: "2020" has been split into "20,20" throughout. Rectify this.
# The lookahead matches a split 2020 that is followed by another year ("20,20,")
# as well as one that is the last/only year ("20,20" at end of string).
elite_years = data.u_elite.str.replace(pat=r"20,20(?=,|$)", repl="2020", regex=True)
data["u_elite"] = elite_years

# Years are comma separated, so a non-empty value holds (number of commas + 1) years.
# Missing values (None/NaN) and empty strings both mean the user was never elite;
# for those rows the length test is False and the count is set to 0.
u_n_elite_yrs = (elite_years.str.count(",") + 1).where(elite_years.str.len() > 0, 0)
# no missing values remain, so the counts can be stored as integers
u_n_elite_yrs = u_n_elite_yrs.astype(int)
data["u_n_elite_yrs"] = u_n_elite_yrs

Obtain number of days open per week

In [6]:
def count_days_wk(hours_str):
    """Return how many days are listed in a JSON-encoded opening-hours object.

    hours_str: JSON text of an object keyed by day, or a falsy value
        (None / empty string) when no hours are recorded.
    Returns the number of keys in the decoded object, or NaN for a falsy input.
    """
    if not hours_str:
        return np.nan
    schedule = json.loads(hours_str)
    return len(schedule.keys())

# one value per review: number of days listed in the business's hours (NaN if none recorded)
data["b_days_open_wk"] = data["b_hours"].apply(count_days_wk)

Obtain number of hours open per week

In [7]:
# test = '''{"Monday": "8:30-17:30", "Tuesday": "17:0-2:0", "Sunday": "0:0-0:0"}'''
def count_hours_wk(hours_str):
    """Return the total opening hours per week from a JSON-encoded hours object.

    hours_str: JSON text mapping each day to an "H:M-H:M" range, e.g.
        '{"Monday": "8:30-17:30", "Tuesday": "17:0-2:0", "Sunday": "0:0-0:0"}',
        or a falsy value (None / empty string) when no hours are recorded.
    Returns the summed hours over all listed days, or NaN for a falsy input.
    A range whose opening and closing strings are identical (e.g. "0:0-0:0")
    counts as a full 24 hours.
    """
    if not hours_str:
        return np.nan

    def _on_reference_day(clock):
        # "H:M" -> datetime on a fixed reference date so two times can be subtracted
        hr, minute = clock.split(":")
        return datetime.combine(date.min, time(hour=int(hr), minute=int(minute)))

    weekly_total = 0
    for span in json.loads(hours_str).values():
        opens, closes = span.split("-")
        if opens == closes:
            weekly_total += 24
            continue
        opened_at = _on_reference_day(opens)
        closed_at = _on_reference_day(closes)
        # A range crossing midnight gives a negative timedelta (-1 day + remainder);
        # its .seconds field is the remainder, i.e. the actual time open.
        weekly_total += (closed_at - opened_at).seconds / 3600
    return weekly_total

# one value per review: total weekly opening hours of the business (NaN if none recorded)
data["b_hours_open_wk"] = data["b_hours"].apply(count_hours_wk)

Load in other data files for joining

In [8]:
# base table that the other files are joined onto
main_df = pd.read_parquet(PROC_PATH/"joined.parquet.snappy")

# language-detection flags; only the key and the flag are read, key cast to int
is_english = pd.read_parquet(
    PROC_PATH/"joined_data_lang_detected.parquet",
    columns=["r_id", "is_english"],
).astype({"r_id": int})

# additional per-review feature tables, joined on r_id further down
linguistic = pd.read_parquet(PROC_PATH/"joined_linguistic_extra.parquet.snappy")
new_nlp = pd.read_parquet(PROC_PATH/"newnlp.parquet.snappy")

Exclude invalid data

In [9]:
# r_ids with a non-zero language flag (per the original note, these are the non-English reviews)
flagged_language_ids = is_english.loc[is_english.is_english != 0, "r_id"]
# r_ids of reviews with fewer than one useful vote (zero or negative)
no_useful_ids = main_df.loc[main_df.r_useful < 1, "r_id"]
# r_ids of reviews dated on or before the user's yelping_since date
predates_account_ids = main_df.loc[main_df.r_date <= main_df.u_yelping_since, "r_id"]

to_exclude = set(flagged_language_ids) | set(no_useful_ids) | set(predates_account_ids)

print(f"excluding: {len(to_exclude)} records")
main_df = main_df[~main_df.r_id.isin(to_exclude)]

excluding: 872 records


Obtain user account age at time of review posting

In [10]:
# Elapsed months between account creation and the review's post date.
# A month has no fixed length, and recent pandas releases (2.x) reject
# np.timedelta64(1, 'M') as an ambiguous unit. Use the mean Gregorian month
# explicitly instead: 365.2425 days / 12 = 2,629,746 seconds.
MEAN_MONTH = pd.Timedelta(seconds=2_629_746)
u_month_age = (main_df.r_date - main_df.u_yelping_since) / MEAN_MONTH

# assign() returns a new frame, so nothing is written into the filtered slice
main_df = main_df.assign(u_month_age=u_month_age)

Join data files

In [11]:
# Attach sentiment, subjectivity and readability; one row per r_id on both sides.
main_df = main_df.merge(linguistic, on="r_id", validate="1:1")
# sanity check: rows kept plus rows excluded should equal the linguistic table's size
rows_add_up = len(main_df) + len(to_exclude) == len(linguistic)
print(f"correct n rows: {rows_add_up}")

# Attach the remaining numeric NLP features, minus the columns main_df already has.
redundant_cols = ["r_useful", "r_text"]
new_nlp = new_nlp.drop(columns=redundant_cols)
main_df = main_df.merge(new_nlp, on="r_id", validate="1:1")

correct n rows: True


In [12]:
# Keep only the key and the derived/selected features from the database extract,
# then attach them to the main table (one row per r_id on both sides).
db_feature_cols = [
    "r_id", "u_comp_avg", "u_n_elite_yrs", "b_days_open_wk",
    "b_hours_open_wk", "u_fans", "u_avg_stars", "u_give_useful",
]
data = data[db_feature_cols]
main_df = main_df.merge(data, on="r_id", validate="1:1")

In [13]:
# Reduce the dataset to a seeded random sample of 100,000 rows; the index is reset to 0..n-1.
main_df = main_df.sample(
    n=100_000,
    random_state=RANDOM_SEED,
    ignore_index=True,
)

In [14]:
# Keep the raw review text (with its key and target) in a separate frame...
text_df = main_df.loc[:, ["r_id", "r_useful", "r_text"]]

# ...then drop identifier, date, text and unused vote columns from the feature frame.
unneeded_cols = ["b_id", "r_funny", "r_cool", "u_id", "r_date", "r_text", "u_yelping_since"]
main_df = main_df.drop(columns=unneeded_cols)

In [15]:
# Order features by category: r_id first, then review, user, business,
# linguistic and NLP-count features, with the target r_useful last.
review_cols = ["r_stars", "r_stars_square", "r_length"]
user_cols = [
    "u_friends_count", "u_review_count", "u_month_age", "u_comp_avg",
    "u_n_elite_yrs", "u_fans", "u_avg_stars", "u_give_useful",
]
business_cols = ["b_stars", "b_review_count", "b_days_open_wk", "b_hours_open_wk"]
linguistic_cols = ["r_sen", "r_sub", "r_rea"]
nlp_cols = [
    "r_word_cnt", "r_character_cnt", "r_sent_cnt", "r_unique_word_cnt",
    "r_stopword_cnt", "r_avg_wordlength", "r_avg_sentlength", "r_unique/words",
    "r_stopwords/words", "r_digit_cnt", "r_noun_cnt", "r_Adj_cnt", "r_Adv_cnt",
    "r_capital_word_cnt", "r_quoted_word_cnt", "r_hashtag_cnt", "r_exclam_cnt",
]
col_order = [
    "r_id",
    *review_cols,
    *user_cols,
    *business_cols,
    *linguistic_cols,
    *nlp_cols,
    "r_useful",
]
main_df = main_df[col_order]
main_df.head()

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,...,r_stopwords/words,r_digit_cnt,r_noun_cnt,r_Adj_cnt,r_Adv_cnt,r_capital_word_cnt,r_quoted_word_cnt,r_hashtag_cnt,r_exclam_cnt,r_useful
0,3877205,2,4,115,3,94,70.203686,0.010638,0,1,...,0.41,0,23,10,10,0,0,0,0,1
1,3150433,5,25,29,408,18,0.46562,0.111111,0,2,...,0.48,0,4,1,2,0,0,0,0,3
2,3260011,1,1,276,53,18,28.351284,0.0,0,0,...,0.44,1,51,25,25,3,0,0,3,1
3,3973948,4,16,362,1787,3131,62.183969,3.066752,9,703,...,0.44,0,79,43,19,4,0,0,0,31
4,6777040,2,4,172,59,99,45.109452,0.646465,4,10,...,0.4,2,31,9,21,9,0,0,6,1


In [16]:
# preview the first rows of the text frame (r_id, r_useful, r_text)
text_df.head()

Unnamed: 0,r_id,r_useful,r_text
0,3877205,1,Probably won't return to this location. \n\nFo...
1,3150433,3,There's always something new to discover when ...
2,3260011,1,Came here years ago with a friend who got a ba...
3,3973948,31,"Robin Hood, the robbing hoodlum of Sherwood Fo..."
4,6777040,1,"Oh Polar Pop, how I love thee!!! Only 84 cents..."


Extract and save dataset files

In [17]:
# Stage 1: 80% train, 20% remainder. The feature and text frames are passed to
# the same call, so both are shuffled and split together.
(train_main_df, rem_main_df,
 train_text_df, rem_text_df) = train_test_split(
    main_df, text_df,
    train_size=0.8,
    random_state=RANDOM_SEED,
)

# Stage 2: halve the remainder into validation and test (shuffled again before splitting).
(val_main_df, test_main_df,
 val_text_df, test_text_df) = train_test_split(
    rem_main_df, rem_text_df,
    train_size=0.5,
    random_state=RANDOM_SEED,
)

In [18]:
# Split proportions relative to the full frame; should be about 0.8, 0.1, 0.1
# for both the feature frames and the text frames.
n_total = len(main_df)
for split_group in (
    (train_main_df, val_main_df, test_main_df),
    (train_text_df, val_text_df, test_text_df),
):
    print(*(len(part) / n_total for part in split_group))

# Feature and text frames of each split must hold the same rows in the same order.
for feature_part, text_part in (
    (train_main_df, train_text_df),
    (test_main_df, test_text_df),
    (val_main_df, val_text_df),
):
    print(np.all(feature_part.index == text_part.index)) # want: TRUE

# No record may appear in more than one split.
for left_part, right_part in (
    (train_main_df, test_main_df),
    (train_main_df, val_main_df),
    (val_main_df, test_main_df),
):
    print(len(np.intersect1d(left_part.index, right_part.index)) == 0) # want: TRUE

0.8 0.1 0.1
0.8 0.1 0.1
True
True
True
True
True
True


In [19]:
# display the training feature frame (rendered truncated by pandas)
train_main_df

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,...,r_stopwords/words,r_digit_cnt,r_noun_cnt,r_Adj_cnt,r_Adv_cnt,r_capital_word_cnt,r_quoted_word_cnt,r_hashtag_cnt,r_exclam_cnt,r_useful
3988,2486117,5,25,41,1,8,35.657224,0.000000,0,0,...,0.46,2,8,3,2,1,0,0,0,2
41592,3238107,5,25,64,174,247,1.024218,0.222672,5,27,...,0.35,1,13,5,2,4,0,0,2,2
16712,5581800,5,25,239,525,603,125.863690,2.844113,12,57,...,0.41,2,53,16,20,8,0,0,1,3
40385,5953680,4,16,129,130,183,129.975625,2.005464,5,26,...,0.46,0,32,12,6,2,0,0,0,4
98321,2208283,1,1,537,223,1,0.000111,1.000000,0,0,...,0.47,2,93,23,42,26,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8767,2065766,2,4,61,165,108,0.640257,0.046296,0,2,...,0.41,0,10,4,5,5,0,0,1,2
13090,4640630,4,16,74,78,467,47.038862,0.353319,6,35,...,0.45,0,18,9,2,3,0,0,1,2
48778,4218981,5,25,98,12,88,79.607721,0.295455,2,5,...,0.34,0,28,7,7,3,0,0,1,2
68005,4633311,4,16,656,173,206,41.955031,0.626214,4,13,...,0.43,2,117,39,61,35,0,0,7,6


In [20]:
# display the validation feature frame (rendered truncated by pandas)
val_main_df

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,...,r_stopwords/words,r_digit_cnt,r_noun_cnt,r_Adj_cnt,r_Adv_cnt,r_capital_word_cnt,r_quoted_word_cnt,r_hashtag_cnt,r_exclam_cnt,r_useful
19769,5941783,1,1,86,178,7,28.522784,0.000000,0,0,...,0.34,0,15,9,9,6,0,0,4,3
66629,1271743,5,25,314,55,494,9.399588,0.141700,3,16,...,0.38,2,78,36,22,5,0,0,1,1
32825,2312321,4,16,148,151,80,21.323776,1.025000,5,7,...,0.46,0,24,13,18,7,0,0,0,1
30831,5980637,3,9,96,1,38,78.190788,0.078947,0,0,...,0.40,0,20,7,1,1,0,0,0,1
19847,220850,5,25,81,68,24,87.446179,0.083333,0,0,...,0.33,0,14,9,13,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35457,6864011,3,9,32,1,18,10.865445,0.000000,0,1,...,0.38,0,5,6,3,1,0,0,0,1
65561,6231868,5,25,222,300,157,42.642670,0.382166,2,7,...,0.45,2,36,18,22,6,0,0,2,1
83844,4058392,4,16,188,227,1047,48.777820,0.187202,4,43,...,0.40,1,49,20,13,2,0,0,1,6
41832,3281763,4,16,105,427,1145,140.879457,0.356332,12,98,...,0.38,0,22,15,13,4,0,0,0,2


In [21]:
# display the test feature frame (rendered truncated by pandas)
test_main_df

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,...,r_stopwords/words,r_digit_cnt,r_noun_cnt,r_Adj_cnt,r_Adv_cnt,r_capital_word_cnt,r_quoted_word_cnt,r_hashtag_cnt,r_exclam_cnt,r_useful
9142,1340676,1,1,305,1,1,37.547287,0.000000,0,0,...,0.42,1,59,17,27,12,0,0,0,1
49932,941404,4,16,20,2,96,40.146040,0.135417,0,4,...,0.32,0,3,4,4,0,0,0,1,1
10204,1312730,1,1,285,333,24,39.063471,0.083333,0,2,...,0.49,4,40,10,28,14,0,0,0,3
26396,5155149,4,16,139,273,374,7.853903,2.077540,7,37,...,0.43,2,32,8,9,4,0,0,0,4
17206,3706898,4,16,41,1,5,11.606999,0.000000,0,0,...,0.45,1,6,5,5,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9384,4134229,5,25,95,1,12,34.229801,0.000000,0,0,...,0.48,0,15,8,8,0,0,0,0,1
6942,4059522,3,9,454,149,361,101.389702,0.883657,8,49,...,0.42,0,92,37,23,27,0,0,0,3
93190,5567999,3,9,347,352,286,92.625418,0.527972,5,36,...,0.46,2,71,26,28,10,0,0,3,2
45986,2980845,4,16,175,217,219,54.729315,1.205479,4,17,...,0.45,0,33,17,11,8,0,0,0,2


In [22]:
# check the dtype of every column in the test feature frame
test_main_df.dtypes

r_id                    int64
r_stars                 int64
r_stars_square          int64
r_length                int64
u_friends_count         int64
u_review_count          int64
u_month_age           float64
u_comp_avg            float64
u_n_elite_yrs           int64
u_fans                  int64
u_avg_stars           float64
u_give_useful           int64
b_stars               float64
b_review_count          int64
b_days_open_wk        float64
b_hours_open_wk       float64
r_sen                 float64
r_sub                 float64
r_rea                 float64
r_word_cnt              int64
r_character_cnt         int64
r_sent_cnt              int64
r_unique_word_cnt       int64
r_stopword_cnt          int64
r_avg_wordlength      float64
r_avg_sentlength      float64
r_unique/words        float64
r_stopwords/words     float64
r_digit_cnt             int64
r_noun_cnt              int64
r_Adj_cnt               int64
r_Adv_cnt               int64
r_capital_word_cnt      int64
r_quoted_w

In [23]:
# Save each split's feature frame and text frame as separate parquet files.
# Create the output directory first so a fresh checkout without ready_data/
# does not fail on the first write.
READY_DATA_DIR.mkdir(parents=True, exist_ok=True)

outputs = [
    (train_main_df, "100K35F_train_main.parquet.snappy"),
    (train_text_df, "100K35F_train_text.parquet.snappy"),
    (val_main_df, "100K35F_val_main.parquet.snappy"),
    (val_text_df, "100K35F_val_text.parquet.snappy"),
    (test_main_df, "100K35F_test_main.parquet.snappy"),
    (test_text_df, "100K35F_test_text.parquet.snappy"),
]
for frame, filename in outputs:
    # index=False: the shuffled row labels are not written to the file
    frame.to_parquet(READY_DATA_DIR/filename, index=False)