In [2]:
# Download Yelp dataset
# Run experiments

In [9]:
import folktables
import numpy as np

In [3]:
from folktables import ACSDataSource, ACSEmployment

data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["AL"], download=True)
features, label, group = ACSEmployment.df_to_numpy(acs_data)


In [10]:
np.unique(group)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

## 1. Data Cleaning

Papers that use the dataset:
 - [Character-level Convolutional Networks for Text
Classification (2016)](https://arxiv.org/abs/1509.01626)

"Two classification tasks are constructed
from this dataset â€“ one predicting full number of stars the user has given, and the other predicting a polarity label by considering stars 1 and 2 negative, and 3 and 4 positive"

Sampling methods:
 - Categorical (e.g., Chinese)
 - Time (e.g., after 2018)
 - Convenience (e.g., restaurants with 100+ reviews)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.svm import SVC

import numpy as np
import pandas as pd


In [2]:

b_pandas = []
r_dtypes = {"stars": np.float16, 
            "useful": np.int32, 
            "funny": np.int32,
            "cool": np.int32,
           }

# with open("yelp_data/yelp_academic_dataset_review.json", "r") as f:
#     df = pd.read_json(f, orient="records", lines=True, dtype=r_dtypes)


KeyboardInterrupt: 

In [31]:
with open("yelp_data/yelp_academic_dataset_business.json", "r") as f:
    biz_df = pd.read_json(f, orient="records", lines=True)

In [33]:
biz_df.shape

(150346, 14)

In [97]:
biz_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [65]:
biz100_df = biz_df[biz_df['review_count'] > 99]
chinese_df = biz_df[biz_df['categories'].apply(lambda x: 'Chinese' in x if not pd.isnull(x) else False)]
# tx_df       = biz_df[biz_df['state'] == 'TX']

In [69]:
biz100_ids = set(biz100_df['business_id'].values)
chinese_ids = set(chinese_df['business_id'].values)

In [62]:
chinese_df.shape

(3343, 14)

In [61]:
biz_100p_df.shape

(14647, 14)

In [34]:
# Use chunksize for reading reviews

reviews_df = []

with open("yelp_data/yelp_academic_dataset_review.json", "r") as f:
    reader = pd.read_json(f, orient="records", lines=True, 
                          dtype=r_dtypes, chunksize=1000)
        
    for chunk in reader:
        reduced_chunk = chunk.drop(columns=['review_id', 'user_id'])\
                             .query("`date` >= '2020-03-15'")
        reviews_df.append(reduced_chunk)
    
reviews_df = pd.concat(reviews_df, ignore_index=True)


In [63]:
reviews_df.shape

(1039948, 7)

In [39]:
reviews_df.head()

Unnamed: 0,business_id,stars,useful,funny,cool,text,date
0,S2Ho8yLxhKAa26pBAm6rxA,2.0,0,0,0,We were looking for a good creole dinning expe...,2020-03-15 17:43:53
1,LHITV6jek0oe17xNhpmKNQ,1.0,0,0,0,"i went to get a chicken sandwich, and being al...",2020-05-04 01:57:38
2,k5WBbvefZdisYbgcswiKNA,2.0,0,0,0,The most bland pizza I've ever eaten in my lif...,2020-05-03 01:28:24
3,MFt52xaxxNQ2TiRyaDJnsQ,5.0,0,0,0,Took both my pups for a way ovetdue grooming. ...,2020-05-11 01:18:16
4,PmYGC8cDOIYRV11MIFsG3g,1.0,0,0,0,This is 100% based on the old guy that's someh...,2020-03-23 22:42:16


In [70]:
%%time
# Extract reviews for other cohorts
biz100_reviews = []

with open("yelp_data/yelp_academic_dataset_review.json", "r") as f:
    reader = pd.read_json(f, orient="records", lines=True, 
                          dtype=r_dtypes, chunksize=1000)
        
    for chunk in reader:
        reduced_chunk = chunk.drop(columns=['review_id', 'user_id'])\
                             .query("`business_id` in @biz100_ids")
        biz100_reviews.append(reduced_chunk)
    
biz100_reviews = pd.concat(biz100_reviews, ignore_index=True)


In [72]:
biz100_reviews.shape

(4009374, 7)

In [73]:
%%time
# Chinese reviews
chinese_reviews = []

with open("yelp_data/yelp_academic_dataset_review.json", "r") as f:
    reader = pd.read_json(f, orient="records", lines=True, 
                          dtype=r_dtypes, chunksize=1000)
        
    for chunk in reader:
        reduced_chunk = chunk.drop(columns=['review_id', 'user_id'])\
                             .query("`business_id` in @chinese_ids")
        chinese_reviews.append(reduced_chunk)
    
chinese_reviews = pd.concat(chinese_reviews, ignore_index=True)


In [74]:
chinese_reviews.shape

(224265, 7)

In [40]:
reviews_df['stars'].value_counts()

5.0    559597
1.0    210696
4.0    132057
3.0     69695
2.0     67903
Name: stars, dtype: int64

## 2. Run Models

- N1: `reviews_df` (reviews after 2020-03)
- N2: `chinese_reviews` (Chinese restaurants)
- N3: `biz100_reviews` (reviews from businesses with over 100 reviews)

In [42]:
X = reviews_df['text'].values
y = (reviews_df['stars'] > 3).astype(int)

In [43]:
N_datapts = 1000
source = (X[:N_datapts],y[:N_datapts])
ref = (X[N_datapts:2*N_datapts],y[N_datapts:2*N_datapts])
gen = (X[N_datapts:2*N_datapts],y[N_datapts:2*N_datapts])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [92]:
def get_Xy(df):
    X = df['text'].values
    y = (df['stars'] > 3).astype(int).values
    return (X,y)

def subsample(tup,N):
    X, y = tup
    N_total = X.shape[0]
    if N > N_total:
        raise ValueError('Cannot subsample lower than data size')
        
    chosen_idx = np.random.choice(np.arange(N_total), N)
    new_tup = (X[chosen_idx], y[chosen_idx])
    return new_tup

In [93]:
source = get_Xy(reviews_df)
ref = get_Xy(chinese_reviews)
gen = get_Xy(biz100_reviews)

In [None]:
n1 = get_Xy(reviews_df)
n2 = get_Xy(chinese_reviews)
n3 = get_Xy(biz100_reviews)

In [None]:
def run_svm(source, ref, gen):
    """
    source: source X and y
    ref: reference X and y
    gen: generalizing X and y
    """
    X_s, y_s = source
    X_r, y_r = ref
    X_g, y_g = gen
    
    vec = TfidfVectorizer()
    X_vec_s = vec.fit_transform(X_s)
    X_vec_r = vec.transform(X_r)
    X_vec_g = vec.transform(X_g)
    
    # learn tf idf on the training data, report performance on ref and gen
    clf = SVC(C=1., probability=True)
    clf.fit(X_vec_s,y_s)
    
    y_s_hat = clf.predict_proba(X_vec_s)[:,1]
    y_r_hat = clf.predict_proba(X_vec_r)[:,1]
    y_g_hat = clf.predict_proba(X_vec_g)[:,1]
    
#     auc_score_ref = roc_auc_score(y_r,y_r_hat)
    acc_score_source = accuracy_score(y_s,y_s_hat > 0.5)
    acc_score_ref = accuracy_score(y_r,y_r_hat > 0.5)
    acc_score_gen = accuracy_score(y_g,y_g_hat > 0.5)
    
    return (acc_score_source,acc_score_ref,acc_score_gen)

### 2.1 Single Source

In [99]:
source_i = subsample(source, 1000)
results = run_svm(source_i, source_i, source_i)
print(results)

(0.999, 0.999, 0.999)


In [100]:
source_i = subsample(source, 10000)
results = run_svm(source_i, source_i, source_i)
print(results)

(0.9937, 0.9937, 0.9937)


In [None]:
source_i = subsample(source, 100000)
results = run_svm(source_i, source_i, source_i)
print(results)

### 2.2 Multi-Source B

In [None]:
%%time

n1_i = subsample(n1, 1000)
n2_i = subsample(n2, 1000)
n3_i = subsample(n3, 1000)

X_source = np.concatenate([n1_i[0],n2_i[0],n3_i[0]])
y_source = np.concatenate([n1_i[1],n2_i[1],n3_i[1]])
source = (X_source, y_source)

ref = n1_i

n1_j = subsample(n1, 10000)
n2_j = subsample(n2, 10000)
n3_j = subsample(n3, 10000)

X_gen = np.concatenate([n1_j[0],n2_j[0],n3_j[0]])
y_gen = np.concatenate([n1_j[1],n2_j[1],n3_j[1]])
gen = (X_gen, y_gen)


results = run_svm(source, ref, gen)
print(results)

### 2.3 Multi-Source A

In [None]:
n1_i = subsample(n1, 1000)
# n2_i = subsample(n2, 1000)
# n3_i = subsample(n3, 1000)

# X_source = np.concatenate([n1_i[0],n2_i[0],n3_i[0]])
# y_source = np.concatenate([n1_i[1],n2_i[1],n3_i[1]])
source = n1_i

ref = n1_i

n1_j = subsample(n1, 10000)
n2_j = subsample(n2, 10000)
n3_j = subsample(n3, 10000)

X_gen = np.concatenate([n1_j[0],n2_j[0],n3_j[0]])
y_gen = np.concatenate([n1_j[1],n2_j[1],n3_j[1]])
gen = (X_gen, y_gen)

results = run_svm(source, ref, gen)
print(results)

#

In [95]:
results

(0.999, 0.999, 0.999)

In [None]:
source = subsample(source, 1000)
run_svm(source, ref, source)

In [45]:
%%time
run_svm(source, ref, source)

0.896
0.9507488880875105
CPU times: user 3.52 s, sys: 15.4 ms, total: 3.54 s
Wall time: 3.54 s
