Other resources:
 - [Other people using Yelp data](https://blog.michaelckennedy.net/2017/06/21/yelp-reviews-authorship-attribution-with-python-and-scikit-learn/)

In [79]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pd.set_option("display.max_rows", None)


# 0. Load and Clean Data

We will concatenate the per-year CSVs (instead of joining the raw data) to cut down on RAM and compute needs.

In [None]:
def run_yelp_exp_prep(source_dir, min_yr, max_yr):
    '''
    Load the per-year review CSVs found in ``source_dir`` and build X, y for model training.

    Parameters
    ----------
    source_dir : directory containing files named ``<yr+1>_<yr>_final_dd.csv``
        (e.g. ``2005_2004_final_dd.csv``).
    min_yr, max_yr : first and last year to load; both ends are inclusive.

    Returns
    -------
    X : scipy CSR matrix with the TF-IDF features of the ``text`` column followed by
        the numeric columns ``stars_x``, ``useful``, ``funny`` and ``cool``.
    y : integer array, 1 where ``stars_y >= 3`` and 0 otherwise.

    Raises
    ------
    ValueError if the year range is empty (``max_yr < min_yr``).
    '''
    # Derive the years from the arguments instead of relying on a notebook-level
    # `yr_range` variable that may be stale or undefined.
    yr_range = range(min_yr, max_yr + 1)

    yearly_frames = []
    for yr in yr_range:
        source_path = "%s/%d_%d_final_dd.csv" % (source_dir, yr + 1, yr)
        df_yr = pd.read_csv(source_path)

        # Tag every row with the year of the file it was loaded from.
        df_yr["year"] = yr

        #TO DO: properly clean categories
        #df_yr['categories_lst'] = df_yr['categories'].apply(lambda x: x.split(', '))
        yearly_frames.append(df_yr)

    if not yearly_frames:
        raise ValueError(
            "empty year range: min_yr=%r, max_yr=%r" % (min_yr, max_yr)
        )

    # One concat at the end: DataFrame.append was removed in pandas 2.0 and
    # re-copies the accumulated frame on every iteration.
    df = pd.concat(yearly_frames)

    tab_cols = ['stars_x', 'useful', 'funny', 'cool']

    vec = TfidfVectorizer()
    X_text_vec = vec.fit_transform(df['text']).tocsr()
    # hstack needs a numeric dtype for the dense block, hence the float cast.
    X_tab = df[tab_cols].values.astype(float)
    X = sparse.hstack((X_text_vec, X_tab)).tocsr()

    y = (df['stars_y'].values >= 3.).astype(int)

    return X, y

In [89]:
# Load one CSV per year (files are named "<yr+1>_<yr>_final_dd.csv") and stack them.
source_dir = "/Users/rajiinio/Documents/more-data-more-problems/mdmp_data_clean"
min_yr = 2004#2004
max_yr = 2006#2023
yr_range = list(range(min_yr, max_yr+1))

yearly_frames = []

for yr in yr_range:
    source_file = "/%d_%d_final_dd.csv" %(yr+1, yr)
    source_path = source_dir+source_file
    df_yr = pd.read_csv(source_path)

    # Tag every row with the year of the file it was loaded from.
    df_yr["year"] = yr

    #TO DO: properly clean categories
    #df_yr['categories_lst'] = df_yr['categories'].apply(lambda x: x.split(', '))
    yearly_frames.append(df_yr)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the accumulated frame on every loop iteration.
df = pd.concat(yearly_frames)
    

In [81]:
# The per-year files are concatenated in ascending year order, so the rows should
# already be ordered; sort explicitly so the positional indices are guaranteed to
# be grouped by year.
df = df.sort_values(by='year')

In [82]:
#df1['year'] = 2006
#df2['year'] = 2007

#df = pd.concat([df1,df2])

In [90]:
#print(df1.shape)
#print(df2.shape)

print(df.shape)

# Preview the first rows only: with display.max_rows set to None, a bare `df`
# would render every row of the frame into the notebook output.
df.head(10)

(4707, 22)


Unnamed: 0.1,Unnamed: 0,stars_x,useful,funny,cool,text,date,business_id,name,address,...,postal_code,latitude,longitude,stars_y,review_count,is_open,attributes,categories,hours,year
0,0,2.0,3,3,4,Hipsters unite! I've never had a good time at...,2005-07-10 17:20:44,2S_CEQYBbp8RO1bJm_1mng,The Saint,961 St Mary St,...,70130,29.93115,-90.07321,4.0,114,1,"{'Alcohol': ""u'full_bar'"", 'OutdoorSeating': '...","Nightlife, Dive Bars, Bars","{'Monday': '19:0-5:0', 'Tuesday': '19:0-5:0', ...",2005
1,1,4.0,0,0,0,Be seen where the hipsters and the wannabe-roc...,2005-07-10 16:41:28,2S_CEQYBbp8RO1bJm_1mng,The Saint,961 St Mary St,...,70130,29.93115,-90.07321,4.0,114,1,"{'Alcohol': ""u'full_bar'"", 'OutdoorSeating': '...","Nightlife, Dive Bars, Bars","{'Monday': '19:0-5:0', 'Tuesday': '19:0-5:0', ...",2005
2,2,2.0,2,0,0,Good luck finding anyone to help you. Poor po...,2005-05-18 15:44:55,0ZETsmrLSGWaBu1NNdcmYw,Office Depot,1024 Big Bend Blvd,...,63117,38.633743,-90.318239,2.5,15,1,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Printing Services, Local Services, Shopping, C...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...",2005
3,3,3.0,4,0,0,Lots and lots of mostly fried seafood. Surpris...,2005-05-11 13:51:10,xJ3NSwE0xhdtA-tB_y_rNQ,Jack Dempsey's Restaurant & Bar,738 Poland Ave,...,70117,29.961515,-90.032744,4.0,168,1,"{'OutdoorSeating': 'False', 'Alcohol': ""u'full...","Seafood, Restaurants","{'Tuesday': '11:0-14:0', 'Wednesday': '11:0-20...",2005
4,4,5.0,0,0,0,Best casino in Reno. 4 deck and 2 deck black...,2005-05-15 00:39:29,PY9GRfzr4nTZeINf346QOw,Peppermill Reno,2707 S Virginia St,...,89502,39.497687,-119.801139,4.0,2486,1,"{'RestaurantsPriceRange2': '2', 'ByAppointment...","Event Planning & Services, Casinos, Beauty & S...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",2005
5,5,5.0,1,0,1,"Nice rooms, super-gaudy casino floor, good res...",2005-12-06 02:54:16,PY9GRfzr4nTZeINf346QOw,Peppermill Reno,2707 S Virginia St,...,89502,39.497687,-119.801139,4.0,2486,1,"{'RestaurantsPriceRange2': '2', 'ByAppointment...","Event Planning & Services, Casinos, Beauty & S...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",2005
6,6,3.0,0,0,0,Katie's is a neighborhood restaurant that has ...,2005-07-10 21:25:46,pym7c6ZFEtmoH16xN2ApBg,Katie's Restaurant & Bar,3701 Iberville St,...,70119,29.973094,-90.095914,4.5,1289,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Restaurants, Bars, Nightlife, American (Tradit...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",2005
7,7,3.0,0,0,0,"If you want beignets in Metarie, for my money ...",2005-07-10 06:00:20,xkgbgwTlkhzwo4ya_eIW-g,Cafe Du Monde,"3301 Veterans Memorial Blvd, Ste 104",...,70002,30.005917,-90.157363,4.0,127,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Coffee & Tea, Food, Donuts, Restaurants","{'Monday': '7:0-21:0', 'Tuesday': '7:0-21:0', ...",2005
8,8,4.0,0,0,1,"A very sweet find on the main drag, State Stre...",2005-09-15 04:13:52,GgcvRnt5_z3NEC0D6vNncQ,Cafe Buenos Aires,1316 State St,...,93101,34.424829,-119.706032,3.5,56,0,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Restaurants, Argentine","{'Monday': '11:30-20:30', 'Tuesday': '11:30-20...",2005
9,9,4.0,0,0,1,"Delicious sandwiches, great pizzas, decent sid...",2005-05-11 04:16:34,I13c2-Yo7hn2BKwCI81HSQ,Bellacino's Pizza & Grinders,4260 Hampton Ave,...,63109,38.59159,-90.29389,3.5,72,1,"{'Alcohol': ""u'none'"", 'OutdoorSeating': 'True...","Pizza, Salad, Sandwiches, Restaurants, Italian","{'Monday': '0:0-0:0', 'Tuesday': '11:0-20:0', ...",2005


In [38]:
#df.shape

In [39]:
# df1 = df1.sort_values('date')
# df2 = df2.sort_values('date')

In [40]:
# df['slice'] -> ['2006', '2007']

In [41]:
# df2.head()

In [42]:
#df1['categories_lst'] = df1['categories'].apply(lambda x: x.split(', '))

In [43]:
# How do we encode groups?

#TO DO: encode subgroups

# Category labels considered as candidate subgroups (subgroup encoding is still a TODO).
ethnic_categories = [
    'American (Traditional)',
    'American (New)',
    'Italian',
    'Mexican',
    'Japanese',
    'Chinese',
    'Southern',
    'Vietnamese',
    'Asian Fusion',
    'Mediterranean',
    'Thai',
]

# TODO: figure out how this works in the eval step
def is_subgroup(s, category, row=None):
    """
    Report whether `category` is contained in a categories value.

    input:
     - s: when `row` is given, the key/column under which `row` stores its
          categories; otherwise the categories value itself (string or list)
     - category: the category label to look for
     - row: optional mapping/Series to read `row[s]` from

    output:
     - bool; False when the categories value does not support membership tests

    NOTE(review): the previous body read `row[s]` from a name `row` that was
    never defined in the function. The intended contract is not documented, so
    `row` is now an explicit optional argument - confirm against the eval step.
    """
    categories = s if row is None else row[s]
    try:
        return category in categories
    except TypeError:
        # Missing categories (e.g. None or NaN) cannot contain any label.
        return False

The next two cells were copied from [this Jupyter notebook](https://github.com/ahegel/yelp-dataset/blob/master/Predicting%20Star%20Ratings.ipynb), but we couldn't get them working, so they are left commented out.

In [44]:
# TODO: Add categories and attributes

# df1['categories_clean'] = map(lambda x: '|'.join(x), df1['categories'])
# df1 = df1.categories_clean.str.get_dummies(sep='|')
# # merge
# # business_df = business_df.merge(categories_df, left_index=True, right_index=True)
# # remove intermediate columns (no longer needed)
# df1.drop(['categories', 'categories_clean'], axis=1, inplace=True)
# df1.head()


In [45]:
# business_df = business_df.join(pd.DataFrame(business_df['attributes'].to_dict()).T)
# # further split sub-attributes into their own columns
# cols_to_split = ['BusinessParking', 'Ambience', 'BestNights', 'GoodForMeal', 'HairSpecializesIn', 'Music']
# for col_to_split in cols_to_split:
#     new_df = pd.DataFrame(business_df[col_to_split].to_dict()).T
#     new_df.columns = [col_to_split + '_' + str(col) for col in new_df.columns]
#     business_df = business_df.join(new_df)

# business_df.drop(['attributes'] + cols_to_split, axis=1, inplace=True)
# business_df.head()


In [93]:
# Sanity-check the shapes of the text and tabular feature blocks.
# NOTE(review): X_text_vec, X_tab and tab_cols are only assigned at notebook level
# in a later cell, so this cell raises NameError on a fresh top-to-bottom run.
print(X_text_vec.shape)

print(X_tab.shape)
#print(X_text_vec, X_tab)

# Rebinds X to the raw tabular columns only; a later cell assigns X again with
# the stacked text + tabular matrix.
X = df[tab_cols].values
# astype returns a new array, so X itself is not converted - the float copy is
# only displayed as the cell output.
X.astype(float)

(4707, 20205)
(4707, 4)


array([[2., 3., 3., 4.],
       [4., 0., 0., 0.],
       [2., 2., 0., 0.],
       ...,
       [3., 2., 0., 2.],
       [2., 1., 0., 0.],
       [1., 1., 8., 1.]])

In [94]:
# Display the repr of the text feature matrix (shape, dtype, number of stored elements).
X_text_vec

<4707x20205 sparse matrix of type '<class 'numpy.float64'>'
	with 305060 stored elements in Compressed Sparse Row format>

In [95]:

# Numeric columns that are appended to the text features.
tab_cols = ['stars_x', 'useful', 'funny', 'cool']

# TF-IDF representation of the review text, kept sparse (CSR).
vec = TfidfVectorizer()
X_text_vec = vec.fit_transform(df['text']).tocsr()

# Cast the tabular block to float so it can be stacked next to the float TF-IDF matrix.
X_tab = df[tab_cols].values.astype(float)

# Final design matrix: text features first, then the tabular columns.
X = sparse.hstack((X_text_vec, X_tab)).tocsr()

# Binary target: 1 where stars_y is at least 3, otherwise 0.
stars_y_values = df['stars_y'].values
y = (stars_y_values >= 3.).astype(int)


#sparse.hstack(X, A) # error
#sparse.hstack(X.astype(object), A) # cast X to object; return object
#sparse.hstack(X, A.astype(float)) # cast A to float; return float
#hstack(X.A, A) # make X dense, result will be type object


#tab_cols = ['stars_x','useful', 'funny', 'cool']

#vec = TfidfVectorizer()
##X_text_vec = vec.fit_transform(df['text']).tocsr()
#X_tab = df[tab_cols].values
#X = sparse.hstack((X_text_vec, X_tab)).tocsr()

#y = (df['stars_y'].values >= 3.).astype(int)

# 1. Modeling!

In [None]:
# for S1: subsample N from 0 to 1000; train on that and report test perf
# 
# add S2 and learn for S1 + S2[:100], S1 + S2[:200]
# continue for all slices

## 1.1 Using actual Yelp dates

In [105]:
#df.shape

(4707, 22)

In [93]:
#print(df1.shape, df2.shape)
#print(854+3853)

(854, 22) (3853, 21)
4707


In [92]:
#np.linspace(0, 854,4)

array([  0.        , 284.66666667, 569.33333333, 854.        ])

In [91]:
#np.linspace(854,4707,4)

array([ 854.        , 2138.33333333, 3422.66666667, 4707.        ])

In [94]:
#np.arange(0,4707,250)

array([   0,  250,  500,  750, 1000, 1250, 1500, 1750, 2000, 2250, 2500,
       2750, 3000, 3250, 3500, 3750, 4000, 4250, 4500])

In [97]:
# Number of rows per year.
year_counts_dict = df['year'].value_counts().to_dict()

# Positional row indices (into df / X / y) belonging to each year.
year_idx_dict = {
    yr: np.flatnonzero(df['year'].values == yr)
    for yr in df['year'].unique()
}

In [98]:
#num = 900
#ignore_idx = [0,4,2,100]
#year_idx_dict_ignore = {k:[i for i in v if i not in ignore_idx] for k,v in year_idx_dict.items()}

In [99]:
# within avail idx, within year, then sample
# train_set_idx = []





def _store_sampling_entry(avail_dict, num_to_sample, candidates):
    """Add a "draw num_to_sample from candidates" entry without overwriting an existing key."""
    while num_to_sample in avail_dict:
        # Two entries want the same sample count, but the dict can hold only one
        # per key. Resolve both draws now and store the union as a single
        # "take all of these" entry under the combined count.
        clashing = avail_dict.pop(num_to_sample)
        drawn_prev = np.random.choice(clashing, size=num_to_sample, replace=False)
        drawn_new = np.random.choice(candidates, size=num_to_sample, replace=False)
        candidates = list(drawn_prev) + list(drawn_new)
        num_to_sample = num_to_sample * 2
    avail_dict[num_to_sample] = candidates


def get_avail_idx(num, year_idx_dict, ignore_idx, avail_years):
    """
    Plan how many rows to draw from each year, filling from the earliest year on.

    input:
     - num: int of samples needed
     - year_idx_dict: dict of {year: list of idx} for the df
     - ignore_idx: list of idx to ignore
     - avail_years: list of years that are considered
                    (kept for interface compatibility; it is currently NOT used
                    to filter years - every key of year_idx_dict is eligible)

    output:
     - avail_dict: dict of {num to sample: list of valid idx}

    Years are visited in ascending order. Each year contributes all of its
    non-ignored rows until `num` is reached; the last year visited contributes
    only the remainder. Years with no rows left after filtering are skipped.
    If two entries would share the same sample count, they are merged (see
    `_store_sampling_entry`) instead of one silently replacing the other.
    """
    # Set membership makes the filter linear instead of scanning ignore_idx per row.
    ignored = set(np.asarray(ignore_idx).ravel().tolist())

    avail_dict = {}
    remaining = num
    for year in sorted(year_idx_dict.keys()):
        if remaining <= 0:
            break

        avail_year_idx = [i for i in year_idx_dict[year] if i not in ignored]
        avail_year_num_idx = len(avail_year_idx)
        if avail_year_num_idx == 0:
            # Nothing to draw from this year; avoid a {0: []} entry.
            continue

        use_this_year = min(remaining, avail_year_num_idx)
        remaining -= avail_year_num_idx
        _store_sampling_entry(avail_dict, use_this_year, avail_year_idx)
    return avail_dict

def get_sample_avail_idx(avail_dict):
    """
    Draw the planned number of indices from each candidate list.

    input:
     - avail_dict: dict of {num to sample: list of valid idx}

    output:
     - avail_idx: 1-D array with the sampled idx of every entry, concatenated
                  in the iteration order of avail_dict (sampling is without
                  replacement within each entry)

    An empty plan yields an empty integer array instead of raising, and entries
    that request zero samples are skipped: drawing from an empty list returns a
    float array, which would turn the concatenated result into float values
    that cannot be used as indices.
    """
    sampled_chunks = [
        np.random.choice(candidates, size=num_to_sample, replace=False)
        for num_to_sample, candidates in avail_dict.items()
        if num_to_sample > 0
    ]

    if not sampled_chunks:
        return np.array([], dtype=int)

    return np.concatenate(sampled_chunks)

In [100]:
#test = get_avail_idx(1000, year_idx_dict, [0,1,100],avail_years=[2006,2007])
#test.keys()

In [101]:
#type(X)

In [102]:
# held out set of 20% test
# train_size = 100, 200, 300, 400, 500, 600 (4 chunks)

# reference test_set = 100 random points from latest slice
# generalized test_set = 100 random points from total dataset
# source test_set = 100 random points from each training run

# Three test sets of `test_set_size` rows each are evaluated for every training run:
#  - reference:   random rows from the latest year only
#  - generalized: random rows from the whole dataset
#  - source:      rows drawn together with the training sample of that run
# NOTE: no random seed is set in this cell, so the sampled indices differ between runs.
test_set_size = 100
N = df.shape[0]

# ref: take test_set_size rows from the latest year
max_year = max(df['year'].values)
ref_test_idx = np.random.choice(year_idx_dict[max_year], size=test_set_size, replace=False)

# gen: take test_set_size rows from the whole dataset
gen_test_idx = np.random.choice(range(N), size=test_set_size, replace=False)

step_size = 250

# Rows reserved for the ref/gen test sets must never be drawn for training.
# This does not depend on train_N, so compute it once outside the loop.
ignore_idx = np.concatenate((ref_test_idx, gen_test_idx))

# Keep the results of every training-set size; `results` alone only ever holds
# the last run once the loop sweeps over more than one size.
results_by_train_N = {}

# Full sweep: for train_N in np.arange(step_size, N, step_size):
for train_N in [2000]:
    # Sample train_N training rows plus test_set_size source-test rows in one go.
    # NOTE(review): get_avail_idx, as defined in this notebook, does not read
    # avail_years, so the years passed here have no effect - confirm the intent.
    total_idx_sample_dict = get_avail_idx(train_N+test_set_size, year_idx_dict, ignore_idx, avail_years=[2006, 2007])
    total_idx = get_sample_avail_idx(total_idx_sample_dict)

    # source: the first test_set_size sampled rows are held out, the rest is train.
    # NOTE(review): total_idx is a concatenation of per-entry samples, so these
    # first rows come from the first entries of the sampling dict rather than
    # from a shuffle of the whole training sample - confirm this is intended.
    source_test_idx = total_idx[:test_set_size]
    train_idx = total_idx[test_set_size:]

    # StandardScaler is fed dense arrays, hence the toarray() calls.
    X_train = X[train_idx].toarray()
    y_train = y[train_idx]

    X_source, y_source = X[source_test_idx].toarray(), y[source_test_idx]
    X_ref, y_ref = X[ref_test_idx].toarray(), y[ref_test_idx]
    X_gen, y_gen = X[gen_test_idx].toarray(), y[gen_test_idx]

    model = make_pipeline(StandardScaler(), LogisticRegression())
    model.fit(X_train, y_train)

    yhat_source = model.predict(X_source)
    yhat_ref = model.predict(X_ref)
    yhat_gen = model.predict(X_gen)

    # accuracy_score takes (y_true, y_pred); the value is the same either way,
    # but the documented order keeps this consistent if other metrics are added.
    acc_source = accuracy_score(y_source, yhat_source)
    acc_ref = accuracy_score(y_ref, yhat_ref)
    acc_gen = accuracy_score(y_gen, yhat_gen)

    results = {
        'acc_source': acc_source,
        'acc_ref': acc_ref,
        'acc_gen': acc_gen,
        'source_test_idx': source_test_idx,
        'gen_test_idx': gen_test_idx,
        'ref_test_idx': ref_test_idx,
        'yhat_source': yhat_source,
        'yhat_ref': yhat_ref,
        'yhat_gen': yhat_gen,
        'y_source': y_source,
        'y_ref': y_ref,
        'y_gen': y_gen
    }
    results_by_train_N[train_N] = results
    

In [103]:
# Accuracy on the source, reference and generalized test sets from the most recent training run.
print(acc_source, acc_ref, acc_gen)

0.94 0.91 0.94


## 1.2 Using subsampled random slices