Other resources:
 - [Other people using Yelp data](https://blog.michaelckennedy.net/2017/06/21/yelp-reviews-authorship-attribution-with-python-and-scikit-learn/)

In [162]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Lift pandas' row-display limit so displayed frames/series are never truncated.
pd.set_option("display.max_rows", None)


# 0. Load and Clean Data

We will concatenate the CSVs (instead of joining the raw data) to cut down on RAM and compute needs.

In [95]:
# Read the two yearly CSV extracts; each file becomes its own frame.
yelp_csv_paths = ('yelp/2006_2005_final_dd.csv', 'yelp/2007_2006_final_dd.csv')
df1, df2 = [pd.read_csv(csv_path) for csv_path in yelp_csv_paths]

In [101]:
# Tag each frame with the year it stands for. This mutates df1/df2 in place,
# so the per-frame cells further down also see the new column.
df1['year'] = 2006
df2['year'] = 2007

# Stack both years into one frame and order it by year.
#  - A stable sort (mergesort) keeps the original file order inside each year,
#    so row positions are the same on every run.
#  - The index is rebuilt afterwards so that label i == row position i.
#    Without this the concatenated frame carries duplicate labels (each input
#    starts at 0), while the modeling cells address rows by *position*
#    (np.where on .values, integer index arrays into X / y); any later
#    label-based lookup would then silently hit the wrong rows.
df = (
    pd.concat([df1, df2])
    .sort_values('year', kind='mergesort')
    .reset_index(drop=True)
)

In [84]:
# Show (rows, columns) for each yearly frame, one per line.
for yearly_frame in (df1, df2):
    print(yearly_frame.shape)

(854, 22)
(3853, 21)


In [102]:
# (rows, columns) of the combined frame.
df.shape

(4707, 22)

In [78]:
# df1 = df1.sort_values('date')
# df2 = df2.sort_values('date')

In [None]:
# df['slice'] -> ['2006', '2007']

In [103]:
# df2.head()

In [7]:
# Split the ', '-separated `categories` string into a list per row.
# The vectorised .str accessor passes missing values through as NaN, whereas
# calling x.split(...) inside .apply raises AttributeError on a NaN (float).
df1['categories_lst'] = df1['categories'].str.split(', ')

In [27]:
# How do we encode groups?

# Category labels to use as groups (how to encode them is still an open question).
ethnic_categories = [
    'American (Traditional)', 'American (New)',
    'Italian', 'Mexican',
    'Japanese', 'Chinese',
    'Southern', 'Vietnamese',
    'Asian Fusion', 'Mediterranean',
    'Thai',
]

# TODO: figure out how this works in the eval step
def is_subgroup(s,category):
    """Return whether `category` is contained in `row[s]`.

    NOTE(review): `row` is neither a parameter nor defined in this cell, so a
    call raises NameError unless a global named `row` happens to exist at call
    time. Presumably `row` was meant to be an argument (e.g. a DataFrame row,
    with `s` as the column key) -- confirm the intended signature before use.
    """
    return category in row[s]

The next cell is copied from [this Jupyter notebook](https://github.com/ahegel/yelp-dataset/blob/master/Predicting%20Star%20Ratings.ipynb), but we couldn't get it working, so it is left commented out.

In [45]:
# TODO: Add categories and attributes

# df1['categories_clean'] = map(lambda x: '|'.join(x), df1['categories'])
# df1 = df1.categories_clean.str.get_dummies(sep='|')
# # merge
# # business_df = business_df.merge(categories_df, left_index=True, right_index=True)
# # remove intermediate columns (no longer needed)
# df1.drop(['categories', 'categories_clean'], axis=1, inplace=True)
# df1.head()


In [None]:
# business_df = business_df.join(pd.DataFrame(business_df['attributes'].to_dict()).T)
# # further split sub-attributes into their own columns
# cols_to_split = ['BusinessParking', 'Ambience', 'BestNights', 'GoodForMeal', 'HairSpecializesIn', 'Music']
# for col_to_split in cols_to_split:
#     new_df = pd.DataFrame(business_df[col_to_split].to_dict()).T
#     new_df.columns = [col_to_split + '_' + str(col) for col in new_df.columns]
#     business_df = business_df.join(new_df)

# business_df.drop(['attributes'] + cols_to_split, axis=1, inplace=True)
# business_df.head()


In [156]:

# Numeric columns appended, unchanged, next to the text features.
tab_cols = ['stars_x','useful', 'funny', 'cool']

# TF-IDF features of the `text` column.
# NOTE(review): the vectorizer is fit on every row of df, i.e. before any
# train/test split exists, so test rows contribute to the vocabulary and IDF
# weights -- confirm this is acceptable for the experiment.
vec = TfidfVectorizer()
X_text_vec = vec.fit_transform(df['text']).tocsr()
X_tab = df[tab_cols].values
# Column-wise stack: TF-IDF columns first, then tab_cols. Converted to CSR
# because the modeling cell selects rows with integer index arrays.
X = sparse.hstack((X_text_vec, X_tab)).tocsr()

# Binary target: 1 where stars_y >= 3, otherwise 0.
y = (df['stars_y'].values >= 3.).astype(int)

# 1. Modeling!

In [None]:
# for S1: subsample N from 0 to 1000; train on that and report test perf
# 
# add S2 and learn for S1 + S2[:100], S1 + S2[:200]
# continue for all slices

## 1.1 Using actual Yelp dates

In [105]:
# (rows, columns) of the combined frame, re-checked before modeling.
df.shape

(4707, 22)

In [93]:
# Sanity check: the two per-year row counts should add up to the combined
# frame's row count. The literals are the row counts of df1 and df2.
print(df1.shape, df2.shape)
print(854+3853)

(854, 22) (3853, 21)
4707


In [92]:
# Scratch: four evenly spaced cut points over 0..854 (854 = row count of df1).
np.linspace(0, 854,4)

array([  0.        , 284.66666667, 569.33333333, 854.        ])

In [91]:
# Scratch: four evenly spaced cut points over 854..4707 (4707 = row count of df).
np.linspace(854,4707,4)

array([ 854.        , 2138.33333333, 3422.66666667, 4707.        ])

In [94]:
# Scratch: multiples of 250 below the 4707 total rows
# (presumably candidate training-set sizes -- cf. step_size in the modeling cell).
np.arange(0,4707,250)

array([   0,  250,  500,  750, 1000, 1250, 1500, 1750, 2000, 2250, 2500,
       2750, 3000, 3250, 3500, 3750, 4000, 4250, 4500])

In [112]:
# Number of rows per year as a plain {year: count} dict.
year_counts_dict = df['year'].value_counts().to_dict()
year_counts_dict

# Positional row indices of each year: {year: integer array}.
year_column = df['year'].values
year_idx_dict = {}
for year in df['year'].unique():
    (positions,) = np.where(year_column == year)
    year_idx_dict[year] = positions

In [127]:
# Scratch version of the ignore filter: drop a few hand-picked positions
# from every year's index list.
num = 900
ignore_idx = [0, 4, 2, 100]
year_idx_dict_ignore = {
    year: [idx for idx in idxs if idx not in ignore_idx]
    for year, idxs in year_idx_dict.items()
}

In [160]:
# within avail idx, within year, then sample
# train_set_idx = []

def get_avail_idx(num, year_idx_dict, ignore_idx, avail_years):
    """
    Plan how many rows to draw from each year, earliest year first.

    Years are visited in ascending order and each one contributes as many of
    its non-ignored indices as are still needed, so an earlier year is used up
    completely before a later year is touched.

    input:
     - num: int of samples needed
     - year_idx_dict: dict of {year: list of idx} for the df
     - ignore_idx: list of idx to ignore (None: ignore nothing)
     - avail_years: list of years that are considered; years in
       year_idx_dict that are not listed are skipped (None: all years)

    output:
     - avail_dict: dict of {num to sample: list of valid idx}, inserted in
       ascending year order. Years that contribute nothing are left out.
       The keys sum to min(num, number of usable indices).

    raises:
     - ValueError: when two years would need the same sample count. The
       result is keyed by count, so the second year would otherwise silently
       overwrite the first and fewer rows than planned would be sampled.
    """
    # Set membership is O(1); the ignore list can be long and is tested
    # against every index of every year.
    ignore_set = set() if ignore_idx is None else set(ignore_idx)
    allowed_years = None if avail_years is None else set(avail_years)

    avail_dict = {}
    remaining = num
    for year in sorted(year_idx_dict):
        if remaining <= 0:
            break
        if allowed_years is not None and year not in allowed_years:
            continue

        avail_year_idx = [i for i in year_idx_dict[year] if i not in ignore_set]
        use_this_year = min(remaining, len(avail_year_idx))
        if use_this_year == 0:
            # Nothing usable in this year; do not emit a {0: []} entry.
            continue
        if use_this_year in avail_dict:
            raise ValueError(
                "two years both need %d samples; the count-keyed result "
                "cannot represent both (year %r would overwrite an earlier year)"
                % (use_this_year, year)
            )

        avail_dict[use_this_year] = avail_year_idx
        remaining -= use_this_year
    return avail_dict

def get_sample_avail_idx(avail_dict, rng=None):
    """
    Draw the planned number of indices from each candidate list.

    input:
     - avail_dict: dict of {num to sample: list of valid idx}, as produced by
       get_avail_idx
     - rng: optional object with a numpy-style ``choice`` method (e.g.
       np.random.RandomState or np.random.default_rng(...)). When None the
       global np.random state is used.

    output:
     - 1-D integer array of the sampled indices. Samples are drawn without
       replacement within each list and concatenated in the dict's iteration
       order, so the result is grouped by entry, not shuffled across entries.
       An empty plan yields an empty integer array.
    """
    sampler = np.random if rng is None else rng

    avail_idx_lst = [
        sampler.choice(lst_to_sample, size=num_to_sample, replace=False)
        for num_to_sample, lst_to_sample in avail_dict.items()
        # A zero-size draw adds nothing, and drawing from an empty list would
        # return a float array that turns the whole result into floats.
        if num_to_sample > 0
    ]

    if not avail_idx_lst:
        # np.concatenate refuses an empty sequence.
        return np.array([], dtype=np.intp)
    return np.concatenate(avail_idx_lst)

In [153]:
# Smoke test: plan 1000 rows while ignoring positions 0, 1 and 100.
# The keys of the returned dict are the per-year sample counts.
test = get_avail_idx(1000, year_idx_dict, [0,1,100],avail_years=[2006,2007])
test.keys()

dict_keys([851, 149])

In [155]:
# Check which sparse container X is stored in.
# NOTE(review): the saved output reports a COO matrix although the feature
# cell ends with .tocsr(); the output likely predates that call -- re-run to
# confirm.
type(X)

scipy.sparse.coo.coo_matrix

In [171]:
# Evaluation protocol
# -------------------
# Three test sets of `test_set_size` rows each:
#   reference   - random rows from the latest year; fixed for all runs
#   generalized - random rows from the total dataset; fixed for all runs
#   source      - random rows held out of each run's own training sample
#
# Planned sweep over training sizes: see the commented-out `for` line below
# (step_size, 2*step_size, ... < N).

SEED = 0
np.random.seed(SEED)  # make the sampled splits repeatable across re-runs

test_set_size = 100
N = df.shape[0]

# ref: take test_set_size rows from the latest year only
max_year = max(df['year'].values)
ref_test_idx = np.random.choice(year_idx_dict[max_year], size=test_set_size, replace=False)

# gen: take test_set_size rows from all years pooled together
gen_test_idx = np.random.choice(range(N), size=test_set_size, replace=False)

# Rows reserved for the two fixed test sets must never be drawn for training.
# This does not depend on train_N, so it is built once outside the loop.
ignore_idx = np.concatenate((ref_test_idx, gen_test_idx))

step_size = 250

# One results dict per train_N; `results` alone only ever holds the last run.
all_results = []

for train_N in [2000]:
# for train_N in np.arange(step_size, N,step_size):
    total_idx_sample_dict = get_avail_idx(train_N+test_set_size, year_idx_dict, ignore_idx, avail_years=[2006, 2007])
    total_idx = get_sample_avail_idx(total_idx_sample_dict)

    # get_sample_avail_idx concatenates its draws year by year, so the head of
    # total_idx consists of the earliest year's rows. Shuffle before slicing,
    # otherwise the "source" test set is not a random subset of this run's
    # sample and the training set is depleted of early-year rows.
    total_idx = np.random.permutation(total_idx)

    # source: take test_set_size rows from this run's sample; train on the rest
    source_test_idx = total_idx[:test_set_size]
    train_idx = total_idx[test_set_size:]

    # StandardScaler (with centering) needs dense input, hence .toarray().
    X_train = X[train_idx].toarray()
    y_train = y[train_idx]

    X_source, y_source = X[source_test_idx].toarray(), y[source_test_idx]
    X_ref, y_ref = X[ref_test_idx].toarray(), y[ref_test_idx]
    X_gen, y_gen = X[gen_test_idx].toarray(), y[gen_test_idx]

    model = make_pipeline(StandardScaler(), LogisticRegression())
    model.fit(X_train, y_train)

    yhat_source = model.predict(X_source)
    yhat_ref = model.predict(X_ref)
    yhat_gen = model.predict(X_gen)

    # accuracy_score's signature is (y_true, y_pred).
    acc_source = accuracy_score(y_source, yhat_source)
    acc_ref = accuracy_score(y_ref, yhat_ref)
    acc_gen = accuracy_score(y_gen, yhat_gen)

    results = {
        'train_N': train_N,
        'acc_source': acc_source,
        'acc_ref': acc_ref,
        'acc_gen': acc_gen,
        'source_test_idx': source_test_idx,
        'gen_test_idx': gen_test_idx,
        'ref_test_idx': ref_test_idx,
        'yhat_source': yhat_source,
        'yhat_ref': yhat_ref,
        'yhat_gen': yhat_gen,
        'y_source': y_source,
        'y_ref': y_ref,
        'y_gen': y_gen
    }
    all_results.append(results)
    

In [172]:
# Accuracy on the source, reference and generalized test sets (in that order)
# for the last train_N that was run.
print(acc_source, acc_ref, acc_gen)

0.93 0.93 0.9


## 1.2 Using subsampled random slices