# IEOR 4571 - Personalization - Final Project

#### Team members: 
Name, UNI/email, Github ID
* Megala Kannan, msk2245@columbia.edu, thisismeg
* Hojin Lee, hl3328@columbia.edu, hjlee9295
* Jung Ah Shin, js5569@columbia.edu, juliajungahshin
* Tiffany Zhu, tz2196@columbia.edu, tlzhu19


# TOC:
* [1. Introduction](#1)
* [2. Data Exploration](#2)
* [3. Modeling](#3)
    * [3.1 Baseline Model](#3-1)
    * [3.2 Something](#3-2)
* [4. Evaluation](#4)
    * [4.1 Accuracy](#4-1)
    * [4.2 Coverage](#4-2)
* [5. Conclusion](#5)


# 1. Introduction <a class="anchor" id="1"></a>

# 2. Data Exploration <a class="anchor" id="2"></a>

In [1]:
import pandas as pd
import json
from tqdm import tqdm

In [2]:
def convert_json_to_df(path, file_name, column_names):
    line_count = len(open(path + file_name).readlines())    
    columns_dict = {name: [] for name in column_names}

    with open(path + file_name) as f:
        for line in tqdm(f, total=line_count):
            blob = json.loads(line)
            
            for key in column_names:
                columns_dict[key].append(blob[key])
    
    return pd.DataFrame(columns_dict)

In [3]:
path = "/Users/hojinlee/Documents/Columbia/2019/Class/Personalization/HW/final/yelp_dataset/"
file_name = "review.json"

In [4]:
# review.json
ratings = convert_json_to_df(path, file_name, ['user_id', 'business_id', 'stars', 'date'])

user_counts = ratings["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

100%|██████████| 6685900/6685900 [00:58<00:00, 115079.20it/s]


In [5]:
ratings.rename(columns={'stars': 'rating'}, inplace=True)
ratings.head()

Unnamed: 0,user_id,business_id,rating,date
0,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,2013-05-07 04:34:36
1,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5.0,2017-01-14 21:30:33
2,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,2016-11-09 20:09:03
3,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5.0,2018-01-09 20:56:38
4,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1.0,2018-01-30 23:07:38


In [6]:
# can do the same for business.json, user.json, tip.json 
# for metadata info see https://www.yelp.com/dataset/documentation/main

In [24]:
# business.json
# todo: how to add 'attributes'?
'''
"attributes": {
        "RestaurantsTakeOut": true,
        "BusinessParking": {
            "garage": false,
            "street": true,
            "validated": false,
            "lot": false,
            "valet": false
        },
'''

businesses = convert_json_to_df(path, 'business.json', ['business_id', 'city', 'state', 'stars', 
                                                        'review_count', 'is_open', 'attributes', 
                                                        'categories', 'hours'])

100%|██████████| 192609/192609 [00:03<00:00, 59708.34it/s]


In [25]:
businesses.head()

Unnamed: 0,business_id,city,state,stars,review_count,is_open,attributes,categories,hours
0,1SWheh84yJXfytovILXOAQ,Phoenix,AZ,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",
1,QXAEGFB4oINsVuTFxEYKFQ,Mississauga,ON,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
2,gnKjwL_1w79qoiV3IC_xQQ,Charlotte,NC,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."
3,xvX2CttrVhyG2z1dFg_0xw,Goodyear,AZ,5.0,3,1,,"Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."
4,HhyxOkGAM07SRYtlQ4wMFQ,Charlotte,NC,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


In [8]:
# user.json
users = convert_json_to_df(path, 'user.json', ['user_id', 'review_count', 'friends', 'useful', 
                                               'funny', 'cool', 'fans', 'elite', 'average_stars', 
                                               'compliment_hot', 'compliment_more', 'compliment_profile',
                                               'compliment_cute', 'compliment_list', 'compliment_note',
                                               'compliment_plain', 'compliment_cool', 'compliment_funny',
                                               'compliment_writer', 'compliment_photos'
                                              ])

100%|██████████| 1637138/1637138 [00:26<00:00, 62750.12it/s]


In [9]:
# tip.json
tips =  convert_json_to_df(path, 'tip.json', ['text', 'date', 'compliment_count', 'business_id', 'user_id'])

100%|██████████| 1223094/1223094 [00:06<00:00, 187052.47it/s]


In [10]:
users.head()

Unnamed: 0,user_id,review_count,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,l6BmjZMeQD3rDxWUbiAiow,95,"c78V-rj8NQcQjOI8KP3UEA, alRMgPcngYSCJ5naFRBz5g...",84,17,25,5,201520162017.0,4.03,2,0,0,0,0,1,1,1,1,2,0
1,4XChL029mKr5hydo79Ljxg,33,"kEBTgDvFX754S68FllfCaA, aB2DynOxNOJK9st2ZeGTPg...",48,22,16,4,,3.63,1,0,0,0,0,0,0,1,1,0,0
2,bc8C_eETBWL0olvFSJJd0w,16,"4N-HU_T32hLENLntsNKNBg, pSY2vwWLgWfGVAAiKQzMng...",28,8,10,0,,3.71,0,0,0,0,0,1,0,0,0,0,0
3,dD0gZpBctWGdWo9WlGuhlA,17,"RZ6wS38wnlXyj-OOdTzBxA, l5jxZh1KsgI8rMunm-GN6A...",30,4,14,5,,4.85,1,0,0,0,0,0,2,0,0,1,0
4,MM4RJAeH6yuaN8oZDSt0RA,361,"mbwrZ-RS76V1HoJ0bF_Geg, g64lOV39xSLRZO0aQQ6DeQ...",1114,279,665,39,2015201620172018.0,4.08,28,1,0,0,1,16,57,80,80,25,5


In [11]:
len(users[users['review_count']>=5])

861695

In [12]:
len(ratings) #6685900
ratings.head()

Unnamed: 0,user_id,business_id,rating,date
0,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,2013-05-07 04:34:36
1,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5.0,2017-01-14 21:30:33
2,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,2016-11-09 20:09:03
3,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5.0,2018-01-09 20:56:38
4,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1.0,2018-01-30 23:07:38


In [13]:
import datetime as dt

ratings['date'] = pd.to_datetime(ratings['date']).dt.date

#time series
df = ratings.merge(businesses[['business_id','state']], how='left', right_on='business_id', left_on='business_id')
df['month'] = df['date'].apply(lambda x: x.strftime('%Y%m'))
time_series_by_state = df[['state','rating','month']].groupby(['state','month']).mean().reset_index()
df = time_series_by_state.pivot(index='month', columns='state', values='rating')
df.plot()



<matplotlib.axes._subplots.AxesSubplot at 0x1b1daf3c8>

In [14]:
sns.lineplot(data=time_series_by_state, x='month', y='rating', hue='state')

NameError: name 'sns' is not defined

In [None]:
sns.countplot(x='stars',data=businesses);

In [15]:
# uneven distribution of data among states -> cannot apply Location Aware Recommender System.
groupby_state = businesses[['state','business_id']].groupby('state').count().reset_index().sort_values(by='business_id',ascending=False)

plt.figure(figsize=(12,6))
sns.countplot(x='state',data=businesses);

NameError: name 'plt' is not defined

In [None]:
sns.distplot(businesses['review_count'].apply(np.log1p))

In [17]:
import numpy as np
active_user_only_ratings_df = ratings[ratings['user_id'].isin(active_users)]

sample_size = [50000,60000,100000]

for i in range(5):
    print(i)
    
    for s in sample_size:
        sampleUID = active_user_only_ratings_df['user_id'].unique()[np.random.randint(active_user_only_ratings_df['user_id'].unique().shape[0], size=s)]
        active_user_only_ratings_df_sample = active_user_only_ratings_df[active_user_only_ratings_df['user_id'].isin(sampleUID)]

        print('sample size', s)
        print('data set size', len(active_user_only_ratings_df_sample))
        print('-----------')
    


0
sample size 50000
data set size 725029
-----------
sample size 60000
data set size 854016
-----------
sample size 100000
data set size 1338828
-----------
1
sample size 50000
data set size 727395
-----------
sample size 60000
data set size 851225
-----------
sample size 100000
data set size 1341707
-----------
2
sample size 50000
data set size 720506
-----------
sample size 60000
data set size 862425
-----------
sample size 100000
data set size 1349914
-----------
3
sample size 50000
data set size 714678
-----------
sample size 60000
data set size 870414
-----------
sample size 100000
data set size 1345642
-----------
4
sample size 50000
data set size 738344
-----------
sample size 60000
data set size 849873
-----------
sample size 100000
data set size 1341164
-----------


In [None]:
### baby data set Test

In [18]:
active_user_only_ratings_df = ratings[ratings['user_id'].isin(active_users)]

sample_size = [100]

for s in sample_size:
    sampleUID = active_user_only_ratings_df['user_id'].unique()[np.random.randint(active_user_only_ratings_df['user_id'].unique().shape[0], size=s)]
    active_user_only_ratings_df_sample = active_user_only_ratings_df[active_user_only_ratings_df['user_id'].isin(sampleUID)]



In [118]:
import itertools
from collections import Counter

base_df = active_user_only_ratings_df_sample

#sparse alert - hopefully with bigger dataset, we will see some weird ratings..
real_average = base_df[['business_id','rating']].groupby('business_id').mean().reset_index()
real_average.rename(columns={"rating": "average_business_rating"}, inplace=True)

#Average ratings for business added
base_df = base_df.merge(real_average, how='left', on='business_id')

#Adding state, review_count, is_open
base_df = base_df.merge(businesses[['business_id','state','review_count','is_open', 'hours','cat']],  on='business_id')
base_df.rename(columns={"review_count": "business_review_count"}, inplace=True)

#one-hot encoding for top5 categories
catList = []
businesses['categories'].fillna(value='',inplace=True)
businesses['cat'] = businesses['categories'].apply(lambda x: x.split(','))
catList.extend(businesses['cat'])
merged = [x.strip() for x in list(itertools.chain(*catList))]

#getting top 5 common categories items
top5List = [x for x in list(itertools.chain(*Counter(merged).most_common(5))) if type(x) != int]

#one-hot encoding if the business in top 5 common category
for item in top5List:
    base_df[item] = base_df['cat'].apply(lambda categories: 'Y' if bool(set([y.strip() for y in categories]).intersection([item])) else 'N')
    #base_df[item] = base_df['cat'].apply(lambda categories: 'Y' if bool(set([y.strip() for y in categories]).intersection(set(item))) else 'N')

# is_open (categorical) change from 1 and 0 to Y and N
base_df['is_open'] = base_df['is_open'].apply(lambda x: 'Y' if x else 'N')

# hours: how many days per week it's open
base_df['hours'] = base_df['hours'].apply(lambda x: len(x.keys()) if x else 0)
base_df.rename(columns={"hours": "days_per_week_open"}, inplace=True)

# user information
base_df = base_df.merge(users[['user_id', 'average_stars', 'review_count', 'friends']],  on='user_id')

# number_of_friends
base_df['friends'] = base_df['friends'].apply(lambda x: len(x.split(',')))
base_df.rename(columns={"friends": "number_of_friends", "review_count": "user_review_count", "average_stars": "average_user_rating"}, inplace=True)


In [119]:
base_df.head()

Unnamed: 0,user_id,business_id,rating,date,average_business_rating,state,business_review_count,is_open,days_per_week_open,cat,Restaurants,Shopping,Food,Home Services,Beauty & Spas,average_user_rating,user_review_count,number_of_friends
0,QyrV8IXzeCBOqS4Pj2DGUQ,bD46Yt0A_zToPPPE3Lv1cw,4.0,2015-05-27,4.0,QC,56,Y,7,"[Pizza, Restaurants, American (Traditional),...",Y,N,Y,N,N,3.91,47,66
1,QyrV8IXzeCBOqS4Pj2DGUQ,EmGHlNoXtpoOovmIchP3aw,5.0,2015-05-13,5.0,QC,36,Y,6,"[Spanish, Breakfast & Brunch, Restaurants]",Y,N,N,N,N,3.91,47,66
2,QyrV8IXzeCBOqS4Pj2DGUQ,GoXOq5SegVlAfaTwtDGTVQ,4.0,2015-05-25,4.0,QC,27,Y,6,"[Restaurants, Chinese]",Y,N,N,N,N,3.91,47,66
3,QyrV8IXzeCBOqS4Pj2DGUQ,TFYkVf814tT6gDUBoSNZRQ,5.0,2014-09-03,5.0,QC,60,Y,6,"[Restaurants, Tapas Bars, Spanish]",Y,N,N,N,N,3.91,47,66
4,QyrV8IXzeCBOqS4Pj2DGUQ,tCp1OkefhamXLeTnmEPUow,4.0,2015-05-13,4.0,QC,76,Y,7,"[Restaurants, Bistros, Brasseries, French, ...",Y,N,N,N,N,3.91,47,66


In [54]:
tips.head()

Unnamed: 0,text,date,compliment_count,business_id,user_id
0,"Great for watching games, ufc, and whatever el...",2014-03-27 03:51:24,0,VaKXUpmWTTWDKbpJ3aQdMw,UPw5DWs_b-e2JRBS-t37Ag
1,Happy Hour 2-4 daily with 1/2 price drinks and...,2013-05-25 06:00:56,0,OPiPeoJiv92rENwbq76orA,Ocha4kZBHb4JK0lOWvE0sg
2,Good chips and salsa. Loud at times. Good serv...,2011-12-26 01:46:17,0,5KheTjYPu1HcQzQFtm4_vw,jRyO2V1pA4CdVVqCIOPc1Q
3,The setting and decoration here is amazing. Co...,2014-03-23 21:32:49,0,TkoyGi8J7YFjA6SbaRzrxg,FuTJWFYm4UKqewaosss1KA
4,Molly is definately taking a picture with Sant...,2012-10-06 00:19:27,0,AkL6Ous6A1atZejfZXn1Bg,LUlKtaM3nXd-E4N4uOk_fQ


# 3. Modeling <a class="anchor" id="3"></a>

## 3.1 Baseline Model <a class="anchor" id="3-1"></a>

## 3.2 ? <a class="anchor" id="3-2"></a>

# 4. Evaluation <a class="anchor" id="4"></a>

## 4.1 Accuracy <a class="anchor" id="4-1"></a>

## 4.2 Coverage <a class="anchor" id="4-2"></a>

# 5. Conclusion <a class="anchor" id="5"></a>