# Recommendation System for Hoboken Reviews(yelp)
### - Methodology: Basic methods (3) + Machine learning methods (5)
### - Objective: recommend 3 more restaurants to each customer based on historical data.

In [None]:
#py2.7
#import graphlab 

In [None]:
#py3.6
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mean
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
#%%python
#import nltk
#nltk.download()
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

## I. Data EDA and Preparation

In [None]:
df=pd.read_csv('Hoboken_restaurants_reviews.csv')
df.head()

In [None]:
del df['Unnamed: 0']
del df['Unnamed: 0.1']

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().values.any()

In [None]:
df.isnull().any()

In [None]:
df[df.restaurant_price.isnull() == True]

Only some price values are missing, because these values do not exist on Yelp. So I will just leave them as NA.

#### data definition:
- user_id: Unique id for each customer
- user_name: customer's name
- user_rating: original rating for one restaurant per time
- user_text: customer's review for one restaurant per time
- restaurant_name: unique name for each restaurant
- restaurant_price: degree of cheap or expensive of one restaurant
- restaurant_type: the style and theme of one restaurant

### data processing one by one

#### user_id

In [None]:
user_id = pd.DataFrame(df.user_id.value_counts().head(10))
user_id.reset_index(level=0, inplace = True)
user_id.columns = ['user_id', 'count']
user_id

In [None]:
f, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x='count', y='user_id', data=user_id, color="turquoise", ax = ax)
plt.show()

Multiple customers have left reviews for restaurants more than once. This is valuable for the recommendation system.

#### user_name

In [None]:
user_name = pd.DataFrame(df.user_name.value_counts().head(10))
user_name.reset_index(level=0, inplace = True)
user_name.columns = ['user_name', 'count']
user_name

In [None]:
f, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x='count', y='user_name', data=user_name, color="green", ax = ax)
plt.show()

In [None]:
user_name = pd.DataFrame(df.user_name.value_counts().tail(10))
user_name.reset_index(level=0, inplace = True)
user_name.columns = ['user_name', 'count']
user_name

Since different customers might have the same name, in order to avoid errors, I will not use user_name in the recommendation system; user_id is used to identify customers instead.

### user_rating

In [None]:
df.user_rating.unique()

I would like to convert the string into integer format.

In [None]:
df.user_rating = df.user_rating.replace({'5.0 star rating':5,
                                         '4.0 star rating':4, 
                                         '3.0 star rating':3,
                                         '2.0 star rating':2,
                                         '1.0 star rating':1
                                        })

In [None]:
rating = pd.DataFrame(df.user_rating.value_counts().head())
rating.reset_index(level=0, inplace = True)
rating.columns=['rating', 'count']
rating

In [None]:
sns.distplot(df.user_rating, color = 'tomato')
plt.show()

- restaurant name

In [None]:
len(df.restaurant_name.unique())

In [None]:
# Number of reviews per restaurant, as a two-column frame
# (restaurant_name, count) whose row labels are converted to strings.
restaurant_name = (
    df.groupby(df['restaurant_name'])
    .user_id.count()
    .to_frame()
    .reset_index()
    .rename(index=str, columns={"user_id": "count"})
)

In [None]:
top_10 = restaurant_name.sort_values(by=['count'], ascending=False).head(10)

In [None]:
f, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x='count', y='restaurant_name', data=top_10, color="violet", ax = ax)
plt.show()

In [None]:
restaurant_name.sort_values(by=['count']).head(10)

- restaurant_rating

In [None]:
df.restaurant_rating.unique()

In [None]:
df.restaurant_rating = df.restaurant_rating.replace({'5.0 star rating':5,
                                                     '4.5 star rating':4.5,
                                                     
                                                     '4.0 star rating':4,
                                                     '3.5 star rating':3.5,
                                                     '3.0 star rating':3,
                                                     '2.5 star rating':2.5,
                                                     '2.0 star rating':2,
                                                     '1.5 star rating':1.5,
                                                     '1.0 star rating':1
                                                    })

- restaurant_price

In [None]:
df.restaurant_price.unique()

In [None]:
df.restaurant_price = df.restaurant_price.replace({'$':1,
                                                     '$$':2, 
                                                     '$$$':3,
                                                     '$$$$':4})

- restaurant_type

In [None]:
# Punctuation removed from the raw type string before it is split into words.
_TYPE_PUNCTUATION = str.maketrans('', '', ',&()')


def _clean_restaurant_type(raw_type):
    """Return the lower-cased, de-duplicated words of one restaurant_type value.

    Commas, ampersands and parentheses are removed, the text is lower-cased
    and split on whitespace, and duplicate words are dropped. The words are
    sorted so the result is identical on every run (iterating a plain set of
    strings has no guaranteed order).
    """
    words = raw_type.translate(_TYPE_PUNCTUATION).lower().split()
    return ' '.join(sorted(set(words)))


# Iterate over the column values directly instead of df.restaurant_type.loc[i],
# so the cleaning does not depend on df having a default 0..n-1 index.
li = [_clean_restaurant_type(raw_type) for raw_type in df.restaurant_type]


In [None]:
df['restaurant_type'] = li

- data cleaning finish

In [None]:
df.restaurant_type[0]

In [None]:
df.to_csv('Hoboken_restaurants_reviews_cleaned.csv', index=False)

## II. Reshape Dataset

In [None]:
df = pd.read_csv('Hoboken_restaurants_reviews_cleaned.csv')
train_raw = df[['user_id','restaurant_name', 'user_rating']]

In [None]:
len(train_raw)

In [None]:
gb_df = pd.DataFrame(train_raw.groupby(['user_id', 'restaurant_name']).mean())

In [None]:
gb_df = gb_df.reset_index()

In [None]:
len(gb_df.user_id.unique())

In [None]:
gb_df.head()

In [None]:
len(gb_df)

In [None]:
gb_df.to_csv('groupby_dataset.csv', index=False)

In [None]:
train_df = gb_df.pivot(index = 'user_id', columns ='restaurant_name', values = 'user_rating').fillna(0)

In [None]:
len(train_df)

In [None]:
train_df.to_csv('recommendation_dataset.csv',index=True)

## III. Basic Methods:
- co-occurrence matrices
- collaborative filtering
- matrix decomposition

### III.1 co-occurrence Matrices

- Background:
This is the simplest method I use to build the recommendation system.   
The underlying assumption is that a customer will be interested in the restaurants visited by other customers who have been to the same restaurants as this customer.

- Algorithms/equations:  
    - Co-occurrence Matrix = T(A) * A  
    - recommender = Co-occurrence Matrix * u

- Advantages:
    - Simple
    - Build with dataset in any size

- Limitations:
    - only based on the past behavior
    - ignore the user's rating for restaurants

- Step 1: data preparation

In [None]:
df = pd.read_csv('recommendation_dataset.csv')

In [None]:
def get_dataset_co_occurence(df):
    """Turn a user x restaurant rating table into a visited / not-visited table.

    Every column except ``user_id`` is replaced by 1 where the rating is
    greater than 0 and by 0 otherwise. Comparing against 0 (rather than
    mapping the exact values 5, 4, 3 and 2) also covers fractional ratings,
    which appear when a user's several reviews of one restaurant were averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating table; a ``user_id`` column, if present, is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index; ``df`` is not modified.
    """
    binary = df.copy()
    rating_cols = [column for column in binary.columns if column != 'user_id']
    binary[rating_cols] = (binary[rating_cols] > 0).astype(int)
    return binary

In [None]:
co_occurence_dataset = get_dataset_co_occurence(df)

- Step 2: co-occurrence matrix

In [None]:
def get_co_occurrence_matrices(co_occurence_dataset):
    """Build the restaurant-by-restaurant co-occurrence matrix T(A) * A.

    The first column of ``co_occurence_dataset`` is skipped and the remaining
    columns are used as the matrix A. The diagonal of the product is set to 0
    so that a restaurant does not count as co-occurring with itself.

    Returns an ``np.matrix``.
    """
    visits = np.matrix(co_occurence_dataset.iloc[:, 1:])
    pair_counts = visits.T * visits
    np.fill_diagonal(pair_counts, 0)
    return pair_counts

In [None]:
co_occurence_matrix = get_co_occurrence_matrices(co_occurence_dataset)

- Step 3: recommender for each user

In [None]:
def get_co_occurence_result(co_occurence_dataset,co_occurence_matrix, user_idx=None, user_id = None, top_n=None):
    """Rank the restaurants one user has not visited by co-occurrence score.

    Parameters
    ----------
    co_occurence_dataset : pandas.DataFrame
        Visited / not-visited table; the first column is skipped and a
        ``user_id`` column is required when looking a user up by id.
    co_occurence_matrix : np.matrix or np.ndarray
        Square restaurant-by-restaurant co-occurrence matrix whose order
        matches the dataset's restaurant columns.
    user_idx : int, optional
        Positional row of the user in ``co_occurence_dataset``.
    user_id : optional
        Value of the ``user_id`` column; takes precedence over ``user_idx``
        when both are given (the first matching row is used).
    top_n : int, optional
        Number of restaurants to return; ``None`` returns every candidate.

    Returns
    -------
    list
        Restaurant column names, best score first, excluding restaurants
        whose entry for this user equals 1.

    Raises
    ------
    ValueError
        If neither ``user_idx`` nor ``user_id`` is given, or ``user_id`` is
        not present in the dataset.
    """
    if user_id is not None:
        matches = co_occurence_dataset[co_occurence_dataset.user_id == user_id]
        if matches.empty:
            raise ValueError('user_id %r not found in co_occurence_dataset' % (user_id,))
        user = matches.iloc[0, 1:]
    elif user_idx is not None:
        user = co_occurence_dataset.iloc[user_idx, 1:]
    else:
        raise ValueError('either user_idx or user_id must be provided')

    # A row taken from a mixed-dtype frame is object-typed; make it numeric.
    user_vector = np.asarray(user, dtype=float)

    # Explicit matrix product, so the result is the same whether the
    # co-occurrence matrix is an np.matrix or a plain ndarray.
    scores = np.asarray(user_vector @ np.asarray(co_occurence_matrix)).reshape(-1)

    user_result = pd.DataFrame(
        {'visited': user_vector, 'recommender': scores},
        index=user.index,
    )

    # Only rank the restaurants the user has not attended yet.
    candidates = user_result[user_result['visited'] != 1]
    ranked = candidates.sort_values(by='recommender', ascending=False)
    return list(ranked.head(top_n).index.values)

In [None]:
def store_result(co_occurence_dataset, co_occurence_matrix):
    """Compute top-3 co-occurrence recommendations for every user and save them.

    Writes ``co_occurence_result.csv`` with one row per row of
    ``co_occurence_dataset``: the user's ``user_id`` and the list of three
    recommended restaurant names.

    The ``user_id`` column is taken from the dataset that was passed in,
    rather than re-read from ``recommendation_dataset.csv``, so the ids always
    line up with the recommendations and no extra file is required.

    Returns None.
    """
    # Address each user by row position: this avoids a linear user_id search
    # per user and does not depend on the frame's index labels.
    recommendation_li = [
        get_co_occurence_result(co_occurence_dataset, co_occurence_matrix, user_idx=i, top_n=3)
        for i in range(len(co_occurence_dataset))
    ]

    print('get list!')
    # create export dataset: first column user_id, second column recommendation
    df = pd.DataFrame({'user_id': list(co_occurence_dataset.user_id)})
    df['recommendation'] = recommendation_li
    # save
    df.to_csv('co_occurence_result.csv')
    return None

In [None]:
#%%time
#store_result(co_occurence_dataset, co_occurence_matrix)

### III.2 Collaborative Filtering
- Background: My second recommendation system uses collaborative filtering. This method tries to find people with similar interests, analyze their behavior, and recommend the same items to the user. There are two basic approaches in collaborative filtering: user-based collaborative filtering and item-based collaborative filtering.
  
- Basically, both of these methods contain two steps:
    - First Step: Find out which users/items in the database are similar to the given user/item.  
    - Second Step: Use those similar users/items to predict the rating the given user would give to an item, weighting each of them by how similar it is to the given user/item.
    

- Algorithms/Equations:
    - Similarity Calculation: 
        1. cosine similarity:
        ![](https://wikimedia.org/api/rest_v1/media/math/render/svg/a71c4add4abded66efd42b202c76f6a59944a587)
        2. Jaccard Similarity: 
        ![](https://wikimedia.org/api/rest_v1/media/math/render/svg/d54c3ac9fb70b8e3d76166589c880fc9df119970)
        3. Pearson Similarity:
        ![](https://i.stack.imgur.com/KaM0y.gif)
    - Recommend_items: simple **weighted arithmetic mean** according to the degree of similarity to fill empty cells in the table.
    
      
- Advantages:
    - Take the rating into account  
    
  
- Disadvantages:
    - Only focuses on previous behavior

- Step 1: Data preparation

In [None]:
# Load the per-(user_id, restaurant_name) rating table and hold out 25% of
# its rows as a test set.
df = pd.read_csv('groupby_dataset.csv')
# NOTE(review): no random_state is passed, so the split differs between runs.
train_data, test_data = train_test_split(df, test_size=0.25)

# Wrap both splits in graphlab SFrames for the graphlab recommenders below.
# NOTE(review): `graphlab` is only mentioned in a commented-out py2.7 import
# at the top of this notebook, so these lines raise NameError unless graphlab
# is imported in the running kernel -- confirm the intended environment.
train_data_gl = graphlab.SFrame(train_data)
test_data_gl = graphlab.SFrame(test_data)

- Step 2: calculate similarity 
    - Three Similarity
        1. cosine similarity; 
        2. Jaccard Similarity; 
        3. Pearson Similarity
    - Two approaches
        1. user-item collaborative filtering
        2. item-item collaborative filtering

Before going forward to collaborative filtering, I will try popularity_recommender
    - Arg:
        - train_data: the SFrame which contains the required data
        - user_id: the column name which represents each user ID
        - item_id: the column name which represents each item to be recommended
        - target: the column name representing scores/ratings given by the user

In [None]:
popularity_model = graphlab.popularity_recommender.create(train_data_gl, 
                                                          user_id='user_id', 
                                                          item_id='restaurant_name', 
                                                          target='user_rating')

In [None]:
popularity_recomm = popularity_model.recommend(users=range(1,6),k=3)

In [None]:
popularity_recomm.print_rows(num_rows=28)

#### All the results are same. 

In [None]:
train_data.groupby(by='restaurant_name')['user_rating'].mean().sort_values(ascending=False).head(20)

#### Since all the recommended restaurants have an average rating of 5, the popularity recommender is not accurate enough. So I will come back to collaborative filtering.

##### Cosine similarity

In [None]:
Cosine_model = graphlab.item_similarity_recommender.create(train_data_gl, user_id='user_id', 
                                                             item_id='restaurant_name', 
                                                             target='user_rating', 
                                                             similarity_type='cosine')

##### Jaccard Similarity

In [None]:
Jaccard_model = graphlab.item_similarity_recommender.create(train_data_gl, user_id='user_id', 
                                                             item_id='restaurant_name', 
                                                             target='user_rating', 
                                                             similarity_type='jaccard')

##### Pearson Similarity

In [None]:
Pearson_model = graphlab.item_similarity_recommender.create(train_data_gl, user_id='user_id', 
                                                             item_id='restaurant_name', 
                                                             target='user_rating', 
                                                             similarity_type='pearson')

- Step 3: Predict the rating and recommend top 3 rating restaurants

In [None]:
Cosine_recommendation = Cosine_model.recommend(k=3,verbose=False)

In [None]:
Cosine_recommendation.save('Cosine_recommendation_result.csv')

In [None]:
Jaccard_recommendation = Jaccard_model.recommend(k=3,verbose=False)
Jaccard_recommendation.print_rows(num_rows=30)

In [None]:
Pearson_recommendation = Pearson_model.recommend(k=3,verbose=False)
Pearson_recommendation.print_rows(num_rows=30)

- Step 4: Accuracy Evaluation
Recall: what ratio of the items that a user likes were actually recommended?

Precision:Out of all the recommended items, how many the user actually liked?


In [None]:
model_performance = graphlab.compare(test_data_gl, [popularity_model, Cosine_model, Jaccard_model, Pearson_model])

#### Summary:

The cosine and jaccard model perform better than the others for this dataset. And cosine model is slightly better than jaccard model. So I will use cosine model to build the second recommendation system.

### III.3 Matrix Factorization via Singular Value Decomposition
- Background: Matrix factorization is the third method I used to build a recommendation system. The assumption of matrix factorization is that each user tends to like different features of an item. For example, in the Hoboken restaurant dataset, the features might be the theme and style of the restaurant, its environment, service quality, food quality and so on. Different users have different expectations and preferences for different features. In addition, each item (a restaurant in this case) has each feature to a different degree. We assume that a user will rate a new restaurant higher when it has more of the features this user likes, and lower when it has fewer of them. 

- Algorithms/equation:
    - Basic Algorithm:
     ![](https://raw.githubusercontent.com/houlaizhexq/images/master/%E9%A2%84%E6%B5%8B%E5%BE%97%E5%88%86.jpg)
     ![](https://www.packtpub.com/sites/default/files/Article-Images/B01900_4.png)
            - Rui: user u's rating for item i
            - Qi:item i's similarity with a feature k
            - Pu:user u's preference for a feature k 
    - Algorithm with bias:
    ![](https://raw.githubusercontent.com/houlaizhexq/images/6ebc5f5aa3b6a31dfcb6091fbb16cf7e81a2ed8c/%E5%B8%A6%E5%81%8F%E5%B7%AE%E4%BF%AE%E6%AD%A3%E7%9A%84%E9%A2%84%E6%B5%8B.jpg)
    - Algorithm with historical feedback:
    ![](https://raw.githubusercontent.com/houlaizhexq/images/6ebc5f5aa3b6a31dfcb6091fbb16cf7e81a2ed8c/%E5%B8%A6%E6%9C%89%E5%8E%86%E5%8F%B2%E5%92%8C%E6%A0%87%E7%AD%BE%E7%9A%84%E8%AF%AF%E5%B7%AE.jpg)
    - Algorithm with change by time:
    ![](https://raw.githubusercontent.com/houlaizhexq/images/6ebc5f5aa3b6a31dfcb6091fbb16cf7e81a2ed8c/%E5%8A%A0%E5%85%A5%E6%97%B6%E9%97%B4%E5%9B%A0%E7%B4%A0%E5%90%8E%E7%9A%84%E4%BC%B0%E8%AE%A1%E8%AF%84%E5%88%86.jpg)
    

- Advantages:
    - Massive dataset
    - easy to add new parameter(bias, historical feedback, time changes and so on)

- Disadvantages:
    - Computation resources


- Step 1: Data Preparation

In [None]:
train_set = pd.read_csv('recommendation_dataset.csv')

In [None]:
train_set.head()

In [None]:
matrices =np.matrix(train_set.iloc[:,1:])

In [None]:
matrices

- Step 2: Extract features k  
    I could simply extract the features k from user_id, restaurant_name and rating, or try a more complicated method incorporating NLP. 
    - Rating only
    - Review text based

#### Rating Only
reduce the dimension of original matrices and got two new matrices.
- new matrix 1: represent user i's preference of different features k
- new matrix 2: represent the item j's similarity to features k

In [None]:
k_model = NMF(n_components=5, init='random', random_state=0)

In [None]:
W = k_model.fit_transform(matrices)
H = k_model.components_

- Step 3: Recommender

In [None]:
nR = np.matrix(np.dot(W,H))

In [None]:
nR.shape

In [None]:
result_df = pd.DataFrame(nR)

In [None]:
result_df.columns = train_set.columns[1:]

In [None]:
result_df.insert(0, 'user_id', train_set.iloc[:,0])

In [None]:
result_df.head()

In [None]:
train_set.head()

- Step 4: Export the Result

In [None]:
recommendation_top_3 = list(result_df.iloc[3,1:].sort_values(ascending=False).head(3).index)

In [None]:
recommendation_top_3

In [None]:
# For every user: blank out the restaurants they have already rated, then keep
# the three restaurants with the highest predicted rating.
recommendation_li = []
for i in range(len(result_df)):
    # Restaurants with a non-zero entry in the original rating matrix.
    user_ratings = train_set.iloc[i, 1:]
    attended_res_li = list(user_ratings[user_ratings != 0].index)

    # No blanket try/except around this lookup: a failure here would leave
    # visited restaurants unmasked, so it must surface instead of being hidden.
    if attended_res_li:
        # NOTE(review): visited restaurants are scored 0; if the predicted
        # scores can themselves be 0, a visited restaurant may tie with an
        # unvisited one -- confirm whether a lower sentinel is wanted.
        result_df.loc[i, attended_res_li] = 0

    recommendation_top_3 = list(result_df.iloc[i, 1:].sort_values(ascending=False).head(3).index)
    recommendation_li.append(recommendation_top_3)

In [None]:
recommendation_li[0:10]

In [None]:
export_df = pd.DataFrame(train_set.user_id)
export_df.head()

In [None]:
export_df['Recommendation'] = recommendation_li

In [None]:
export_df.head()

In [None]:
export_df.to_csv('matrix_factorization_result.csv')

## IV. Deep Learning Methods:
I have already built three recommendation systems with popular recommendation algorithms. Now, I will try to use neural networks to build a more advanced recommendation system.

Objective: Top 3 scores restaurants recommendation

Output:
    - estimated rating for each of 302 restaurants
Independent variables:
    - user_rating
    - restaurant rating
    - restaurant price
    - restaurant type
    - reviews

In [None]:
from sklearn.preprocessing import Normalizer

### Data Preparation

In [None]:
df = pd.read_csv('Hoboken_restaurants_reviews_cleaned.csv')
df.head()

In [None]:
nn_df = df[['user_id','restaurant_name','user_rating','restaurant_rating','restaurant_price','restaurant_type']]

In [None]:
nn_df = nn_df.dropna()

In [None]:
nn_df.head()

#### Normalizer

In [None]:
def normalize_variable(nn_df,variable_names):
    """Scale the selected columns of ``nn_df`` with sklearn's ``Normalizer``.

    The columns named in ``variable_names`` are stacked into an array with
    one row per variable before being handed to ``Normalizer``, so each
    variable is normalised across all records rather than each record across
    its variables. The result is transposed back before it is returned.

    Returns an array of shape (number of records, number of variables).
    """
    stacked = np.asarray([nn_df[name] for name in variable_names])
    scaled = Normalizer().fit_transform(stacked)
    return scaled.T

#### NLP- tf-idf

In [None]:
# Shared NLTK helpers: an English Snowball stemmer and a tokenizer that keeps
# runs of lower-case letters and apostrophes.
stemmer = SnowballStemmer("english")
tokenizer = RegexpTokenizer("[a-z']+")


def tokenize(text):
    """Split ``text`` into tokens and return the stem of each token, in order."""
    return list(map(stemmer.stem, tokenizer.tokenize(text)))

def get_tf(data, idf, max_df=1.0, min_df=1, ngram_range=(1,1)):
    """Fit a term-weighting vectorizer on ``data`` and return it with the matrix.

    Parameters
    ----------
    data : iterable of str
        Raw documents.
    idf : bool
        True for TF-IDF features. False for term-frequency features without
        the inverse-document-frequency weighting; previously this case failed
        with an UnboundLocalError because no vectorizer was created.
    max_df, min_df, ngram_range
        Passed through to ``TfidfVectorizer``.

    Returns
    -------
    (vectorizer, matrix)
        The fitted ``TfidfVectorizer`` and the document-term matrix returned
        by ``fit_transform``.
    """
    m = TfidfVectorizer(
        max_df=max_df,
        min_df=min_df,
        stop_words='english',
        ngram_range=ngram_range,
        tokenizer=tokenize,
        lowercase=True,
        use_idf=bool(idf),
    )
    d = m.fit_transform(data)
    return m, d

In [None]:
def get_nn_df(nn_df, norm_result, tfidf_d, variable_names):
    """Assemble the modelling table: text features, scaled variables, target.

    The sparse matrix ``tfidf_d`` is densified into a frame, one column per
    entry of ``variable_names`` is appended from the matching column of
    ``norm_result``, the rows are labelled with ``nn_df.user_id`` and the
    last column ``output`` holds ``nn_df.restaurant_name``.
    """
    features = pd.DataFrame(tfidf_d.toarray())
    scaled = pd.DataFrame(norm_result)
    for position, column in enumerate(variable_names):
        features[column] = scaled.iloc[:, position]
    features.index = nn_df.user_id
    features['output'] = nn_df.restaurant_name.tolist()
    return features

### Deep Learning - NN
    - NN  @ sklearn
    - NN @ tensorflow

Input:
 - Customer's attended restaurants's tag

Output:
 - probability that they will be interested in each restaurant with a similar tag.
 

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
def get_recommendation_df(model, X_test):
    """Return the class probabilities of the first row of ``X_test`` as a frame.

    The frame has one row per entry of ``model.classes_`` with the columns
    ``restaurant_name`` and ``probability``; only the first row of
    ``model.predict_proba(X_test)`` is used.
    """
    first_row_probability = model.predict_proba(X_test)[0]
    columns = {
        'restaurant_name': model.classes_,
        'probability': first_row_probability,
    }
    return pd.DataFrame(columns)

In [None]:
def _to_builtin(value):
    """Convert a numpy scalar to the matching Python object; pass others through."""
    return value.item() if isinstance(value, np.generic) else value


def get_output(top_n, y_test,recommendation_df):
    """Format the ``top_n`` most probable restaurants other than ``y_test``.

    Parameters
    ----------
    top_n : int
        Number of restaurants to include.
    y_test
        Restaurant name to exclude (rows whose ``restaurant_name`` equals it).
    recommendation_df : pandas.DataFrame
        Frame with ``restaurant_name`` and ``probability`` columns.

    Returns
    -------
    str
        The values of the selected rows, highest probability first, as a flat
        comma-separated string of reprs, e.g. ``"'A', 0.5, 'B', 0.25"``; an
        empty string when no row is left.

    The string is built value by value instead of stripping ``[`` and ``]``
    from ``str(list)``, so brackets inside a restaurant name are kept, and
    numpy scalars are converted first so they print as plain numbers.
    """
    candidates = recommendation_df[recommendation_df.restaurant_name != y_test]
    best = candidates.sort_values(by='probability', ascending=False).head(top_n)
    pieces = []
    for row in best.itertuples(index=False, name=None):
        pieces.extend(repr(_to_builtin(value)) for value in row)
    return ', '.join(pieces)

In [None]:
variable_names = ['user_rating', 'restaurant_rating', 'restaurant_price']

norm_result = normalize_variable(nn_df,variable_names)

tfidf_m, tfidf_d = get_tf(nn_df['restaurant_type'], idf=True, max_df=0.5, min_df=10)

df = get_nn_df(nn_df, norm_result, tfidf_d, variable_names)

X_train = df.iloc[:,:-1]
y_train = df.output

In [None]:
df.to_csv('normalized_neural_network_dataset.csv')

In [None]:
nn= MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(16, 3), random_state=1)

In [None]:
%%time
nn = nn.fit(X_train, y_train)

In [None]:
top_n = 3
X_test = np.asarray(X_train.iloc[2150,:])
X_test = X_test.reshape(-1,1).T
y_test = y_train.iloc[2150]
recommendation_df = get_recommendation_df(nn, X_test)
output = get_output(top_n,y_test,recommendation_df)
output

In [None]:
len(X_train)

In [None]:
nn_df.head()

In [None]:
export_df = pd.DataFrame(nn_df.user_id)
export_df.head()

In [None]:
top_n = 3

In [None]:
def store_the_recommendation(X_train, y_train,nn_df, top_n, nn):
    """Write every row's top-``top_n`` recommendation string to a CSV file.

    For each row of ``X_train`` the model ``nn`` scores all restaurants, the
    row's own restaurant (``y_train``) is excluded, and the formatted result
    is stored next to ``nn_df.user_id`` in ``Neural_Network_result.csv``.

    Returns None.
    """
    def recommend_for_row(position):
        # The model expects a 2-D input, so the row becomes a 1 x n array.
        features = np.asarray(X_train.iloc[position, :]).reshape(1, -1)
        ranked = get_recommendation_df(nn, features)
        return get_output(top_n, y_train.iloc[position], ranked)

    export_df = pd.DataFrame(nn_df.user_id)
    export_df['Recommendation'] = [
        recommend_for_row(position) for position in range(len(X_train))
    ]
    export_df.to_csv('Neural_Network_result.csv', index=False)
    return None

In [None]:
store_the_recommendation(X_train, y_train ,nn_df, top_n, nn)

In [None]:
df = pd.read_csv('Neural_Network_result.csv')

In [None]:
df.head()

In [None]:
df.to_csv('Neural_Network_result.csv', index=False)

# Evaluating the Recommendation System
There are multiple criteria for evaluating and improving the performance of a recommendation system. However, given the limitations of this project, I could only test the accuracy.

- Customer Satisfaction: click possibility, duration time, Conversion rate

- accuracy: MSE, Return rate, accuracy

- covering rate: cross entropy, Gini Coefficient

- diversity: features similarity

- creative/innovative: new items, the items the user did not purchase

- surprise: at first glance the item seems irrelevant, but it surprises the user in a positive way.

- confidence: interactive

- real time updated and cold start

- fraud detection

- profit