In [2]:
from collections.abc import Sequence
from sklearn import preprocessing
import sklearn.feature_extraction.text as sk_text
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from scipy.stats import zscore
import pandas as pd
import json
import csv
import io
import requests
import shutil
import os
import numpy as np
from sklearn import metrics
# Directory holding the Yelp JSON dumps; the extraction cells below join it
# with the individual dataset file names.
path = "./yelp_dataset/"



In [3]:
# Encode text values to dummy variables (e.g. [1,0,0],[0,1,0],[0,0,1] for red,green,blue).
def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    One indicator column is added per distinct value, named
    ``"<name>-<value>"``; the original column is then removed.
    Nothing is returned.
    """
    indicators = pd.get_dummies(df[name])
    for category in indicators.columns:
        df["{}-{}".format(name, category)] = indicators[category]
    df.drop(columns=name, inplace=True)



# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Replace column ``name`` of ``df`` with integer codes, in place.

    The codes come from a freshly fitted ``LabelEncoder``. The encoder's
    ``classes_`` array is returned so a code can be mapped back to its
    original value.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardize column ``name`` of ``df`` in place as ``(x - mean) / sd``.

    ``mean`` and ``sd`` default to the column's own ``mean()`` and ``std()``;
    pass them explicitly to reuse statistics computed on other data.
    Nothing is returned.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill missing values of column ``name`` with that column's median, in place."""
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill missing values of column ``name`` with ``default_value``, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split ``df`` into ``(x, y)`` float32 arrays.

    ``x`` holds every column except ``target``. If the target column's dtype
    is int64 or int32 it is treated as a class label and ``y`` is its one-hot
    encoding; otherwise ``y`` is the target column itself (regression).
    """
    feature_columns = [column for column in df.columns if column != target]

    # Work out the dtype of the target column.
    target_dtype = df[target].dtypes
    if isinstance(target_dtype, Sequence):
        target_dtype = target_dtype[0]

    if target_dtype in (np.int64, np.int32):
        # Classification: one indicator column per class.
        raw_target = pd.get_dummies(df[target]).values
    else:
        # Regression: use the raw target values.
        raw_target = df[target].values

    # TensorFlow likes 32 bits.
    features = df[feature_columns].values.astype(np.float32)
    return features, raw_target.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    template = "{}:{:>02}:{:>05.2f}"
    return template.format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred, y, sort=True):
    """Plot predicted against expected values as two line series.

    Parameters
    ----------
    pred : array-like
        Model predictions. Any shape is accepted and flattened to 1-D, so the
        ``(n, 1)`` array returned by a single-output Keras model can be passed
        directly.
    y : array-like
        Expected values; flattened to 1-D the same way.
    sort : bool, default True
        When true, rows are ordered by the expected value before plotting.
    """
    # A DataFrame column must be 1-D; ravel both inputs so 2-D predictions,
    # Series and plain lists are all accepted.
    frame = pd.DataFrame({'pred': np.asarray(pred).ravel(),
                          'y': np.asarray(y).ravel()})
    if sort:
        frame = frame.sort_values(by=['y'])
    plt.plot(frame['y'].tolist(), label='expected')
    plt.plot(frame['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is sd or more standard deviations away from its mean.
def remove_outliers(df, name, sd):
    """Drop, in place, rows whose ``name`` value is an outlier.

    A row is dropped when its value lies at least ``sd`` standard deviations
    away from the column mean. Nothing is returned.
    """
    column = df[name]
    distance_from_mean = (column - column.mean()).abs()
    threshold = sd * column.std()
    outlier_labels = df.index[distance_from_mean >= threshold]
    df.drop(outlier_labels, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column ``name`` of ``df`` in place.

    Values are mapped so that ``data_low`` becomes ``normalized_low`` and
    ``data_high`` becomes ``normalized_high``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    name : str
        Column to rescale.
    normalized_low, normalized_high : number
        Bounds of the output range (default -1 and 1).
    data_low, data_high : number, optional
        Bounds of the input range. Each one is resolved independently: a
        bound left as ``None`` is taken from the column's minimum / maximum,
        and a bound that is supplied is always honoured.
    """
    # Resolve each bound separately. Defaulting both inside a single
    # `data_low is None` check would leave data_high as None when only
    # data_low is given, and would overwrite a caller-supplied data_high.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low




In [None]:
# Flatten the review JSON-lines dump into a TSV holding only the columns
# used later: business_id, stars and the review text.
filename_read = os.path.join(path, "yelp_academic_dataset_review.json")

# Both files are managed by `with`, so they are closed even if a line fails
# to parse. newline="" is what the csv module requires for files it writes,
# and the explicit UTF-8 encoding lets review text be written unchanged on
# every platform.
with open(filename_read, encoding="utf-8") as f, \
        open("review_stars.tsv", "w", encoding="utf-8", newline="") as outfile:
    sfile = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    sfile.writerow(['business_id', 'stars', 'text'])
    for line in f:
        row = json.loads(line)
        # Write the text as str. Passing bytes would make csv.writer store
        # the literal "b'...'" repr instead of the review itself.
        sfile.writerow([row['business_id'], row['stars'], row['text']])

df_review = pd.read_csv('review_stars.tsv', delimiter="\t", encoding="utf-8")

df_review.head()

In [None]:
# Flatten the business JSON-lines dump into a TSV holding only the columns
# used later: business_id, stars and review_count.
filename_read = os.path.join(path, "yelp_academic_dataset_business.json")

# Both files are managed by `with`, so they are closed even if a line fails
# to parse. newline="" is what the csv module requires for files it writes.
with open(filename_read, encoding="utf-8") as f, \
        open("business_stars.tsv", "w", encoding="utf-8", newline="") as outfile:
    sfile = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    sfile.writerow(['business_id', 'stars', 'review_count'])
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['stars'], row['review_count']])

df_business = pd.read_csv('business_stars.tsv', delimiter="\t", encoding="utf-8")

df_business.head()

In [None]:
# Attach every review to its business, keeping only businesses with at least
# 20 reviews. Both frames have a 'stars' column, so the merge names them
# 'stars_x' (business rating) and 'stars_y' (individual review rating).
#
# The filter is applied before the merge: the result is then a fresh frame
# that later cells can modify without pandas' chained-assignment warning,
# and reviews of filtered-out businesses are never joined.
df = pd.merge(df_business[df_business['review_count'] >= 20], df_review, on='business_id')

df.head()

In [None]:
# Standardize both star columns and drop review_count, which is not used
# after this point.
#
# assign()/drop() build a new frame instead of mutating `df` in place, which
# avoids assigning into a filtered slice. errors='ignore' lets the cell be
# re-run after review_count has already been removed.
df = df.assign(
    stars_x=zscore(df['stars_x']),
    stars_y=zscore(df['stars_y']),
).drop(columns='review_count', errors='ignore')

df.head()

In [None]:
# Build one document per business by joining all of its reviews.
# The reviews are joined with a space: plain string summation would glue the
# last word of one review to the first word of the next, producing tokens
# that never occurred in any review. Missing texts are treated as empty.
review_text = df['text'].fillna('').astype(str)
df_review_agg = review_text.groupby(df['business_id']).agg(' '.join)
df_ready_for_sklearn = pd.DataFrame({'business_id': df_review_agg.index, 'all_reviews': df_review_agg.values})

In [None]:
# TF-IDF features over the per-business review documents: at most 5000 terms,
# each appearing in at least 2 and at most 40000 documents.
vectorizer = sk_text.TfidfVectorizer(
    max_df=40000,
    min_df=2,
    max_features=5000,
)
matrix = vectorizer.fit_transform(df_ready_for_sklearn['all_reviews'])

In [None]:
# Expand the TF-IDF matrix into a dense NumPy array, one row per document.
# NOTE(review): assuming `matrix` is SciPy-sparse (as TfidfVectorizer.fit_transform
# documents), this allocates rows x columns floats — confirm it fits in memory.
tfidf_data = matrix.toarray()

In [None]:
# Display the (rows, columns) shape of the dense feature array as a sanity check.
tfidf_data.shape

In [None]:
# Regression target: the standardized business rating, in the same row order
# as the TF-IDF features.
#
# The feature rows follow df_ready_for_sklearn (business_id in groupby order,
# and only businesses that have reviews after the merge). Taking stars
# straight from df_business would use file order and could include
# businesses with no feature row, so the rating is looked up per business_id.
stars_by_business = df_business.drop_duplicates('business_id').set_index('business_id')['stars']
y = zscore(df_ready_for_sklearn['business_id'].map(stars_by_business).to_numpy())

In [None]:
# Split into train/test (70% / 30%).
# The target is `y` from the previous cell; `df_y` is never defined in this
# notebook. random_state pins the shuffle so results are reproducible.
x_train, x_test, y_train, y_test = train_test_split(tfidf_data, y, test_size=0.3, random_state=42)

In [None]:
# Fully connected regression network: TF-IDF vector in, one rating out.
# (Sequential, Dense and EarlyStopping are imported in the notebook's first cell.)
model = Sequential()

model.add(Dense(100, input_dim=tfidf_data.shape[1], activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')

# Stop once val_loss has not improved by at least 1e-3 for 5 epochs.
# restore_best_weights=True puts back the weights from the best epoch;
# without it the model keeps the weights from the last epoch trained.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto',
                        restore_best_weights=True)

# NOTE(review): the test split doubles as the validation set, so early
# stopping is tuned on the same data that is scored in the next cell.
model.fit(x_train, y_train, validation_data=(x_test, y_test), callbacks=[monitor], verbose=2, epochs=1000)


In [None]:
# Predict on the held-out split.
pred = model.predict(x_test)

# Mean squared error, with arguments in scikit-learn's (y_true, y_pred) order.
MSEscore = metrics.mean_squared_error(y_test, pred)
print("Final score (MSE): {}".format(MSEscore))

# Root mean squared error, derived from the MSE computed above.
RMSEscore = np.sqrt(MSEscore)
print("Final score (RMSE): {}".format(RMSEscore))