## 1. Import Modules

In [0]:
!pip install xgboost

In [0]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GroupShuffleSplit
import xgboost as xgb
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import ndcg_score

## 2. Read Files

In [0]:
# accessing cleaned file
bizDF = (spark.read
  .option("inferSchema", True)
  .json("/FileStore/tables/Project_Data_Extract/business_filtered.json"))

bizDF.display()

address,business_id,business_perf,categories,city,cluster,latitude,longitude,name,postal_code,review_ave_stars,review_count,review_ss,state,tips_count,tips_ss
3164 NW 185th Ave,Agq4zoNLSIpT1_ZJbnrvww,Good,"Food, Donuts",portland,1,45.5429252,-122.8662879,Donut Palace,97229,4.285714285714286,173,0.8146031746031747,OR,13,0.4414153846153846
187 Elm St,bxy3khT-2R66tcdKjFa2pw,Good,"Restaurants, Seafood, Italian, Pizza",somerville,1,42.3930843,-71.1202673,Posto,02144,4.1866666666666665,910,0.8298826666666667,MA,90,0.3474077777777777
"12251 Number 1 Road, Suite 140",3KqpiLDAjeeMmZeU-Il_ng,Poor,"Japanese, Restaurants, Sushi Bars",richmond,2,49.1245685,-123.1815968,Yokohama Teppanyaki,V7E 1T6,2.083333333333333,80,0.2791583333333333,BC,8,0.3193
1487 Dorchester Ave,TYDCjEMga3cm7k638hKhTg,Good,"Chinese, Vietnamese, Restaurants",dorchester,1,42.2999809,-71.0604396,Hien Vuong Restaurant,02122,4.142857142857143,36,0.7781142857142858,MA,9,0.2989666666666667
6815 E Broad St,KpdYtK2KtLxEzYNW38XKZQ,Poor,"Restaurants, Fast Food, Mexican",columbus,2,39.9806544587,-82.821739,Chipotle Mexican Grill,43213,1.7857142857142858,43,0.0911285714285714,OH,12,-0.0142083333333333
"730 W Sand Lake Rd, Ste 30",f3teByaeIKPTYetAawUtxg,Good,"Restaurants, Breakfast & Brunch, Cuban, Latin American",orlando,1,28.4500355797,-81.3879469963,Café Pinar,32809,4.580645161290323,167,0.7474806451612903,FL,25,0.528368
1410 State Rd 436,GrLNk_EE831QDAhaSAZpWA,Good,"Latin American, Cocktail Bars, Restaurants, Colombian, Gastropubs, Nightlife, Bars",casselberry,1,28.627566,-81.316263,Capachos Bar & Grill,32707,3.888888888888889,17,0.8434055555555555,FL,4,0.316575
4315 W Lake Mary Blvd,Hhc8Ix8sygBjqFbiO1c1TQ,Average,"Mexican, Fast Food, Restaurants",lake mary,0,28.7556732126,-81.350758338,Chipotle Mexican Grill,32746,2.75,60,0.2922875,FL,28,0.3526178571428571
197 Massachusetts Ave,zeqEqgEmTY3c7HkJmHrw2g,Average,"Italian, Sandwiches, Fast Food, Pizza, Restaurants",lexington,0,42.4261588,-71.196419,Nick's Place,02420,3.1666666666666665,89,0.5297499999999999,MA,10,0.46006
"520 SW 6th Ave, Ste 105",O_BAT_rvszHYBNEM6z0tcQ,Poor,"Sandwiches, Desserts, Food, Salad, Fast Food, Restaurants",portland,2,45.5199352739,-122.677917119,Subway,97204,2.0,20,-0.3145200000000001,OR,6,0.33155


In [0]:
# accessing cleaned file
aspectDF = (spark.read
  .option("inferSchema", True)
  .json("/FileStore/tables/Project_Data_Extract/aspect.json"))

aspectDF.display()

aspect_category,ave_stars,business_id,city,count,review_ss_x,state,tips_count,tips_ss
AMBIENCE#GENERAL,5.0,--UNNdnHRhsyFUbDgumdtQ,portland,3,0.8706666666666667,OR,126,0.3122190476190477
DRINKS#QUALITY,5.0,--UNNdnHRhsyFUbDgumdtQ,portland,1,0.6597,OR,126,0.3122190476190477
FOOD#QUALITY,4.288461538461538,--UNNdnHRhsyFUbDgumdtQ,portland,104,0.8278605769230769,OR,126,0.3122190476190477
RESTAURANT#GENERAL,3.0,--UNNdnHRhsyFUbDgumdtQ,portland,2,0.307,OR,126,0.3122190476190477
SERVICE#GENERAL,4.666666666666667,--UNNdnHRhsyFUbDgumdtQ,portland,6,0.5644666666666667,OR,126,0.3122190476190477
FOOD#QUALITY,4.3125,--bbZa1KPYSmW0X4o3TUQw,vancouver,16,0.95283125,BC,3,0.5765666666666666
AMBIENCE#GENERAL,4.75,--hkbIWgBKBOZq4VcNwdhQ,everett,4,0.767975,MA,38,0.4749710526315788
FOOD#QUALITY,4.038461538461538,--hkbIWgBKBOZq4VcNwdhQ,everett,52,0.8344673076923078,MA,38,0.4749710526315788
RESTAURANT#GENERAL,5.0,--hkbIWgBKBOZq4VcNwdhQ,everett,1,0.7902,MA,38,0.4749710526315788
SERVICE#GENERAL,3.4,--hkbIWgBKBOZq4VcNwdhQ,everett,5,0.3452199999999999,MA,38,0.4749710526315788


In [0]:
aspect_pdf=aspectDF.toPandas()
biz_pdf=bizDF.toPandas()

## 3. Merge Files

In [0]:
 aspect_GA=aspect_pdf.loc[aspect_pdf['state'] == "GA" ]

In [0]:
aspect_GA

Unnamed: 0,aspect_category,ave_stars,business_id,city,count,review_ss_x,state,tips_count,tips_ss
15,FOOD#QUALITY,4.000000,-0JXXCMUpe29dQ-f6pG9OA,atlanta,1,0.883400,GA,4,0.609325
16,SERVICE#GENERAL,5.000000,-0JXXCMUpe29dQ-f6pG9OA,atlanta,2,0.937700,GA,4,0.609325
17,FOOD#QUALITY,4.500000,-0OWS89ebRdvzOQkjptyEw,atlanta,2,0.955100,GA,3,0.467833
18,SERVICE#GENERAL,5.000000,-0OWS89ebRdvzOQkjptyEw,atlanta,1,0.945900,GA,3,0.467833
25,AMBIENCE#GENERAL,3.833333,-0d5juVzvVE5Ln86QGt5iA,atlanta,6,0.661050,GA,5,0.545600
...,...,...,...,...,...,...,...,...,...
88989,RESTAURANT#GENERAL,1.000000,zzin1d1oHi81GuI0ufo1VA,atlanta,1,0.357000,GA,98,0.359910
88990,SERVICE#GENERAL,2.500000,zzin1d1oHi81GuI0ufo1VA,atlanta,2,0.406500,GA,98,0.359910
88991,FOOD#QUALITY,3.777778,zzlkjDG9Rv8Jn-vSolMgyw,atlanta,18,0.810039,GA,16,0.436350
88992,RESTAURANT#GENERAL,4.000000,zzlkjDG9Rv8Jn-vSolMgyw,atlanta,2,0.837950,GA,16,0.436350


In [0]:
aspect_GA = aspect_GA.merge(biz_pdf[["business_id","name"]])

In [0]:
aspect_GA["aspect_category"].value_counts()

Out[31]: FOOD#QUALITY                3713
SERVICE#GENERAL             3151
RESTAURANT#GENERAL          1783
AMBIENCE#GENERAL            1560
RESTAURANT#PRICES            183
DRINKS#QUALITY                69
FOOD#PRICES                   46
RESTAURANT#MISCELLANEOUS      46
FOOD#STYLE_OPTIONS            44
LOCATION#GENERAL              44
DRINKS#STYLE_OPTIONS           5
Name: aspect_category, dtype: int64

In [0]:
fig = px.histogram(aspect_GA, x="aspect_category",height=1000,width=2500)
fig.update_layout(yaxis = dict(tickfont = dict(size=20)))
fig.update_layout(xaxis = dict(tickfont = dict(size=20)))
fig.update_layout(xaxis = dict(titlefont = dict(size=30)))
fig.update_layout(yaxis = dict(titlefont = dict(size=30)))
fig.show()

## 4. Assign Relevance Score

In [0]:
aspect_cat=["FOOD#QUALITY","SERVICE#GENERAL","RESTAURANT#GENERAL","AMBIENCE#GENERAL","RESTAURANT#PRICES","DRINKS#QUALITY","FOOD#PRICES","LOCATION#GENERAL","FOOD#STYLE_OPTIONS","RESTAURANT#MISCELLANEOUS","DRINKS#STYLE_OPTIONS"]

In [0]:
appended_data = []
for i in range(len(aspect_cat)):
    df_iloc=aspect_GA.loc[aspect_GA['aspect_category'] == str(aspect_cat[i]) ]
    df_iloc["Rank"] = df_iloc[["ave_stars", "review_ss_x","count","tips_count","tips_ss"]].apply(tuple, axis = 1).rank(method = 'dense', ascending = False).astype(int)
    appended_data.append(df_iloc)

aspect_GA = pd.concat(appended_data)
    



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
aspect_service=aspect_GA.loc[aspect_GA['aspect_category'] == "SERVICE#GENERAL"]   
bins = np.arange(0, 3180, 20)
labels = np.arange(158, 0, -1)
aspect_service['Relevance'] = pd.cut(aspect_GA['Rank'], bins=bins, labels=labels)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
aspect_food_qual=aspect_GA.loc[aspect_GA['aspect_category'] == "FOOD#QUALITY"]   
bins = np.arange(0, 3740, 20)
labels = np.arange(186, 0, -1)
aspect_food_qual['Relevance'] = pd.cut(aspect_GA['Rank'], bins=bins, labels=labels)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
aspect_ambience=aspect_GA.loc[aspect_GA['aspect_category'] == "AMBIENCE#GENERAL"]   
bins = np.arange(0, 1580, 20)
labels = np.arange(78, 0, -1)
aspect_ambience['Relevance'] = pd.cut(aspect_GA['Rank'], bins=bins, labels=labels)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
aspect_restraunt_price=aspect_GA.loc[aspect_GA['aspect_category'] == "RESTAURANT#PRICES"]   
bins = np.arange(0, 190, 5)
labels = np.arange(37, 0, -1)
aspect_restraunt_price['Relevance'] = pd.cut(aspect_GA['Rank'], bins=bins, labels=labels)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
aspect_drinks_qual=aspect_GA.loc[aspect_GA['aspect_category'] == "DRINKS#QUALITY"]   
bins = np.arange(0, 75, 5)
labels = np.arange(14, 0, -1)
aspect_drinks_qual['Relevance'] = pd.cut(aspect_GA['Rank'], bins=bins, labels=labels)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
aspect_food_price=aspect_GA.loc[aspect_GA['aspect_category'] == "FOOD#PRICES"]   
bins = np.arange(0, 55, 5)
labels = np.arange(10, 0, -1)
aspect_food_price['Relevance'] = pd.cut(aspect_GA['Rank'], bins=bins, labels=labels)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
aspect_location=aspect_GA.loc[aspect_GA['aspect_category'] == "LOCATION#GENERAL"]   
bins = np.arange(0, 50, 5)
labels = np.arange(9, 0, -1)
aspect_location['Relevance'] = pd.cut(aspect_GA['Rank'], bins=bins, labels=labels)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
aspect_food_style=aspect_GA.loc[aspect_GA['aspect_category'] == "FOOD#STYLE_OPTIONS"]   
bins = np.arange(0, 50, 5)
labels = np.arange(9, 0, -1)
aspect_food_style['Relevance'] = pd.cut(aspect_GA['Rank'], bins=bins, labels=labels)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
frames = [aspect_food_style, aspect_location, aspect_food_price,aspect_drinks_qual,aspect_restraunt_price,aspect_ambience,aspect_food_qual,aspect_service]
aspect_GA = pd.concat(frames)

## 5. Prepare Training Data

In [0]:
le = preprocessing.LabelEncoder()
cat=le.fit_transform(aspect_GA["aspect_category"])

In [0]:
aspect_GA["aspect_category"]=cat

In [0]:
train_GA=aspect_GA.copy()

In [0]:
train_GA=train_GA.drop(columns=["city","state","Rank","business_id","name"])

In [0]:
scaler = MinMaxScaler()
train_GA[["ave_stars","count","review_ss_x","tips_count","tips_ss"]] = scaler.fit_transform(
    train_GA[["ave_stars","count","review_ss_x","tips_count","tips_ss"]])

In [0]:
gss = GroupShuffleSplit(test_size=.2, n_splits=1, random_state = 7).split(train_GA, groups=train_GA['aspect_category'])

X_train_inds, X_test_inds = next(gss)

train_data= train_GA.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['aspect_category','Relevance'])]
y_train = train_data.loc[:, train_data.columns.isin(['Relevance'])]

groups = train_data.groupby('aspect_category').size().to_frame('size')['size'].to_numpy()

test_data= train_GA.iloc[X_test_inds]

#We need to keep the id for later predictions
X_test = test_data.loc[:, ~test_data.columns.isin(['Relevance'])]
y_test = test_data.loc[:, test_data.columns.isin(['Relevance',"aspect_category"])]

## 6. Build Model

In [0]:
model = xgb.XGBRanker(  
    tree_method='hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75,
    )

model.fit(X_train, y_train, group=groups, verbose=True)

Out[51]: XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.9, enable_categorical=False,
          eta=0.05, gamma=0, gpu_id=-1, importance_type=None,
          interaction_constraints='', learning_rate=0.1, max_delta_step=0,
          max_depth=6, min_child_weight=1, missing=nan,
          monotone_constraints='()', n_estimators=110, n_jobs=2,
          num_parallel_tree=1, predictor='auto', random_state=42, reg_alpha=0,
          reg_lambda=1, scale_pos_weight=None, subsample=0.75,
          tree_method='hist', validate_parameters=1, verbosity=None)

## 7. Test Model

In [0]:
X_test["aspect_category"].value_counts()

Out[52]: 2    46
5    44
Name: aspect_category, dtype: int64

In [0]:
X_test_5=X_test.loc[X_test['aspect_category'] == 5] 
y_test_5=y_test.loc[y_test['aspect_category'] == 5] 
del y_test_5["aspect_category"]
predict_5=model.predict(X_test_5.loc[:, ~X_test_5.columns.isin(['aspect_category'])])
y_test_5["Rank"]=predict_5
y_test_5=y_test_5.sort_values(by='Rank', ascending=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
X_test_2=X_test.loc[X_test['aspect_category'] == 2] 
y_test_2=y_test.loc[y_test['aspect_category'] == 2] 
del y_test_2["aspect_category"]
predict_2=model.predict(X_test_2.loc[:, ~X_test_2.columns.isin(['aspect_category'])])
y_test_2["Rank"]=predict_2
y_test_2=y_test_2.sort_values(by='Rank', ascending=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
train_ndcg_score=(ndcg_score([y_test_2["Relevance"]], [y_test_2["Rank"]])+ndcg_score([y_test_5["Relevance"]], [y_test_5["Rank"]]))/2
print("Average NDCG Score:",round(train_ndcg_score,3))

Average NDCG Score: 0.997


## 8. Apply Model on Dataset

In [0]:
aspect_GA_pred=aspect_GA.copy()

In [0]:
aspect_GA_pred.drop(columns=["city","state","Rank","business_id","name"],inplace=True)

In [0]:
aspect_GA_pred[["ave_stars","review_ss_x","count","tips_count","tips_ss"]] = scaler.fit_transform(
    aspect_GA_pred[["ave_stars","review_ss_x","count","tips_count","tips_ss"]])

In [0]:
del aspect_GA_pred["Relevance"]

In [0]:
predict_overall=model.predict(aspect_GA_pred.loc[:, ~aspect_GA_pred.columns.isin(['aspect_category'])])

In [0]:
aspect_GA["Score"]=predict_overall

In [0]:
og_rank=aspect_GA["Rank"]
del aspect_GA["Rank"]

In [0]:
cat_list=[0,1,2,3,4,5,6,7]
rank_order=[]

In [0]:
for i in range(len(cat_list)):
    df_iloc=aspect_GA.loc[aspect_GA['aspect_category'] == i]   
    df_iloc=df_iloc.sort_values(by='Score', ascending=False)
    rank=np.arange(start=1, stop=len(df_iloc)+1, step=1)
    df_iloc["Rank"]=rank
    rank_order.append(df_iloc)
    
    
aspect_GA=pd.concat(rank_order)

In [0]:
aspect_cat=le.inverse_transform(aspect_GA["aspect_category"])
aspect_GA["aspect_category"]=aspect_cat

## 9. Visualize Results

In [0]:
aspect_GA_amb_general= aspect_GA[aspect_GA.aspect_category=="AMBIENCE#GENERAL"]

In [0]:
aspect_GA_amb_general.head()

Unnamed: 0,aspect_category,ave_stars,business_id,city,count,review_ss_x,state,tips_count,tips_ss,name,Relevance,Score,Rank
1487,AMBIENCE#GENERAL,5.0,7YaMaY9e8hjzjoKKKclkcw,atlanta,1,0.9766,GA,4,0.47295,gusto! West Midtown,78,4.842455,1
6290,AMBIENCE#GENERAL,5.0,_cwaiSi8V_N9YvMeOxK9Cw,atlanta,2,0.96705,GA,4,0.276425,Curry Up Now,77,4.686129,2
6854,AMBIENCE#GENERAL,5.0,cZmnKz8AnTVoDLTGbpK_yQ,atlanta,1,0.9669,GA,5,0.45756,Bully Boy,77,4.658078,3
2711,AMBIENCE#GENERAL,5.0,E_I8OEQbfgItgZp7qLnlDw,atlanta,2,0.972,GA,9,0.474889,101 Steak,78,4.639963,4
5905,AMBIENCE#GENERAL,5.0,Yf9MG1ARz1lZGbKwBbAvCg,atlanta,1,0.969,GA,5,0.0671,Wendy's,77,4.627118,5


In [0]:
fig = px.scatter(aspect_GA_amb_general, x="Relevance", y="Score", color='Rank')
fig.show()

In [0]:
fig = go.Figure(data=[go.Table(
    header=dict(values=("Rank","Business"),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[aspect_GA_amb_general.Rank, aspect_GA_amb_general.name],
               fill_color='lavender',
               align='left'))
])

fig.show()

In [0]:
aspect_GA_service_general= aspect_GA[aspect_GA.aspect_category=="SERVICE#GENERAL"]
aspect_GA_service_general

Unnamed: 0,aspect_category,ave_stars,business_id,city,count,review_ss_x,state,tips_count,tips_ss,name,Relevance,Score,Rank
3547,SERVICE#GENERAL,5.0,JlxG12JdeO_u7dZBoDGQJA,marietta,1,0.9702,GA,5,0.512060,Madras Mantra,157,4.772116,1
3611,SERVICE#GENERAL,5.0,K6uyS3Ck0U6X1yDNqIzejw,atlanta,1,0.9709,GA,4,0.000000,Lakewood Diner,157,4.704263,2
9060,SERVICE#GENERAL,5.0,qTXYie0Phy8JbuOau4rTmg,atlanta,2,0.9694,GA,8,0.552963,Aziza,157,4.682550,3
5268,SERVICE#GENERAL,5.0,UmNWvb8wx8CMlnjwcyZJ8Q,atlanta,1,0.9733,GA,5,0.043740,Kale Me Crazy,158,4.673647,4
4185,SERVICE#GENERAL,5.0,NQoLb6c4eYKRhB96p09GMw,atlanta,1,0.9726,GA,5,0.242720,Poke Nooke,158,4.645959,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7249,SERVICE#GENERAL,1.0,extguRN5qTc49-FNO2FRfQ,atlanta,1,-0.8074,GA,8,0.060112,IT'SUGAR,3,-5.795067,3147
775,SERVICE#GENERAL,1.0,3p6lfPuR--SAFi2jgvriJw,hapeville,1,-0.9603,GA,7,0.269571,McDonald's,1,-5.818604,3148
165,SERVICE#GENERAL,1.0,033rWnETcwxjuANzzK4CsA,dunwoody,2,-0.8113,GA,7,0.289243,Walgreens,3,-5.833671,3149
8107,SERVICE#GENERAL,1.0,kSmLeCUSzvoS3a3OOKwP8Q,atlanta,1,-0.7845,GA,7,0.291529,Le Bon Temps,3,-5.851244,3150


In [0]:
fig = go.Figure(data=[go.Table(
    header=dict(values=("Rank","Business"),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[aspect_GA_service_general.Rank, aspect_GA_service_general.name],
               fill_color='lavender',
               align='left'))
])

fig.show()

In [0]:
fig = px.scatter(aspect_GA_service_general, x="Relevance", y="Score", color='Rank')
fig.show()

In [0]:
aspect_GA_location_general= aspect_GA[aspect_GA.aspect_category=="LOCATION#GENERAL"]


In [0]:
fig = go.Figure(data=[go.Table(
    header=dict(values=("Rank","Business"),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[aspect_GA_location_general.Rank, aspect_GA_location_general.name],
               fill_color='lavender',
               align='left'))
])

fig.show()

In [0]:
fig = px.scatter(aspect_GA_location_general, x="Relevance", y="Score", color='Rank')
fig.show()

In [0]:
aspect_GA_food_qual= aspect_GA[aspect_GA.aspect_category=="FOOD#QUALITY"]

In [0]:
fig = go.Figure(data=[go.Table(
    header=dict(values=("Rank","Business"),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[aspect_GA_food_qual.Rank, aspect_GA_food_qual.name],
               fill_color='lavender',
               align='left'))
])

fig.show()

In [0]:
fig = px.scatter(aspect_GA_food_qual, x="Relevance", y="Score", color='Rank')
fig.show()

In [0]:
aspect_GA_food_price= aspect_GA[aspect_GA.aspect_category=="RESTAURANT#PRICES"]

fig = go.Figure(data=[go.Table(
    header=dict(values=("Rank","Business"),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[aspect_GA_food_price.Rank, aspect_GA_food_price.name],
               fill_color='lavender',
               align='left'))
])

fig.show()

In [0]:
fig = px.scatter(aspect_GA_food_price, x="Relevance", y="Score", color='Rank')
fig.show()

In [0]:
aspect_GA_drink_qual= aspect_GA[aspect_GA.aspect_category=="DRINKS#QUALITY"]

fig = go.Figure(data=[go.Table(
    header=dict(values=("Rank","Business"),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[aspect_GA_drink_qual.Rank, aspect_GA_drink_qual.name],
               fill_color='lavender',
               align='left'))
])

fig.show()

In [0]:
fig = px.scatter(aspect_GA_drink_qual, x="Relevance", y="Score", color='Rank')
fig.show()

In [0]:
aspect_GA_food_style= aspect_GA[aspect_GA.aspect_category=="FOOD#STYLE_OPTIONS"]

fig = go.Figure(data=[go.Table(
    header=dict(values=("Rank","Business"),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[aspect_GA_food_style.Rank, aspect_GA_food_style.name],
               fill_color='lavender',
               align='left'))
])

fig.show()

In [0]:
fig = px.scatter(aspect_GA_food_style, x="Relevance", y="Score", color='Rank')
fig.show()

In [0]:
ra_sushi = aspect_GA[aspect_GA.business_id=="s7baMoiG1821_3NblCfK6w"]
business_2 = aspect_GA[aspect_GA.business_id=='hmrRb7qX3K705MuxHHfgNA']
business_3 = aspect_GA[aspect_GA.business_id=='B_6V-u97NU8aai4mrKqwNw']
business_4 = aspect_GA[aspect_GA.business_id=='kD1mjIbvczeXBxDSluJaOw']
business_5 = aspect_GA[aspect_GA.business_id== '6Rq3Bcs969L60aoMGIs5FQ']
business_6 = aspect_GA[aspect_GA.business_id=='7Gub6fxNR1kkPp2MKm2ikw']
business_7 = aspect_GA[aspect_GA.business_id=='MeQegzmVVKNuu-ojlszjRQ']
business_8 = aspect_GA[aspect_GA.business_id=='_vocxUlPxxOafG18HE_B-A']
business_9 = aspect_GA[aspect_GA.business_id=='CcIS8RDNCtJUbxCduIEyAg']

In [0]:
all_business = pd.concat([ra_sushi, business_2,business_3,business_4,business_5,business_6,business_7,business_8,business_9])

In [0]:
all_business

Unnamed: 0,aspect_category,ave_stars,business_id,city,count,review_ss_x,state,tips_count,tips_ss,name,Relevance,Score,Rank
9297,AMBIENCE#GENERAL,3.0,s7baMoiG1821_3NblCfK6w,atlanta,2,0.8596,GA,147,0.430986,RA Sushi Bar Restaurant,9,-1.057057,1394
9298,FOOD#QUALITY,2.355556,s7baMoiG1821_3NblCfK6w,atlanta,225,0.428281,GA,147,0.430986,RA Sushi Bar Restaurant,31,-1.573665,2940
9299,LOCATION#GENERAL,1.0,s7baMoiG1821_3NblCfK6w,atlanta,1,-0.296,GA,147,0.430986,RA Sushi Bar Restaurant,2,-3.617388,39
9302,SERVICE#GENERAL,2.318182,s7baMoiG1821_3NblCfK6w,atlanta,44,0.257286,GA,147,0.430986,RA Sushi Bar Restaurant,38,-2.054941,2383
7658,AMBIENCE#GENERAL,5.0,hmrRb7qX3K705MuxHHfgNA,atlanta,12,0.8994,GA,122,0.421308,Cafe Intermezzo - Midtown,61,2.779898,350
7659,FOOD#QUALITY,3.799127,hmrRb7qX3K705MuxHHfgNA,atlanta,229,0.754661,GA,122,0.421308,Cafe Intermezzo - Midtown,110,1.418525,1362
7661,SERVICE#GENERAL,4.25,hmrRb7qX3K705MuxHHfgNA,atlanta,28,0.676843,GA,122,0.421308,Cafe Intermezzo - Midtown,112,1.657531,829
2198,FOOD#QUALITY,4.153846,B_6V-u97NU8aai4mrKqwNw,atlanta,26,0.70825,GA,3,0.417567,Silverlake Ramen,143,2.428355,770
2200,SERVICE#GENERAL,4.0,B_6V-u97NU8aai4mrKqwNw,atlanta,4,0.49945,GA,3,0.417567,Silverlake Ramen,96,0.958456,1165
8059,FOOD#QUALITY,4.270833,kD1mjIbvczeXBxDSluJaOw,atlanta,48,0.768323,GA,10,0.49343,DUA Vietnamese Noodle Soup,154,2.826341,529


In [0]:
service=[2383,829,1165,909,359,2872,358,1291,2440]
food=[2940,1362,770,529,417,2115,3561,1865,3141]
ambience=[1394,350,"-","-",1388,784,"-",1226,1490]
location=[39,"-","-","-","-","-","-","-","-"]
name=["RA Sushi Bar Restaurant","Cafe Intermezzo - Midtown","Silverlake Ramen","DUA Vietnamese Noodle Soup","Sam’s Of San Francisco","Rí Rá Irish Pub","Ribalta","The Federal","Sugar Factory"]


In [0]:
business_rank=pd.DataFrame(list(zip(name,service, food,ambience,location)),
              columns=['Business',"Service",'Food',"Ambience","Location"])
business_rank

Unnamed: 0,Business,Service,Food,Ambience,Location
0,RA Sushi Bar Restaurant,2383,2940,1394,39
1,Cafe Intermezzo - Midtown,829,1362,350,-
2,Silverlake Ramen,1165,770,-,-
3,DUA Vietnamese Noodle Soup,909,529,-,-
4,Sam’s Of San Francisco,359,417,1388,-
5,Rí Rá Irish Pub,2872,2115,784,-
6,Ribalta,358,3561,-,-
7,The Federal,1291,1865,1226,-
8,Sugar Factory,2440,3141,1490,-


In [0]:
fig = go.Figure(data=[go.Table(
    header=dict(values=('Business',"Service",'Quality of Food',"Ambience","Location"),
                fill_color='bisque',
                align='left'),
    cells=dict(values=[business_rank.Business, business_rank.Service,business_rank.Food,business_rank.Ambience,business_rank.Location],
               fill_color='whitesmoke',
               align='left'))
])

fig.show()

In [0]:
ra_sushi = aspect_GA[aspect_GA.business_id=="s7baMoiG1821_3NblCfK6w"]
business_2 = aspect_GA[aspect_GA.business_id=='hmrRb7qX3K705MuxHHfgNA']
business_3 = aspect_GA[aspect_GA.business_id=='B_6V-u97NU8aai4mrKqwNw']
business_4 = aspect_GA[aspect_GA.business_id=='kD1mjIbvczeXBxDSluJaOw']
business_5 = aspect_GA[aspect_GA.business_id== '6Rq3Bcs969L60aoMGIs5FQ']
business_6 = aspect_GA[aspect_GA.business_id=='7Gub6fxNR1kkPp2MKm2ikw']
business_7 = aspect_GA[aspect_GA.business_id=='MeQegzmVVKNuu-ojlszjRQ']
business_8 = aspect_GA[aspect_GA.business_id=='_vocxUlPxxOafG18HE_B-A']
business_9 = aspect_GA[aspect_GA.business_id=='CcIS8RDNCtJUbxCduIEyAg']
all_business = pd.concat([ra_sushi, business_2,business_3,business_4,business_5,business_6,business_7,business_8,business_9])

In [0]:
all_business_service = all_business[all_business.aspect_category=='SERVICE#GENERAL']
all_business_amb = all_business[all_business.aspect_category=='AMBIENCE#GENERAL']
all_business_food = all_business[all_business.aspect_category=='FOOD#QUALITY']

In [0]:
fig = px.scatter_matrix(all_business_service, dimensions=["ave_stars", "review_ss_x", "tips_count", "tips_ss"], color="name",width=1500,height=1000)
fig.update_traces(showupperhalf=False)
fig.show()

In [0]:
fig = px.scatter_matrix(all_business_food, dimensions=["ave_stars", "review_ss_x", "tips_count", "tips_ss"], color="name",width=1500,height=1000)
fig.update_traces(showupperhalf=False)
fig.show()

In [0]:
fig = px.scatter_matrix(all_business_amb, dimensions=["ave_stars", "review_ss_x", "tips_count", "tips_ss"], color="name",width=1500,height=1000)
fig.update_traces(showupperhalf=False)
fig.show()

In [0]:
# accessing cleaned file
rankDF = (spark.read
  .option("inferSchema", True)
  .json("/FileStore/tables/Project_Data_Extract/aspect_GA.json"))

aspect_GA=rankDF.toPandas()