In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import os
import sys
import warnings

from bokeh.plotting import figure, show, output_notebook

# Silence every warning for cleaner cell output (note: this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Apply the FiveThirtyEight style to any matplotlib-based figures.
plt.style.use('fivethirtyeight')
# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

# Load Data

In [2]:
%%time
# Disabled alternative: read the parquet file lazily with Dask instead of going through pandas.
# ddf = dd.read_parquet("processed_data/joined.parquet.snappy", engine="pyarrow")
# Read the whole joined table into an in-memory pandas dataframe ...
df = pd.read_parquet("processed_data/joined.parquet.snappy")
# ... then wrap it in a 100-partition Dask dataframe, which is what the cells below operate on.
ddf = dd.from_pandas(df, npartitions=100)

CPU times: user 9.64 s, sys: 5.01 s, total: 14.6 s
Wall time: 14.2 s


In [3]:
# Preview the first rows of the Dask dataframe (pandas equivalent: df.head()).
ddf.head()

Unnamed: 0,r_id,b_id,u_id,r_stars,r_date,r_text,r_useful,r_funny,r_cool,b_stars,b_review_count,u_review_count,u_yelping_since,u_friends_count
0,4,6317,164431,5,2015-01-04 00:01:03,"Wow! Yummy, different, delicious. Our favo...",1,0,1,4.0,181,9,2014-01-17 19:20:57,1
1,9,914,154297,3,2016-03-30 22:46:33,This easter instead of going to Lopez Lake we ...,1,1,0,4.5,13,24,2015-10-27 22:53:34,1
2,11,3877,35412,5,2015-06-21 14:48:06,My experience with Shalimar was nothing but wo...,2,0,0,2.5,8,39,2009-01-27 21:20:30,7
3,18,12041,21756,4,2014-08-10 19:41:43,The hubby and I have been here on multiple occ...,1,0,0,4.0,398,74,2009-07-24 14:30:28,112
4,19,295,80128,5,2016-03-07 00:02:18,I go to blow bar to get my brows done by natal...,2,0,1,4.0,55,27,2012-02-26 05:18:05,8


# Basic EDA

## Dataset Statistics

In [4]:
# Summary statistics of the numeric columns (pandas equivalent: df.describe()).
ddf.describe().compute()

Unnamed: 0,b_id,u_id,r_stars,r_useful,r_funny,r_cool,b_stars,b_review_count,u_review_count,u_friends_count
count,2576655.0,2576655.0,2576655.0,2576655.0,2576655.0,2576655.0,2576655.0,2576655.0,2576655.0,2576655.0
mean,74792.63,426628.5,3.607624,2.65125,0.641212,1.038119,3.742953,336.9155,191.3309,189.0629
std,43575.54,496360.9,1.538495,4.62532,2.446789,3.283233,0.7945866,665.8551,497.5412,567.4223
min,2.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,5.0,0.0,1.0
25%,37681.0,100368.0,3.0,1.0,0.0,0.0,3.5,61.0,18.0,7.0
50%,75173.0,284425.0,4.0,2.0,0.0,1.0,4.0,185.0,97.0,76.0
75%,113020.0,890966.0,5.0,3.0,1.0,1.0,4.5,472.0,257.0,234.0
max,150346.0,1987895.0,5.0,1182.0,792.0,404.0,5.0,7568.0,17473.0,14995.0


## Missing Data Information

In [5]:
# Fraction of missing values per column.
# The null mask is built once and the sum/count ratio stays lazy, so the data
# is evaluated with a single compute() instead of two separate ones.
is_missing = ddf.isna()
missing_fraction = (is_missing.sum() / is_missing.count()).compute()
missing_fraction.rename("fraction_missing").to_frame()

Unnamed: 0,fraction_missing
r_id,0.0
b_id,0.0
u_id,0.0
r_stars,0.0
r_date,0.0
r_text,0.0
r_useful,0.0
r_funny,0.0
r_cool,0.0
b_stars,0.0


## Histogram of review stars

In [6]:
def bokeh_hist(hist, edges, **kwargs):
    """Plot a pre-computed histogram with Bokeh and display it.

    Parameters
    ----------
    hist : array-like
        Bar heights, one per bin (e.g. the counts returned by ``np.histogram``).
    edges : array-like
        Bin edges; ``edges[:-1]`` / ``edges[1:]`` are used as the left / right
        side of each bar, so it must hold one more element than ``hist``.
    **kwargs
        Optional settings:

        * ``x``, ``pdf``, ``cdf`` -- when ``x`` and ``pdf`` (and/or ``cdf``)
          are given, the curve(s) are drawn on top of the bars with a legend.
        * ``title`` (default ``"Title"``), ``height`` (default ``350``).
        * ``x_label`` / ``y_label`` -- axis labels (default ``"x"`` / ``"y"``).
        * ``y_axis_type`` (default ``"linear"``), ``y_range`` (default ``None``).
        * ``fill_color``, ``line_color``, ``alpha`` -- bar styling.
        * ``pdf_color``, ``cdf_color`` -- overlay line colours.

    Returns
    -------
    None
        The figure is rendered through ``show``.
    """
    x = kwargs.get('x')
    pdf = kwargs.get('pdf')
    cdf = kwargs.get('cdf')
    title = kwargs.get('title', "Title")
    height = kwargs.get('height', 350)

    p = figure(
        height=height,
        title=title,
        x_axis_label=kwargs.get("x_label", "x"),
        y_axis_label=kwargs.get("y_label", "y"),
        y_axis_type=kwargs.get("y_axis_type", "linear"),
        y_range=kwargs.get("y_range", None),
    )

    p.quad(
        bottom=0,
        top=hist,
        left=edges[:-1],
        right=edges[1:],
        fill_color=kwargs.get('fill_color', 'navy'),
        line_color=kwargs.get('line_color', 'white'),
        alpha=kwargs.get('alpha', 0.5),
    )

    # `legend_label` replaces the deprecated `legend=` glyph keyword, which
    # newer Bokeh releases no longer accept.
    has_legend = False
    if x is not None:
        if pdf is not None:
            p.line(x, pdf, line_color=kwargs.get('pdf_color', 'darkorange'), legend_label="PDF")
            has_legend = True
        if cdf is not None:
            p.line(x, cdf, line_color=kwargs.get('cdf_color', 'forestgreen'), legend_label="CDF")
            has_legend = True

    # Only position the legend when an overlay actually created one.
    if has_legend:
        p.legend.location = "center_right"
    show(p)
    
    
def bokeh_dist(x, pdf, **kwargs):
    """Draw a single curve with Bokeh and display it.

    Parameters
    ----------
    x : array-like
        Horizontal coordinates of the curve.
    pdf : array-like
        Vertical coordinates of the curve (same length as ``x``).
    **kwargs
        Optional overrides: ``title``, ``height``, ``x_label``, ``y_label``,
        ``pdf_color`` and ``line_width``. Unknown keys are ignored.
    """
    # Start from the defaults and let caller-supplied options win.
    options = {
        'title': "Title",
        'height': 350,
        'x_label': "x",
        'y_label': "y",
        'pdf_color': 'navy',
        'line_width': 1.0,
    }
    options.update(kwargs)

    plot = figure(
        height=options['height'],
        title=options['title'],
        x_axis_label=options['x_label'],
        y_axis_label=options['y_label'],
    )
    plot.line(x, pdf, line_color=options['pdf_color'], line_width=options['line_width'])
    show(plot)

In [7]:
# Review stars appear to take the integer values 1-5, so use five unit-wide
# bins centred on each value. With bins=4 over (1, 5) the last bin is the
# closed interval [4, 5], which merges the 4-star and 5-star reviews into a
# single bar and inflates it.
r_star_hist, r_star_edge = np.histogram(ddf["r_stars"].compute().values, bins=5, range=(0.5, 5.5))
bokeh_hist(r_star_hist, r_star_edge, title="Distribution of review stars", x_label="review stars", y_label="frequency", alpha=0.85)

According to the histogram above, most reviews have ratings between $4$ and $5$, followed by reviews with ratings between $1$ and $2$.

## Distribution of Useful ratings

In [44]:
def calc_pdf(data, attr, isdask=False):
    """Evaluate a normal density fitted to the data by its mean and std.

    The curve is a Gaussian parameterised by the sample mean and standard
    deviation; it is not an empirical density estimate, so it is bell-shaped
    regardless of how the data is actually distributed.

    Parameters
    ----------
    data : dask/pandas dataframe or array-like
        The data source. When ``attr`` is ``None`` it is treated as the
        sample itself.
    attr : str or None
        Column to select from ``data``; ``None`` to use ``data`` directly.
    isdask : bool, default False
        Set to True when ``data`` is a Dask dataframe, so every reduction is
        materialised with ``.compute()``.

    Returns
    -------
    x : numpy.ndarray
        Evenly spaced grid (``np.linspace`` default size) from the sample
        minimum to the sample maximum.
    pdf : numpy.ndarray
        Normal density evaluated on ``x``.
    """
    if isdask:
        column = data[attr]
        lower, upper = column.min().compute(), column.max().compute()
        mu, sigma = column.mean().compute(), column.std().compute()

    elif attr is None:
        lower, upper = np.min(data), np.max(data)
        # Moments must come from the sample, not from the evaluation grid:
        # the grid's mean is just the midpoint of the range and its std only
        # reflects the range width.
        mu, sigma = np.mean(data), np.std(data)

    else:
        column = data[attr]
        lower, upper = column.min(), column.max()
        mu, sigma = column.mean(), column.std()

    x = np.linspace(lower, upper)
    pdf = 1/(sigma*np.sqrt(2 * np.pi)) * np.exp(-(x - mu) ** 2/(2 * sigma ** 2))

    return x, pdf

In [9]:
# Evaluate the normal curve for `r_useful` on the Dask dataframe; calc_pdf
# derives both the x grid and mu/sigma from the column itself.
use_x, use_pdf = calc_pdf(ddf, "r_useful", isdask=True)

In [10]:
# Plot the curve computed by calc_pdf for `r_useful`.
bokeh_dist(use_x, use_pdf, x_label="useful ratings", y_label="pdf", pdf_color="cornflowerblue", title="Distribution of Useful Ratings")

The `r_useful` ratings are heavily *right-skewed*, meaning most reviews have `r_useful` between $1$ and $20$. Only a **few reviews** are found to be *extremely* useful. So the question is: does the **popularity** of the *reviewer* affect this behaviour?

### Minimum Value of Useful Ratings

In [11]:
# Smallest `r_useful` value across all reviews (pandas equivalent: df["r_useful"].min()).
ddf["r_useful"].min().compute()

-1

In [12]:
# Count the reviews whose `r_useful` equals the column minimum: filter the rows,
# materialise them, and read the row count from the resulting shape.
print(f'Number of records with min value : {ddf[ddf["r_useful"] == ddf["r_useful"].min().compute()].compute().shape[0]}')

Number of records with min value : 4


It is weird that only $4$ instances of negative ratings are present in our data. Could this be a *data entry* problem? Or is there any downvoting functionality on *Yelp*?

### Maximum Value of Useful Ratings

In [13]:
# Largest `r_useful` value across all reviews.
ddf["r_useful"].max().compute()

1182

In [14]:
# Count the reviews whose `r_useful` equals the column maximum.
print(f'Number of records with max value: {ddf[ddf["r_useful"] == ddf["r_useful"].max().compute()].compute().shape[0]}')

Number of records with max value: 1


And there's only a single record with the maximum value. Is that review really helpful? Is it an old review?

## Distribution of Funny Ratings

In [15]:
# Normal curve for `r_funny`. The title previously read "Useful Ratings", a
# copy-paste leftover from the `r_useful` plot.
fun_x, fun_pdf = calc_pdf(ddf, "r_funny", isdask=True)
bokeh_dist(fun_x, fun_pdf, x_label="funny ratings", y_label="pdf", pdf_color="tomato", title="Distribution of Funny Ratings")

The distribution of `r_funny` is similar to the distribution of `r_useful`. 

### Minimum Value of Funny Ratings

In [16]:
# Smallest `r_funny` value across all reviews.
ddf["r_funny"].min().compute()

-1

In [17]:
# Count the reviews whose `r_funny` equals the column minimum. The inner min()
# is left lazy here; it is evaluated as part of the outer compute().
print(f'Number of records with min value : {ddf[ddf["r_funny"] == ddf["r_funny"].min()].compute().shape[0]}')

Number of records with min value : 2


There are only $2$ instances where `r_funny` takes its minimum value of $-1$. Is this some kind of data entry error?

### Maximum Value of Funny Ratings

In [18]:
# Largest `r_funny` value across all reviews.
ddf["r_funny"].max().compute()

792

In [19]:
# Count the reviews whose `r_funny` equals the column maximum.
print(f'Number of records with max value: {ddf[ddf["r_funny"] == ddf["r_funny"].max().compute()].compute().shape[0]}')

Number of records with max value: 1


Just like `r_useful`, the `r_funny` also has only a single record with the maximum value.

## Distribution of Cool Ratings

In [20]:
# Normal curve for `r_cool`. The title previously read "Useful Ratings", a
# copy-paste leftover from the `r_useful` plot.
cool_x, cool_pdf = calc_pdf(ddf, "r_cool", isdask=True)
bokeh_dist(cool_x, cool_pdf, x_label="cool ratings", y_label="pdf", pdf_color="mediumseagreen", title="Distribution of Cool Ratings")

So, the ratings of `r_useful`, `r_cool`, `r_funny` almost have similar distributions.

### Minimum Value of Cool Ratings

In [21]:
# Smallest `r_cool` value across all reviews.
ddf["r_cool"].min().compute()

-1

In [22]:
# Count the reviews whose `r_cool` equals the column minimum. The inner min()
# is left lazy here; it is evaluated as part of the outer compute().
print(f'Number of records with min value : {ddf[ddf["r_cool"] == ddf["r_cool"].min()].compute().shape[0]}')

Number of records with min value : 1


There's only a single record with minimum value for `r_cool`

### Maximum Value of Cool Ratings

In [23]:
# Largest `r_cool` value across all reviews.
ddf["r_cool"].max().compute()

404

In [24]:
# Count the reviews whose `r_cool` equals the column maximum. The label used
# to say "min value" although the filter compares against max().
print(f'Number of records with max value: {ddf[ddf["r_cool"] == ddf["r_cool"].max()].compute().shape[0]}')

Number of records with min value : 1


The number of records with maximum value is also $1$ for `r_cool`

## Histogram of b_stars

In [25]:
# Business stars appear to come in half-star steps, so use one 0.5-wide bin
# centred on each possible value. With bins=4 over (min, max) the last bin
# [4, 5] collected three star levels (4, 4.5 and 5) while every other bin held
# two, which overstated the top bar.
# The column is materialised once and reused for the range, instead of running
# three separate computes.
b_stars = ddf["b_stars"].compute()
b_stars_bin_edges = np.arange(b_stars.min() - 0.25, b_stars.max() + 0.5, 0.5)
b_stars_hist, b_stars_edges = np.histogram(b_stars, bins=b_stars_bin_edges)

bokeh_hist(b_stars_hist, b_stars_edges, title="Distribution of Business stars", x_label="business stars", y_label="frequency", 
           alpha=0.85, fill_color="orangered")

We can observe a *high frequency* of businesses having star ratings between $4$ and $5$. There are fewer instances of businesses having star ratings between $1$ and $2$.

## Text Data Statistics and EDA

In [30]:
def calc_n_words(txt):
    """Return how many whitespace-separated tokens ``txt`` contains."""
    tokens = txt.split()
    return len(tokens)

def calc_sen_len(txt):
    """Return the length of ``txt`` in characters (whitespace included)."""
    n_chars = len(txt)
    return n_chars

def calc_avg_word_len(txt):
    """Return the mean length, in characters, of the whitespace-separated words in ``txt``.

    Returns ``float("nan")`` when ``txt`` holds no words (empty or
    whitespace-only text) instead of raising ``ZeroDivisionError``.
    """
    words = txt.split()
    if not words:
        # No words means the average is undefined; NaN marks that explicitly.
        return float("nan")
    return sum(len(word) for word in words) / len(words)

### Number of Words 

In [32]:
# Word count of every review. calc_n_words returns an int, so the Dask `meta`
# declares an integer result (it previously said "float", unlike the matching
# sentence-length cell).
n_words = (
    ddf["r_text"]
    .map(calc_n_words, meta=(None, "int"))
    .compute()
)

In [35]:
# Bucket the word counts into 100 equal-width bins spanning the observed range.
n_words_hist, n_words_edges = np.histogram(
    n_words,
    bins=100,
    range=(np.min(n_words), np.max(n_words)),
)
bokeh_hist(
    n_words_hist,
    n_words_edges,
    title="Distribution of Word Count",
    x_label="no. of words",
    y_label="frequency",
    alpha=0.85,
    fill_color="mediumslateblue",
)

The word count is right-skewed: most reviews consist of a small number of words, and very few reviews have more than $500$ words, as is evident from the *heavy tail* at the end of the distribution.

### Sentence Length

In [37]:
# Character length of every review text.
sen_len = (
    ddf["r_text"]
    .map(calc_sen_len, meta=(None, "int"))
    .compute()
)

In [38]:
# Bucket the text lengths into 100 equal-width bins spanning the observed range.
sen_len_hist, sen_len_edges = np.histogram(
    sen_len,
    bins=100,
    range=(np.min(sen_len), np.max(sen_len)),
)
bokeh_hist(
    sen_len_hist,
    sen_len_edges,
    title="Distribution of Sentence Length",
    x_label="sentence length",
    y_label="frequency",
    alpha=0.85,
    fill_color="tomato",
)

The *sentence length* follows a distribution similar to the one observed for *word counts*: the data is right-skewed with a heavy tail at the end.

### Average Word Length

In [39]:
# Mean word length of every review text.
avg_word_len = (
    ddf["r_text"]
    .map(calc_avg_word_len, meta=(None, "float"))
    .compute()
)

In [48]:
# `avg_word_len` is already materialised, so pass it directly (attr=None)
# rather than selecting a column from a dataframe.
avg_word_len_x, avg_word_len_pdf = calc_pdf(data=avg_word_len, attr=None, isdask=False)
bokeh_dist(
    avg_word_len_x,
    avg_word_len_pdf,
    x_label="Avg word len",
    y_label="pdf",
    pdf_color="mediumseagreen",
    title="Distribution of Average Word Length",
)

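The curve for average word length looks like a normal distribution. Note, however, that `calc_pdf` draws a Gaussian parameterised by the sample mean and standard deviation, so the plotted curve is bell-shaped by construction. Because each value is a mean of word lengths, approximate normality is plausible, but a histogram of `avg_word_len` is needed to confirm it.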

So the questions for the text data are as follows:
* Do long reviews have negative or positive sentiments?
* Do long reviews affect the star rating?
* Are long reviews any good?