In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw review table from CSV; every later cell derives its frame from this one.
vine_df = pd.read_csv(filepath_or_buffer='vine_table.csv')
# Preview the first rows to confirm the columns loaded as expected.
vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,R1KKOXHNI8MSXU,4.0,0.0,0.0,N,Y
1,R26SP2OPDK4HT7,5.0,1.0,2.0,N,Y
2,RWQEDYAX373I1,5.0,0.0,0.0,N,Y
3,R231YI7R4GPF6J,5.0,0.0,0.0,N,Y
4,R3KO3W45DD0L1K,5.0,0.0,0.0,N,Y


In [3]:
# Summarise the loaded frame: row count, column names, dtypes and memory footprint.
vine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5906333 entries, 0 to 5906332
Data columns (total 6 columns):
 #   Column             Dtype  
---  ------             -----  
 0   review_id          object 
 1   star_rating        float64
 2   helpful_votes      float64
 3   total_votes        float64
 4   vine               object 
 5   verified_purchase  object 
dtypes: float64(3), object(3)
memory usage: 270.4+ MB


# Filter by total votes

Keep only reviews with at least 20 total votes, since these are more likely to be helpful.

In [4]:
# Minimum number of total votes a review must have to be kept. Reviews with
# fewer votes are dropped before the helpful-vote ratio is computed.
MIN_TOTAL_VOTES = 20

# Boolean-mask selection via .loc; rows whose total_votes is missing compare
# as False and are therefore excluded as well.
filtered_vine_df = vine_df.loc[vine_df['total_votes'] >= MIN_TOTAL_VOTES]
filtered_vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
211,R35PT06NWP7LDP,5.0,30.0,32.0,N,N
344,R2P76PJFUGXBHO,1.0,6.0,45.0,N,N
372,RQW4AFOG9MR4Z,5.0,51.0,52.0,N,N
406,R2SMUEBMGLAJQK,5.0,29.0,36.0,N,Y
408,R1XVIZZALU5P6J,5.0,148.0,150.0,N,Y


# Find most helpful reviews

Retrieve all the rows where the number of `helpful_votes` divided by `total_votes` is equal to or greater than 50%.

In [5]:
# Per-review share of votes that were marked helpful; keep the rows where
# that share is at least one half.
new_vine_df = filtered_vine_df.loc[
    filtered_vine_df['helpful_votes'].div(filtered_vine_df['total_votes']).ge(0.5)
]
new_vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
211,R35PT06NWP7LDP,5.0,30.0,32.0,N,N
372,RQW4AFOG9MR4Z,5.0,51.0,52.0,N,N
406,R2SMUEBMGLAJQK,5.0,29.0,36.0,N,Y
408,R1XVIZZALU5P6J,5.0,148.0,150.0,N,Y
419,R11UFMG8M2488I,4.0,23.0,26.0,N,N


# Reviews written as part of the vine program (paid)

In [6]:
# Subset of the helpful reviews flagged 'Y' in the `vine` column (paid programme).
vine_reviews = new_vine_df.loc[new_vine_df['vine'].eq('Y')]
vine_reviews.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
50483,R6U9701C3BGO6,3.0,139.0,147.0,Y,N
50625,R1XK3ALB45D7N4,5.0,33.0,34.0,Y,N
50635,R1IZCSTLX81D6C,5.0,31.0,33.0,Y,N
50656,R2C8NC8EQLH4JF,3.0,45.0,48.0,Y,N
50890,R1JJ1YOJMOML1P,5.0,18.0,21.0,Y,N


# Reviews not written as part of the vine program (unpaid)

In [7]:
# Subset of the helpful reviews flagged 'N' in the `vine` column (unpaid).
not_vine_reviews = new_vine_df.loc[new_vine_df['vine'].eq('N')]
not_vine_reviews.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
211,R35PT06NWP7LDP,5.0,30.0,32.0,N,N
372,RQW4AFOG9MR4Z,5.0,51.0,52.0,N,N
406,R2SMUEBMGLAJQK,5.0,29.0,36.0,N,Y
408,R1XVIZZALU5P6J,5.0,148.0,150.0,N,Y
419,R11UFMG8M2488I,4.0,23.0,26.0,N,N


# 5-Star Review Analysis: 

Comparing the vine and non-vine program

In [8]:
# Number of rows in the Vine subset.
count_vine = vine_reviews.shape[0]
count_vine

33

In [9]:
# Number of rows in the non-Vine subset.
count_nonvine = not_vine_reviews.shape[0]
count_nonvine

45388

After filtering to reviews with at least 20 total votes and a helpful-vote ratio of at least 50%, there were **33** paid Vine reviews and **45,388** unpaid reviews.

In [10]:
# Count the Vine reviews whose star_rating equals 5 by summing the boolean mask.
# int() keeps the result a plain Python integer.
max_rating_vine = int(vine_reviews['star_rating'].eq(5).sum())
max_rating_vine

15

In [11]:
# Count the non-Vine reviews whose star_rating equals 5 by summing the boolean mask.
# int() keeps the result a plain Python integer.
max_rating_nonvine = int(not_vine_reviews['star_rating'].eq(5).sum())
max_rating_nonvine

23733

Out of those reviews, **15** of the paid Vine reviews and **23,733** of the unpaid reviews were 5-star.

### 5-Star Review percentage by program

*Fraction of 5-star reviews, first for Vine (paid) reviews and then for non-Vine (unpaid) reviews:*

In [12]:
# Fraction (0-1) of Vine reviews that are 5-star: 5-star count over total Vine count.
max_rating_vine/count_vine

0.45454545454545453

In [13]:
# Fraction (0-1) of non-Vine reviews that are 5-star: 5-star count over total non-Vine count.
max_rating_nonvine/count_nonvine

0.5228915131752886

## Results: 

- Vine (paid) Reviews
    - 33 total reviews
    - 15 were 5 star reviews
    - ***45.5%*** of vine (paid) reviews were 5 star

- Unpaid Reviews
    - 45,388 total reviews
    - 23,733 5 star reviews 
    - ***52.3%*** of unpaid reviews were 5 star




## Further Analysis


Average star rating of reviews per program (paid and unpaid) 

- Paid Vine Program

In [23]:
# Mean star rating across the Vine reviews, rounded to two decimals.
round(vine_reviews.loc[:, 'star_rating'].mean(), ndigits=2)

4.09

- Unpaid Program

In [24]:
# Mean star rating across the non-Vine reviews, rounded to two decimals.
round(not_vine_reviews.loc[:, 'star_rating'].mean(), ndigits=2)

3.87