In [1]:
import pandas as pd
import csv

In [2]:
# Path to the Vine review extract, relative to the notebook's working directory.
file_to_load = "Resources/vine_table.csv"
# Read the full CSV into a DataFrame; the later cells derive their subsets from it.
vine_table_df = pd.read_csv(filepath_or_buffer=file_to_load)

In [3]:
# Preview the first rows of the loaded frame to sanity-check the read.
vine_table_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,R1UUISQ1GKOJTI,1.0,0.0,0.0,N,Y
1,R1HOJ9WE8VCVOD,5.0,9.0,9.0,N,Y
2,RDNGVXMWQN2TN,5.0,0.0,0.0,N,Y
3,R3OM9COQMVTDJ2,2.0,0.0,0.0,N,Y
4,R3OFUQVR4Y80Q9,4.0,0.0,0.0,N,N


In [4]:
# Inspect the dtype pandas assigned to each column of the loaded frame.
vine_table_df.dtypes

review_id             object
star_rating          float64
helpful_votes        float64
total_votes          float64
vine                  object
verified_purchase     object
dtype: object

In [5]:
# Keep only the reviews whose total vote count is greater than or equal to 20.
reviews_df = vine_table_df[vine_table_df["total_votes"].ge(20)]
reviews_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
485,R27XX9MG7OWSSM,5.0,49.0,51.0,N,Y
569,R1I4IFDOW6IW9O,3.0,44.0,47.0,N,Y
735,R2LZAV3VMLGPGS,3.0,32.0,33.0,N,Y
860,R3FITR2BQI94RA,5.0,33.0,38.0,N,Y
1172,RWYBT1R3KPD2M,4.0,142.0,163.0,N,Y


In [6]:
# Count the non-null entries in each column of the vote-filtered frame.
reviews_df.count()

review_id            96370
star_rating          96370
helpful_votes        96370
total_votes          96370
vine                 96370
verified_purchase    96370
dtype: int64

In [7]:
# Keep the reviews where helpful votes make up at least 50% of total votes.
# The mask is built from reviews_df itself rather than the unfiltered vine_table_df,
# so it shares the exact index of the frame being filtered and the ratio is only
# computed for rows that already passed the total_votes >= 20 filter (no 0/0 rows).
helpful_reviews_df = reviews_df.loc[(reviews_df["helpful_votes"] / reviews_df["total_votes"]) >= 0.5]
helpful_reviews_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
485,R27XX9MG7OWSSM,5.0,49.0,51.0,N,Y
569,R1I4IFDOW6IW9O,3.0,44.0,47.0,N,Y
735,R2LZAV3VMLGPGS,3.0,32.0,33.0,N,Y
860,R3FITR2BQI94RA,5.0,33.0,38.0,N,Y
1172,RWYBT1R3KPD2M,4.0,142.0,163.0,N,Y
...,...,...,...,...,...,...
6221548,R64NF3LSM7EAS,4.0,51.0,54.0,N,N
6221549,R35KSIYUDVCNOP,2.0,31.0,34.0,N,N
6221550,R1YURS8641JH3W,4.0,200.0,225.0,N,N
6221556,R3R0Q2BNYNFW6T,5.0,28.0,29.0,N,N


In [8]:
# Subset of the helpful reviews written as part of the paid Vine program (vine == 'Y').
# The mask comes from helpful_reviews_df itself so its index matches the frame being
# filtered, instead of relying on alignment against the unfiltered vine_table_df.
helpful_vine_reviews_df = helpful_reviews_df.loc[helpful_reviews_df["vine"] == 'Y']
helpful_vine_reviews_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
4518,R2W1OMYJERBXJ1,5.0,20.0,23.0,Y,N
31209,R31VJCBBKU13Y,5.0,24.0,27.0,Y,N
39564,R350JNDV0INC11,4.0,388.0,398.0,Y,Y
45246,R2NJA2I5C9Y0FR,3.0,63.0,73.0,Y,N
73839,R296RSG25RD1IX,3.0,46.0,51.0,Y,N
...,...,...,...,...,...,...
6099084,R7BDUPVERD62E,1.0,21.0,22.0,Y,N
6099271,R17GTQN5XA1733,5.0,67.0,70.0,Y,N
6099872,RDFV6BRK2FZ2J,2.0,54.0,54.0,Y,N
6104794,R117VHPG0RT6VE,5.0,37.0,40.0,Y,N


In [9]:
# Subset of the helpful reviews that were NOT part of the paid Vine program (vine == 'N').
# The mask comes from helpful_reviews_df itself so its index matches the frame being
# filtered, instead of relying on alignment against the unfiltered vine_table_df.
helpful_non_vine_reviews_df = helpful_reviews_df.loc[helpful_reviews_df["vine"] == 'N']
helpful_non_vine_reviews_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
485,R27XX9MG7OWSSM,5.0,49.0,51.0,N,Y
569,R1I4IFDOW6IW9O,3.0,44.0,47.0,N,Y
735,R2LZAV3VMLGPGS,3.0,32.0,33.0,N,Y
860,R3FITR2BQI94RA,5.0,33.0,38.0,N,Y
1172,RWYBT1R3KPD2M,4.0,142.0,163.0,N,Y
...,...,...,...,...,...,...
6221548,R64NF3LSM7EAS,4.0,51.0,54.0,N,N
6221549,R35KSIYUDVCNOP,2.0,31.0,34.0,N,N
6221550,R1YURS8641JH3W,4.0,200.0,225.0,N,N
6221556,R3R0Q2BNYNFW6T,5.0,28.0,29.0,N,N


In [10]:
# Check the column dtypes of the Vine subset; star_rating is compared against 5.0 in the summary cell.
helpful_vine_reviews_df.dtypes

review_id             object
star_rating          float64
helpful_votes        float64
total_votes          float64
vine                  object
verified_purchase     object
dtype: object

In [22]:
# Summarise the Vine subset: total number of reviews, number of 5-star reviews,
# and the share of 5-star reviews as a whole-number percentage.
vine_review_count = len(helpful_vine_reviews_df)
vine_5_star_reviews = helpful_vine_reviews_df.loc[helpful_vine_reviews_df["star_rating"] == 5.0]
vine_5_star_count = len(vine_5_star_reviews)
# Guard the division: an empty Vine subset would otherwise raise ZeroDivisionError.
# With no reviews at all the percentage is reported as 0.
percent_vine_5_star = round(vine_5_star_count / vine_review_count * 100) if vine_review_count else 0

print(f"There were a total of {vine_review_count} reviews from the Vine program.")
print(f"There were a total of {vine_5_star_count} 5-star reviews from the Vine program.")
print(f"Of the votes from the Vine program, {percent_vine_5_star}% were 5-star reviews.")

There were a total of 1448 reviews from the Vine program.
There were a total of 647 5-star reviews from the Vine program.
Of the votes from the Vine program, 45% were 5-star reviews.


In [23]:
# Summarise the non-Vine subset: total number of reviews, number of 5-star reviews,
# and the share of 5-star reviews as a whole-number percentage.
non_vine_review_count = len(helpful_non_vine_reviews_df)
non_vine_5_star_reviews = helpful_non_vine_reviews_df.loc[helpful_non_vine_reviews_df["star_rating"] == 5.0]
non_vine_5_star_count = len(non_vine_5_star_reviews)
# Guard the division: an empty non-Vine subset would otherwise raise ZeroDivisionError.
# With no reviews at all the percentage is reported as 0.
percent_non_vine_5_star = round(non_vine_5_star_count / non_vine_review_count * 100) if non_vine_review_count else 0

print(f"There were a total of {non_vine_review_count} reviews from reviewers not participating in the Vine program.")
print(f"There were a total of {non_vine_5_star_count} 5-star reviews from reviewers not participating in the Vine program.")
print(f"Of the votes from revieweres who did not participate in the Vine program, {percent_non_vine_5_star}% were 5-star reviews.")

There were a total of 90768 reviews from reviewers not participating in the Vine program.
There were a total of 44104 5-star reviews from reviewers not participating in the Vine program.
Of the votes from revieweres who did not participate in the Vine program, 49% were 5-star reviews.
