### Load the Vine Table CSV file into a DataFrame

In [3]:
# Import your dependencies.
import pandas as pd

In [5]:
# Load the Vine review table from its CSV file, then preview it.
vine_csv_path = "vine_table.csv"
vine_df = pd.read_csv(vine_csv_path)
vine_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,R3O9SGZBVQBV76,5,0,0,N,Y
1,RKH8BNC3L5DLF,5,0,0,N,Y
2,R2HLE8WKZSU3NL,2,1,1,N,Y
3,R31U3UH5AZ42LL,5,0,0,N,Y
4,R2SV659OUJ945Y,4,0,0,N,Y
...,...,...,...,...,...,...
960867,REH8UQZAXQS40,2,17,17,N,N
960868,RJ14QPZEOI9P8,5,9,10,N,N
960869,R2B1G5650WWFCE,5,3,16,N,N
960870,R2MMGPUWXXOFI2,4,0,0,N,N


In [6]:
# Check the datatypes: print each column's dtype and non-null count,
# along with the DataFrame's index range and memory usage.
vine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960872 entries, 0 to 960871
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   review_id          960872 non-null  object
 1   star_rating        960872 non-null  int64 
 2   helpful_votes      960872 non-null  int64 
 3   total_votes        960872 non-null  int64 
 4   vine               960872 non-null  object
 5   verified_purchase  960872 non-null  object
dtypes: int64(3), object(3)
memory usage: 44.0+ MB


### Filter by votes

In [7]:
# Keep only the reviews that received at least 20 total votes.
has_enough_votes = vine_df["total_votes"].ge(20)
df1 = vine_df.loc[has_enough_votes]
df1

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
478,R14W2VCHHK5V7W,5,19,20,N,Y
552,R1S3T57O3OYT5S,5,19,20,N,Y
619,R1BTWIBLYYVOV7,5,30,30,N,Y
1025,R6F9VY91ADPLA,1,8,30,N,N
1548,R3PXNV89DFIXKV,5,35,37,N,Y
...,...,...,...,...,...,...
960838,R7NJYD6L80A8,5,20,21,N,N
960851,R1721LXOWD6H3C,1,11,27,N,N
960860,R37CF8J75KF7ZR,2,35,66,N,N
960863,R1W6TRBVZSPGCJ,1,20,20,N,N


In [8]:
# From df1, keep only the reviews where at least half of the votes were
# helpful, i.e. helpful_votes / total_votes >= 0.5.
helpful_ratio = df1["helpful_votes"] / df1["total_votes"]
df2 = df1.loc[helpful_ratio >= 0.5]
df2

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
478,R14W2VCHHK5V7W,5,19,20,N,Y
552,R1S3T57O3OYT5S,5,19,20,N,Y
619,R1BTWIBLYYVOV7,5,30,30,N,Y
1548,R3PXNV89DFIXKV,5,35,37,N,Y
1573,R2ZF9NYVT3J7D6,5,19,22,N,Y
...,...,...,...,...,...,...
960811,R3C9623WIPWVMB,5,56,58,N,N
960838,R7NJYD6L80A8,5,20,21,N,N
960860,R37CF8J75KF7ZR,2,35,66,N,N
960863,R1W6TRBVZSPGCJ,1,20,20,N,N


### Analyze the Vine Reviews

In [9]:
# Number of reviews (non-null review_id values) remaining in df2.
df2["review_id"].count()

8409

In [10]:
# Select the reviews written as part of the Vine program (vine == "Y")
# and preview the first ten of them.
is_vine = df2["vine"].eq("Y")
paid_df = df2.loc[is_vine]
paid_df.head(10)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
4122,R1B7M0OP3UNP6O,5,49,52,Y,N
334756,R2UUV4UGGYMQG8,5,34,39,Y,N
428946,R9K0LZV2BK9YY,4,37,39,Y,N
477323,R2OVFLNEUEGTJM,3,18,25,Y,N
664604,RBE09ELJ77LQ0,5,44,45,Y,N
664708,R3867T8AIJJHM6,5,26,27,Y,N
670569,R1FNVUXPU63WOZ,4,43,48,Y,N
675842,R25XGG2G12SE1Z,4,20,23,Y,N
675878,R3JKU4HRDFZDH,4,27,30,Y,N
675929,R2PQYOCJXRB1BF,5,26,28,Y,N


In [11]:
# Select the reviews that were not written as part of the Vine program
# (vine == "N").
is_not_vine = df2["vine"].eq("N")
unpaid_df = df2.loc[is_not_vine]
unpaid_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
478,R14W2VCHHK5V7W,5,19,20,N,Y
552,R1S3T57O3OYT5S,5,19,20,N,Y
619,R1BTWIBLYYVOV7,5,30,30,N,Y
1548,R3PXNV89DFIXKV,5,35,37,N,Y
1573,R2ZF9NYVT3J7D6,5,19,22,N,Y
...,...,...,...,...,...,...
960811,R3C9623WIPWVMB,5,56,58,N,N
960838,R7NJYD6L80A8,5,20,21,N,N
960860,R37CF8J75KF7ZR,2,35,66,N,N
960863,R1W6TRBVZSPGCJ,1,20,20,N,N


### Determine the percentage of five-star reviews among Vine reviews

In [12]:
# Scratch check: count the five-star Vine reviews and inspect the type of
# the value that .count() returns.
paid_is_five_star = paid_df["star_rating"] == 5
paid_five_star_number = paid_df.loc[paid_is_five_star, "star_rating"].count()
type(paid_five_star_number)

numpy.int32

In [21]:
# Count the five-star ratings among the Vine (paid) reviews.
# Rows and column are selected in a single .loc lookup.
paid_is_five_star = paid_df["star_rating"] == 5
paid_five_star_number = paid_df.loc[paid_is_five_star, "star_rating"].count()

# Count all (non-null) star ratings among the Vine (paid) reviews.
paid_number = paid_df["star_rating"].count()

# Percentage of the Vine reviews that are five-star.
percentage_five_star_vine = paid_five_star_number / paid_number * 100

# Print the total review count, the five-star count, and the percentage
# rounded to two decimal places.
print(paid_number)
print(paid_five_star_number)
print(f"{round(percentage_five_star_vine,2)}%")

47
15
31.91%


### Determine the percentage of five-star reviews among non-Vine reviews

In [14]:
# Count the five-star ratings among the non-Vine (unpaid) reviews.
# Rows and column are selected in a single .loc lookup.
unpaid_is_five_star = unpaid_df["star_rating"] == 5
unpaid_five_star_number = unpaid_df.loc[unpaid_is_five_star, "star_rating"].count()

# Count all (non-null) star ratings among the non-Vine (unpaid) reviews.
unpaid_number = unpaid_df["star_rating"].count()

# Percentage of the non-Vine reviews that are five-star.
percentage_five_star_non_vine = unpaid_five_star_number / unpaid_number * 100

# Print the total review count, the five-star count, and the percentage
# rounded to two decimal places.
print(unpaid_number)
print(unpaid_five_star_number)
print(f"{round(percentage_five_star_non_vine,2)}%")

8362
4332
51.81%
