In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.prediction_algorithms import SVD, SVDpp, NMF, BaselineOnly, NormalPredictor

%matplotlib inline

In [2]:
!ls Data

reviews_Beauty_5.json.gz


In [3]:
df = pd.read_json("Data/reviews_Beauty_5.json.gz",lines=True)

In [133]:
df[df["summary"]==" "]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime


In [5]:
df.isna().sum()

reviewerID           0
asin                 0
reviewerName      1386
helpful              0
reviewText           0
overall              0
summary              0
unixReviewTime       0
reviewTime           0
dtype: int64

In [6]:
#1386 reviewerNames left blank; we will not need reviewerName since utilizing reviewerID but 
#curious why

In [7]:
df[df['reviewerName'].isnull()]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
8,A3LMILRM9OC3SA,9759091062,,"[0, 0]",Did nothing for me. Stings when I put it on. I...,2,"no Lightening, no Brightening,......NOTHING",1405209600,"07 13, 2014"
1790,AK1H26O8DLMNN,B0000535UM,,"[0, 0]",The first thickening shampoo that works on my ...,5,Actually works,1405123200,"07 12, 2014"
2242,APTLHR9PHGPXN,B00005NAOD,,"[0, 0]","Kind of drying, not moisturizing. Kind of disa...",2,Inexpensive and feels that wY,1405209600,"07 13, 2014"
2304,AQWX644AFUFFK,B00005NFBD,,"[0, 0]","This is just ok. For one, I found this in a st...",3,"Ok, nothing amazing",1405468800,"07 16, 2014"
3651,A43K5ZRQ87TO6,B00008PC1O,,"[0, 0]",Works well and easy to use!,5,Five Stars,1405296000,"07 14, 2014"
...,...,...,...,...,...,...,...,...,...
197192,A1Z3AV93ONK5VF,B00KAL5JAU,,"[0, 0]",We already had the Dead Sea Shampoo by Adovia ...,5,"Non greasy, silky shiny hair",1401235200,"05 28, 2014"
197193,A184I8GT3BHZQV,B00KAL5JAU,,"[0, 1]",&#60;a href=&#34;http://www.tomoson.com/?code=...,5,Dead Sea Salt hair conditioner,1403568000,"06 24, 2014"
197194,A8C9EJORQD23,B00KAL5JAU,,"[0, 1]",I use this with the Adovia shampoo I mention a...,5,Adovia does natural right!,1402272000,"06 9, 2014"
198446,A2PIGZCDGM4NJ7,B00L5JHZJO,,"[10, 11]","This is a horrible product, most of the review...",1,Bad Product,1404864000,"07 9, 2014"


In [8]:
#all necessary information is included even when reviewerName is NaN

In [9]:
df.reviewerID.value_counts()

A2V5R832QCSOMX    204
ALNFHVS3SC4FV     192
AKMEY1BSHSDG7     182
A3KEZLJ59C1JVH    154
ALQGOMOY1F5X9     150
                 ... 
A2WQPYI2WI6U7X      5
A23164157UKPXV      5
A3PLXJACVMU6NZ      5
A1AFKC16E1HZ1S      5
AC5J3VHPGDSJW       5
Name: reviewerID, Length: 22363, dtype: int64

In [10]:
#YAY! all reviewerIDs have value of at least 5, total of 22,363 reviewers

In [11]:
df.asin.value_counts()

B004OHQR1Q    431
B0043OYFKU    403
B0069FDR96    391
B000ZMBSPE    389
B00150LT40    329
             ... 
B002WVG8MY      5
B001DXQMS2      5
B004SODRDG      5
B000FVHS4U      5
B0073FET84      5
Name: asin, Length: 12101, dtype: int64

In [12]:
#YAY! all products have at least 5 reviews, total of 12,101 different products

In [13]:
#can save data as a CSV file for easier use, but it does not look like we need to clean data
#can drop reviewerNames

In [14]:
lower_rating = df.overall.min()

In [15]:
upper_rating = df.overall.max()

In [16]:
print('Review range: {0} to {1}'.format(lower_rating, upper_rating))

Review range: 1 to 5


In [17]:
#Confirming our review range is 1 to 5, which is the default for surprise

In [18]:
#Creating dataframe with appropriate columns to run through surprise

In [19]:
surprise_df = df[['reviewerID', 'asin', 'overall']]

In [20]:
surprise_df

Unnamed: 0,reviewerID,asin,overall
0,A1YJEY40YUW4SE,7806397051,1
1,A60XNB876KYML,7806397051,3
2,A3G6XNM240RMWA,7806397051,4
3,A1PQFP6SAJ6D80,7806397051,2
4,A38FVHZTNQ271F,7806397051,3
...,...,...,...
198497,A2BLFCOPSMBOZ9,B00LLPT4HI,5
198498,A1UQBFCERIP7VJ,B00LLPT4HI,5
198499,A35Q0RBM3YNQNF,B00LLPT4HI,5
198500,A3LGT6UZL99IW1,B00LLPT4HI,5


In [21]:
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(surprise_df, reader)

trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=23)

In [22]:
surprise_data

<surprise.dataset.DatasetAutoFolds at 0x7fc13772e8e0>

In [23]:
# How many users and items are in the trainset
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  22359 

Number of items:  12101 



In [24]:
#10,000 more users than items

In [25]:
print('Type trainset :',type(trainset),'\n')
print('Type testset :',type(testset))

Type trainset : <class 'surprise.trainset.Trainset'> 

Type testset : <class 'list'>


In [26]:
print(len(testset))
print(testset[0])

39701
('A32POM4ALTYIZV', 'B005Z4QT7E', 2.0)


In [27]:
print(len(df))

198502


In [28]:
198502 * .2

39700.4

In [29]:
#sanity check that test set is 20% of total data

In [30]:
#starting with item-item similarity

In [31]:
sim_cos = {'name':'cosine', 'user_based':False}

In [35]:
from surprise.prediction_algorithms import knns

In [36]:
basic = knns.KNNBasic(sim_options=sim_cos)
basic.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fc46e565520>

In [37]:
# looking at the similarity metrics of each of the items to one another by using the sim attribute of our fitted model

In [38]:
basic.sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [39]:
predictions = basic.test(testset)

In [40]:
print(accuracy.rmse(predictions))

RMSE: 1.2129
1.2128933192100995


In [41]:
#RMSE of about 1.2, meaning that it was off by roughly 1.2 points for each guess it made for ratings

In [42]:
sim_cos_user = {'name':'cosine', 'user_based':True}

In [43]:
basic_user = knns.KNNBasic(sim_options=sim_cos_user)
basic_user.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fc52e4e5c70>

In [44]:
basic_user.sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [45]:
predictions_user = basic_user.test(testset)

In [46]:
print(accuracy.rmse(predictions_user))

RMSE: 1.2401
1.2400588428109054


In [47]:
#KNN model with Pearson Correlation 

In [48]:
sim_pearson = {'name':'pearson', 'user_based':False}
basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(trainset)
predictions = basic_pearson.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.1854
1.1854039491870065


In [49]:
sim_pearson_user = {'name':'pearson', 'user_based':True}
basic_pearson_user = knns.KNNBasic(sim_options=sim_pearson_user)
basic_pearson_user.fit(trainset)
predictions_user = basic_pearson_user.test(testset)
print(accuracy.rmse(predictions_user))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.1795
1.1795381227887218


In [50]:
#KNN model with Means

In [51]:
#same thing as the basic KNN model, except it takes into account the mean rating of each user or item depending on 
#whether you are performing user-user or item-item similarities, respectively.

In [52]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(trainset)
predictions = knn_means.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.1791
1.1790736593254243


In [53]:
sim_pearson_user = {'name':'pearson', 'user_based':True}
knn_means_user = knns.KNNWithMeans(sim_options=sim_pearson_user)
knn_means_user.fit(trainset)
predictions_user = knn_means_user.test(testset)
print(accuracy.rmse(predictions_user))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.1622
1.162184545973618


In [54]:
#This is our best model so far

In [55]:
#Running an SVD model with defaults

In [56]:
svd = SVD(random_state=42)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 1.0889
1.0889451149217502


In [57]:
#Lowest RMSE so far!

In [58]:
svd.predict('A1YJEY40YUW4SE', 'B00LLPT4HI')

Prediction(uid='A1YJEY40YUW4SE', iid='B00LLPT4HI', r_ui=None, est=4.41104541567184, details={'was_impossible': False})

In [59]:
svd.predict('A2BLFCOPSMBOZ9', '7806397051')

Prediction(uid='A2BLFCOPSMBOZ9', iid='7806397051', r_ui=None, est=3.774499861592997, details={'was_impossible': False})

In [60]:
#Checking to see estimated rating for 2 user/product combinations

In [61]:
trainset

<surprise.trainset.Trainset at 0x7fc46de06af0>

In [62]:
cv_svd_baseline = cross_validate(svd, surprise_data)

In [63]:
cv_svd_baseline

{'test_rmse': array([1.08870948, 1.0833853 , 1.09177078, 1.09185146, 1.08971124]),
 'test_mae': array([0.83439172, 0.82976812, 0.8368646 , 0.8375184 , 0.83600292]),
 'fit_time': (8.10701298713684,
  8.074291229248047,
  8.08345103263855,
  8.2220458984375,
  8.068026065826416),
 'test_time': (0.19235897064208984,
  0.1955568790435791,
  0.20186996459960938,
  0.19525408744812012,
  0.18959999084472656)}

In [64]:
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1,Don't waste your money,1391040000,"01 30, 2014"
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3,OK Palette!,1397779200,"04 18, 2014"
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4,great quality,1378425600,"09 6, 2013"
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2,Do not work on my face,1386460800,"12 8, 2013"
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3,It's okay.,1382140800,"10 19, 2013"
...,...,...,...,...,...,...,...,...,...
198497,A2BLFCOPSMBOZ9,B00LLPT4HI,Dave Edmiston,"[0, 0]",Just a little dab of this shea butter should b...,5,A little dab...,1405468800,"07 16, 2014"
198498,A1UQBFCERIP7VJ,B00LLPT4HI,Margaret Picky,"[0, 0]",This shea butter is completely raw and unrefin...,5,Pure organic raw shea butter,1405296000,"07 14, 2014"
198499,A35Q0RBM3YNQNF,B00LLPT4HI,M. Hill,"[0, 0]",The skin is the body's largest organ and it ab...,5,One Pound Organic Grade A Unrefined Shea Butter,1405468800,"07 16, 2014"
198500,A3LGT6UZL99IW1,B00LLPT4HI,"Richard C. Drew ""Anaal Nathra/Uthe vas Bethod...","[0, 0]",I have very dry elbows and knees. I have a to...,5,This stuff is amazing!,1405382400,"07 15, 2014"


In [65]:
#Baseline Models; true baseline should be NormalPredictor(); user_based=True

In [66]:
baseline = NormalPredictor()
baseline.fit(trainset)

<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x7fc45b5f46d0>

In [67]:
predictions = baseline.test(testset)

In [68]:
baseline = accuracy.rmse(predictions)

RMSE: 1.4973


In [69]:
baseline2 = BaselineOnly()
baseline2.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fc46b7fd280>

In [70]:
predictions2 = baseline2.test(testset)

In [71]:
baseline2 = accuracy.rmse(predictions2)

RMSE: 1.0890


### Alex: SVD

#### Attempt on new split

In [116]:
#Import cross_validate, SVD, and Grid Search from Suprise
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

In [117]:
#Hold out 10% of data for validation
#Create a new surpise data class
svd_data = Dataset.load_from_df(surprise_df, reader)
raw_ratings_svd = svd_data.raw_ratings
# A = 90% of the data, B = 10% of the data
threshold = int(.9 * len(raw_ratings_svd))
A_raw_ratings_svd = raw_ratings_svd[:threshold]
B_raw_ratings_svd = raw_ratings_svd[threshold:]

In [118]:
# svd_data is now the set A
svd_data.raw_ratings = A_raw_ratings_svd

In [34]:
#Create a param grid for grid search
SVD_parm_grid = {'n_factors':[20,50,100,150],'n_epochs':[10,20,30],'biased':[True,False]}

In [80]:
#Instantiate our grid search & fit to set A
svd_grid_search = GridSearchCV(algo_class=SVD,param_grid=SVD_parm_grid,measures=['rmse'],cv=5)
svd_grid_search.fit(svd_data)

In [73]:
#https://surprise.readthedocs.io/en/stable/FAQ.html -> "How to save some data for unbiased accuracy estimation"

In [81]:
best_svd_algo = svd_grid_search.best_estimator['rmse']

In [87]:
svd_grid_search.best_params

{'rmse': {'n_factors': 20, 'n_epochs': 20, 'biased': True}}

In [82]:
# retrain on the whole set A
trainset_svd = svd_data.build_full_trainset()
best_svd_algo.fit(trainset_svd)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc5288ae790>

In [83]:
predictions = best_svd_algo.test(trainset_svd.build_testset())
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

Biased accuracy on A,   RMSE: 0.9390


0.938986292642642

In [86]:
# Compute unbiased accuracy on B
testset_svd = svd_data.construct_testset(B_raw_ratings_svd)  # testset is now the set B
predictions = best_svd_algo.test(testset_svd)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)

Unbiased accuracy on B, RMSE: 0.9689


0.9688963063125187

#### Attempt new grid search params with lower n_factors
- Adding lower values for n_factors because our previous best estimator hit our lower limit of n_factors = 20

In [134]:
SVD_parm_grid = {'n_factors':[2,5,10,20],'n_epochs':[10,20,30],'biased':[True,False]}

In [135]:
#Instantiate our grid search & fit to set A
svd_grid_search = GridSearchCV(algo_class=SVD,param_grid=SVD_parm_grid,measures=['rmse'],cv=5)
svd_grid_search.fit(svd_data)

In [136]:
best_svd_algo = svd_grid_search.best_estimator['rmse']

In [137]:
svd_grid_search.best_params

{'rmse': {'n_factors': 2, 'n_epochs': 20, 'biased': True}}

In [None]:
SVD(n_factors=2,n_epochs=20,biased=True)

In [138]:
# retrain on the whole set A
trainset_svd = svd_data.build_full_trainset()
best_svd_algo.fit(trainset_svd)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc183e625b0>

In [139]:
predictions = best_svd_algo.test(trainset_svd.build_testset())
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

Biased accuracy on A,   RMSE: 0.9824


0.9823613584790697

In [140]:
# Compute unbiased accuracy on B
testset_svd = svd_data.construct_testset(B_raw_ratings_svd)  # testset is now the set B
predictions = best_svd_algo.test(testset_svd)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)

Unbiased accuracy on B, RMSE: 0.9687


0.9687055612776263

#### Attempt new grid search params with regularization


In [None]:
SVD(

#### Attempt on trainset

In [39]:
#Instantiate our grid search & fit to trainset
svd_grid_search_ts = GridSearchCV(algo_class=SVD,param_grid=SVD_parm_grid,measures=['rmse'],cv=5,return_train_measures=True)
svd_grid_search_ts.fit(surprise_data)

In [47]:
svd_grid_search_ts.best_estimator

{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc102c68580>}

In [53]:
best_svd_algo = svd_grid_search_ts.best_estimator['rmse']

In [52]:
best_svd_algo

{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc102c68580>}

In [58]:
best_svd_algo.fit(trainset)
predictions = best_svd_algo.test(testset)
accuracy.rmse(predictions)

RMSE: 1.0836


1.0836471641265648

### Exploring users who rate everything a 5

In [81]:
high_ratings = df.groupby("reviewerID")["overall",].mean().sort_values(ascending=False)
high_ratings.value_counts()

5.000000    2822
4.000000    1615
4.200000     900
4.400000     863
4.600000     861
            ... 
3.648649       1
4.243243       1
3.694118       1
4.710000       1
4.048780       1
Name: overall, Length: 795, dtype: int64

In [72]:
high_ratings

reviewerID
A4UHZXSLMBWT2     5.0
A1ORLBQV893JF0    5.0
A2RJT3IE2T6KXJ    5.0
ANOJX4RAUJ9HL     5.0
A15QGN6UXJVW9G    5.0
                 ... 
A2MHHSACEJANSX    1.0
A1KLA02LZXAT46    1.0
A1GQLVT0SWAWU     1.0
A2DPSPXFJ507C0    1.0
A1W522Z24EPBJB    1.0
Name: overall, Length: 22363, dtype: float64

In [73]:
len(df)

198502

In [80]:
df["reviewerID"].value_counts()

A2V5R832QCSOMX    204
ALNFHVS3SC4FV     192
AKMEY1BSHSDG7     182
A3KEZLJ59C1JVH    154
ALQGOMOY1F5X9     150
                 ... 
A2WQPYI2WI6U7X      5
A23164157UKPXV      5
A3PLXJACVMU6NZ      5
A1AFKC16E1HZ1S      5
AC5J3VHPGDSJW       5
Name: reviewerID, Length: 22363, dtype: int64

In [83]:
df.groupby("reviewerID")["overall"].mean().sort_values().value_counts()

5.000000    2822
4.000000    1615
4.200000     900
4.400000     863
4.600000     861
            ... 
3.896104       1
2.416667       1
4.559322       1
3.662338       1
3.105263       1
Name: overall, Length: 795, dtype: int64

In [87]:
low_raters = df[df.groupby("reviewID”)[“overall”].mean()==1]

SyntaxError: EOL while scanning string literal (<ipython-input-87-0be5a2e74049>, line 1)

In [96]:
avg_rating_user = df.groupby("reviewerID")["overall","reviewerID"].mean().sort_values("overall")
avg_rating_user

  avg_rating_user = df.groupby("reviewerID")["overall","reviewerID"].mean().sort_values("overall")


Unnamed: 0_level_0,overall
reviewerID,Unnamed: 1_level_1
A1W522Z24EPBJB,1.0
A2DPSPXFJ507C0,1.0
A1GQLVT0SWAWU,1.0
A1KLA02LZXAT46,1.0
A2MHHSACEJANSX,1.0
...,...
A15QGN6UXJVW9G,5.0
ANOJX4RAUJ9HL,5.0
A2RJT3IE2T6KXJ,5.0
A1ORLBQV893JF0,5.0


In [101]:
count_rating_user = df.groupby("reviewerID")["overall","reviewerID"].count().sort_values("overall")
count_rating_user

  count_rating_user = df.groupby("reviewerID")["overall","reviewerID"].count().sort_values("overall")


Unnamed: 0_level_0,overall,reviewerID
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A1TQ2IR4LZWJDI,5,5
A20H2Q6KA5D4B3,5,5
A3G6TBNZG4XC75,5,5
A20H1A2Q7LO87Z,5,5
A3G78MIYEFPHEC,5,5
...,...,...
ALQGOMOY1F5X9,150,150
A3KEZLJ59C1JVH,154,154
AKMEY1BSHSDG7,182,182
ALNFHVS3SC4FV,192,192


In [99]:
low_rating_user = avg_rating_user[avg_rating_user["overall"]==1.0]
low_rating_user

Unnamed: 0_level_0,overall
reviewerID,Unnamed: 1_level_1
A1W522Z24EPBJB,1.0
A2DPSPXFJ507C0,1.0
A1GQLVT0SWAWU,1.0
A1KLA02LZXAT46,1.0
A2MHHSACEJANSX,1.0
A2RJTIE73NPN3C,1.0
ASWIC85F71H4J,1.0
A2TBE0N8JN6H4K,1.0
A1GPPMHYM6SMEW,1.0


In [100]:
high_rating_user = avg_rating_user[avg_rating_user["overall"]==5.0]
high_rating_user

Unnamed: 0_level_0,overall
reviewerID,Unnamed: 1_level_1
A2FINIRQNXOTI,5.0
ATWS89FH6Y6S4,5.0
A16Q479PYT0G6N,5.0
A3OKW5VRXZG3OQ,5.0
A3O9Q3154FPZLL,5.0
...,...
A15QGN6UXJVW9G,5.0
ANOJX4RAUJ9HL,5.0
A2RJT3IE2T6KXJ,5.0
A1ORLBQV893JF0,5.0


In [108]:
high_rating_user_ids = list(high_rating_user.index)
high_rating_user_ids

['A2FINIRQNXOTI',
 'ATWS89FH6Y6S4',
 'A16Q479PYT0G6N',
 'A3OKW5VRXZG3OQ',
 'A3O9Q3154FPZLL',
 'A2FJRU6RKAEAYX',
 'A1I9YRWNWRO3BY',
 'A2FQ1Q3V31QLQB',
 'A8WJERXUE7P43',
 'A2FAVDANCZYKNX',
 'A3OK7MGXBAK4JM',
 'A1YUGBWG0G9REI',
 'A1BOW2E468Y7Q1',
 'A8TPE62B0UCCE',
 'A2FKG99KYKTZQO',
 'A3OQ4I8U6TYCVE',
 'A3OOIWLW0Y01C2',
 'A2FLXE8YW1LMBN',
 'A1BPCAYEK3T1V7',
 'A2FGQ0JGXYYW6D',
 'A2FM0GJWDCQNFD',
 'A8QXLDP4JBE3K',
 'A3OPVTWOL57FCW',
 'A2FM5AJ7QH33S0',
 'A2FOR1IJSX76LQ',
 'A3OP6HJMA7SKDB',
 'A2FI3GF4Z7QNO',
 'A3OHN0JHXJ9BLA',
 'A2FEJKLHXJY7JT',
 'A1BO0E7TFN58TS',
 'A1YX68282RPR0S',
 'A2FC89NUP5CL8B',
 'A16Q29GKE0WNW4',
 'A3OD3HNHVSYJCR',
 'ATVAMZS8V7AZ6',
 'ATZFZONPLXQJN',
 'A3O27SJDAU2UAG',
 'ATSVUBE2115N1',
 'A3NPO4ECDIMF92',
 'A2GFS17LT7K6QV',
 'ATFF039E40YCS',
 'A1YQZCX2K3ISX5',
 'A94FW86TF1HL7',
 'A1YRHLWLQYRXKH',
 'A3NSLQZTVCO7J6',
 'A3NSLW1M3469ST',
 'A2GDP615CA6E9J',
 'ATH3U4X61L8D5',
 'A2GD4LJFKOZ9YZ',
 'A93DR0MWWC2B7',
 'ATHI1A9N10E6A',
 'A930TA3RDIJRS',
 'ATHP3KFKOKX6Z',
 'A16KNLK

In [109]:
df_cleaned = df[~df.reviewerID.isin(high_rating_user_ids)]
df_cleaned

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1,Don't waste your money,1391040000,"01 30, 2014"
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3,OK Palette!,1397779200,"04 18, 2014"
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4,great quality,1378425600,"09 6, 2013"
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2,Do not work on my face,1386460800,"12 8, 2013"
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3,It's okay.,1382140800,"10 19, 2013"
...,...,...,...,...,...,...,...,...,...
198497,A2BLFCOPSMBOZ9,B00LLPT4HI,Dave Edmiston,"[0, 0]",Just a little dab of this shea butter should b...,5,A little dab...,1405468800,"07 16, 2014"
198498,A1UQBFCERIP7VJ,B00LLPT4HI,Margaret Picky,"[0, 0]",This shea butter is completely raw and unrefin...,5,Pure organic raw shea butter,1405296000,"07 14, 2014"
198499,A35Q0RBM3YNQNF,B00LLPT4HI,M. Hill,"[0, 0]",The skin is the body's largest organ and it ab...,5,One Pound Organic Grade A Unrefined Shea Butter,1405468800,"07 16, 2014"
198500,A3LGT6UZL99IW1,B00LLPT4HI,"Richard C. Drew ""Anaal Nathra/Uthe vas Bethod...","[0, 0]",I have very dry elbows and knees. I have a to...,5,This stuff is amazing!,1405382400,"07 15, 2014"


In [111]:
#Testing a few high rating user IDs to see if we succesfully dropped from our cleaned df
df_cleaned[df_cleaned["reviewerID"]== "A3OOIWLW0Y01C2"]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime


In [113]:
#It worked!
df_cleaned[df_cleaned["reviewerID"]== "A2FM0GJWDCQNFD"]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime


In [112]:
df[df["reviewerID"]== "A3OOIWLW0Y01C2"]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
58231,A3OOIWLW0Y01C2,B001H9404A,"Amy... ""MoMmYof5""","[0, 0]","My 12 year old loves this eye shadow, its not...",5,Daughters Favorite...,1400716800,"05 22, 2014"
61529,A3OOIWLW0Y01C2,B001M9KX78,"Amy... ""MoMmYof5""","[0, 0]",I ordered this for Christmas for my husband an...,5,Hubby loves!,1401321600,"05 29, 2014"
89804,A3OOIWLW0Y01C2,B003H8180I,"Amy... ""MoMmYof5""","[1, 1]",Gosh I luv this lip stain. Its looks like I ha...,5,ohh myy...,1400889600,"05 24, 2014"
95593,A3OOIWLW0Y01C2,B003UY8WFS,"Amy... ""MoMmYof5""","[0, 0]",Entire makeup collection. I use it everyday. ...,5,I LOVE this..,1400716800,"05 22, 2014"
95964,A3OOIWLW0Y01C2,B003V265QW,"Amy... ""MoMmYof5""","[0, 0]",I ordered this as a Christmas present for all ...,5,Daughters LOVE!,1401321600,"05 29, 2014"
112873,A3OOIWLW0Y01C2,B004OJIZO6,"Amy... ""MoMmYof5""","[1, 1]",Saw and smelled this in the store. Loved it a...,5,My Son...,1370649600,"06 8, 2013"
