In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
# Tab-separated source file; only the two columns named in col_list are read.
data_file = "~/Documents/python/project02/amazon_musical_instru_star_rev_only.tsv"
col_list = ["star_rating", "review_body"]

# Only the first 20,000 rows are loaded. To read the whole file, drop `nrows`
# (the earlier full-file variant also passed low_memory=False).
reviews_df = pd.read_csv(
    data_file,
    sep="\t",
    usecols=col_list,
    nrows=20000,
)

In [3]:
# Replace missing values column by column: a missing star rating becomes 0
# and a missing review body becomes the empty string.
for column, filler in (('star_rating', 0), ('review_body', '')):
    reviews_df[column] = reviews_df[column].fillna(filler)

In [None]:
# Preview the first 15 reviews.
reviews_df.head(n=15)

In [4]:
# Summarize the frame: columns, non-null counts, dtypes and memory usage.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   star_rating  20000 non-null  int64 
 1   review_body  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [None]:
# Preview the last 10 reviews.
reviews_df.tail(n=10)

In [5]:
# Summary statistics of the star rating across the 20,000 loaded records.
star_ratings = reviews_df['star_rating']
for stat_label, stat_value in (
    ("Min star:", star_ratings.min()),
    ("Max star:", star_ratings.max()),
    ("Average star:", star_ratings.mean()),
    ("Median star:", star_ratings.median()),
):
    print(stat_label, stat_value)

Min star: 1
Max star: 5
Average star: 4.28765
Median star: 5.0


In [6]:
def count_words(data):
    """Return the number of whitespace-separated tokens in the string `data`."""
    tokens = data.split()
    return len(tokens)

In [7]:
def count_excla(data):
    """Return how many "!" characters occur in the string `data`."""
    exclamation_total = data.count("!")
    return exclamation_total

In [8]:
def count_ques(data):
    """Return how many "?" characters occur in the string `data`."""
    question_total = data.count("?")
    return question_total

In [9]:
def remove_special_chars(data):
    """Split a review into lowercase words made only of alphanumeric characters.

    Every literal "<br />" in `data` is first replaced by a space. The text is
    then lowercased and split on whitespace, each word is stripped of all
    non-alphanumeric characters, and words that end up empty are discarded.

    Returns the remaining words as a list, in their original order.
    """
    text = data.replace("<br />", " ")
    stripped_words = (
        "".join(filter(str.isalnum, token))
        for token in text.lower().split()
    )
    # Tokens made only of punctuation collapse to "" and are dropped here.
    return [word for word in stripped_words if word]

In [10]:
def get_uniq_words(data):
    """Count how often each word occurs in the iterable `data`.

    Returns a dictionary mapping every distinct word to its number of
    occurrences; keys appear in order of first occurrence.
    """
    tally = {}
    for token in data:
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1
    return tally

In [11]:
# Count words, exclamation marks and question marks for each review and add
# one column per count to the dataframe.
# The counts are built by iterating the `review_body` column directly. The
# previous `iterrows()` loop looked each row up again with `.iloc[index]`,
# i.e. it fed index *labels* to a *positional* lookup (only correct while the
# index is the default 0..n-1 range) and re-fetched the row three times.
review_bodies = reviews_df['review_body']
word_count = [count_words(body) for body in review_bodies]
excla_count = [count_excla(body) for body in review_bodies]
ques_count = [count_ques(body) for body in review_bodies]

reviews_df["num_of_words"] = word_count
reviews_df["num_exclamations_marks"] = excla_count
reviews_df["num_questions_marks"] = ques_count

In [12]:
# Word-count statistics over every loaded review.
print("Stats on word count for all reviews")
print("-----------------------------------")
words_col = reviews_df['num_of_words']
for stat_label, stat_value in (
    ("Min number of words:", words_col.min()),
    ("Max number of words:", words_col.max()),
    ("Average number of words:", words_col.mean()),
    ("Median number of words:", words_col.median()),
):
    print(stat_label, stat_value)

Stats on word count for all reviews
-----------------------------------
Min number of words: 0
Max number of words: 2163
Average number of words: 41.6003
Median number of words: 17.0


In [13]:
# Exclamation-mark and question-mark statistics over every loaded review.
excla_col = reviews_df['num_exclamations_marks']
ques_col = reviews_df['num_questions_marks']

print("Stats on exclamation marks for all reviews")
print("-----------------------------------")
for stat_label, stat_value in (
    ("Min number of exclamation marks:", excla_col.min()),
    ("Max number of exclamation marks:", excla_col.max()),
    ("Average number of exclamation marks:", excla_col.mean()),
    ("Median number of exclamation marks:", excla_col.median()),
):
    print(stat_label, stat_value)
print('\n')
print("Stats on question marks for all reviews")
print("-----------------------------------")
for stat_label, stat_value in (
    ("Min number of question marks:", ques_col.min()),
    ("Max number of question marks:", ques_col.max()),
    ("Average number of question marks:", ques_col.mean()),
    ("Median number of question marks:", ques_col.median()),
):
    print(stat_label, stat_value)

Stats on exclamation marks for all reviews
-----------------------------------
Min number of exclamation marks: 0
Max number of exclamation marks: 54
Average number of exclamation marks: 0.4231
Median number of exclamation marks: 0.0


Stats on question marks for all reviews
-----------------------------------
Min number of question marks: 0
Max number of question marks: 13
Average number of question marks: 0.0337
Median number of question marks: 0.0


In [14]:
# Per star rating, count the non-null entries of every other column,
# which gives the number of reviews that received each rating.
reviews_df.groupby('star_rating').count()

Unnamed: 0_level_0,review_body,num_of_words,num_exclamations_marks,num_questions_marks
star_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1482,1482,1482,1482
2,817,817,817,817
3,1402,1402,1402,1402
4,3064,3064,3064,3064
5,13235,13235,13235,13235


In [15]:
# Select the reviews rated exactly one star and summarize that subset.
is_one_star = reviews_df['star_rating'] == 1
onestar_reviews_df = reviews_df[is_one_star]
onestar_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482 entries, 17 to 19962
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   star_rating             1482 non-null   int64 
 1   review_body             1482 non-null   object
 2   num_of_words            1482 non-null   int64 
 3   num_exclamations_marks  1482 non-null   int64 
 4   num_questions_marks     1482 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 69.5+ KB


In [16]:
# Word, exclamation-mark and question-mark statistics for one-star reviews.
print("Stats on word, Exclamation & question mark count for 1 Star Reviews")
print("----------------------------------------")
words_col = onestar_reviews_df['num_of_words']
excla_col = onestar_reviews_df['num_exclamations_marks']
ques_col = onestar_reviews_df['num_questions_marks']
for stat_label, stat_value in (
    ("Min number of words:", words_col.min()),
    ("Max number of words:", words_col.max()),
    ("Average number of words:", words_col.mean()),
    ("Median number of words:", words_col.median()),
    ("Max number of Exclamation Marks:", excla_col.max()),
    ("Average number of Exclamation Marks:", excla_col.mean()),
    ("Median number of Exclamation Marks:", excla_col.median()),
    ("Max number of Question Marks:", ques_col.max()),
    ("Average number of Question Marks:", ques_col.mean()),
    ("Median number of Question Marks:", ques_col.median()),
):
    print(stat_label, stat_value)

Stats on word, Exclamation & question mark count for 1 Star Reviews
----------------------------------------
Min number of words: 0
Max number of words: 1477
Average number of words: 44.152496626180834
Median number of words: 25.0
Max number of Exclamation Marks: 14
Average number of Exclamation Marks: 0.4149797570850202
Median number of Exclamation Marks: 0.0
Max number of Question Marks: 5
Average number of Question Marks: 0.0620782726045884
Median number of Question Marks: 0.0


In [None]:
# Summarize the one-star subset without the word-count column.
# `drop` returns a new frame, so `onestar_reviews_df` itself is left untouched.
# It is a filtered selection of `reviews_df`; deleting a column from it in
# place (`del onestar_reviews_df[...]`) can trigger pandas'
# SettingWithCopyWarning and removes a column that later cells still display.
onestar_reviews_df.drop(columns=['num_of_words']).info()

In [18]:
# Drop the one-star reviews that contain neither "!" nor "?", keeping only
# those with at least one of the two marks.
no_exclamations = onestar_reviews_df['num_exclamations_marks'] == 0
no_questions = onestar_reviews_df['num_questions_marks'] == 0
plain_review_labels = onestar_reviews_df[no_exclamations & no_questions].index
onestar_filtered_df = onestar_reviews_df.drop(plain_review_labels)
onestar_filtered_df.head()

Unnamed: 0,star_rating,review_body,num_of_words,num_exclamations_marks,num_questions_marks
558,1,These just arrived in the mail today and I was...,102,2,0
575,1,Made too cheap !,4,1,0
585,1,Doesn't fit blue yeti!!!!,4,4,0
614,1,"Strings kept breaking, bad buy!",5,1,0
619,1,The plastic collar for the hook broke without ...,42,1,0


In [26]:
# Per-column non-null counts for the filtered one-star reviews that contain
# at least one exclamation mark.
has_exclamation = onestar_filtered_df['num_exclamations_marks'] != 0
onestar_filtered_df[has_exclamation].count()

star_rating               267
review_body               267
num_of_words              267
num_exclamations_marks    267
num_questions_marks       267
dtype: int64

In [28]:
# Show the last 30 filtered one-star reviews.
onestar_filtered_df.tail(n=30)

Unnamed: 0,star_rating,review_body,num_of_words,num_exclamations_marks,num_questions_marks
18497,1,Do not buy this piano (the Legato). If you wan...,166,0,1
18529,1,iRig has the worst customer support. The Light...,208,0,1
18538,1,I can't believe how many 5 stars this unnatura...,167,4,0
18623,1,"Received this dolly, went to open it up and th...",93,2,0
18689,1,Broke after about a week of non continuous use...,43,0,1
18692,1,These really suck. Way too thick and don't so...,39,2,0
18768,1,Impossible to put all toghether!!!,5,3,0
18774,1,I purchased this for my wife back in December ...,398,3,0
18870,1,I DONT KNOW why the first chord is so loose an...,24,0,1
18902,1,This light started flickering after only five ...,69,0,1


In [None]:
# Select the reviews rated exactly two stars and summarize that subset.
is_two_star = reviews_df['star_rating'] == 2
twostar_reviews_df = reviews_df[is_two_star]
twostar_reviews_df.info()

In [None]:
# Word, exclamation-mark and question-mark statistics for two-star reviews.
print("Stats on word, exclamation & question mark count for 2 Star Reviews")
print("---------------------------------------")
words_col = twostar_reviews_df['num_of_words']
excla_col = twostar_reviews_df['num_exclamations_marks']
ques_col = twostar_reviews_df['num_questions_marks']
for stat_label, stat_value in (
    ("Min number of words:", words_col.min()),
    ("Max number of words:", words_col.max()),
    ("Average number of words:", words_col.mean()),
    ("Median number of words:", words_col.median()),
    ("Max number of exclamation marks:", excla_col.max()),
    ("Average number of exclamation marks:", excla_col.mean()),
    ("Median number of exclamation marks:", excla_col.median()),
    ("Max number of question marks:", ques_col.max()),
    ("Average number of question marks:", ques_col.mean()),
    ("Median number of question marks:", ques_col.median()),
):
    print(stat_label, stat_value)

In [None]:
# Select the reviews rated exactly three stars and summarize that subset.
is_three_star = reviews_df['star_rating'] == 3
threestar_reviews_df = reviews_df[is_three_star]
threestar_reviews_df.info()

In [None]:
# Word, exclamation-mark and question-mark statistics for three-star reviews.
# The question-mark labels now read "question marks" (previously "questions
# marks"), matching the wording used for every other star rating.
print("Stats on word, exclamation & question mark count for 3 Star Reviews")
print("------------------------------------------")
words_col = threestar_reviews_df['num_of_words']
excla_col = threestar_reviews_df['num_exclamations_marks']
ques_col = threestar_reviews_df['num_questions_marks']
for stat_label, stat_value in (
    ("Min number of words:", words_col.min()),
    ("Max number of words:", words_col.max()),
    ("Average number of words:", words_col.mean()),
    ("Median number of words:", words_col.median()),
    ("Max number of exclamation marks:", excla_col.max()),
    ("Average number of exclamation marks:", excla_col.mean()),
    ("Median number of exclamation marks:", excla_col.median()),
    ("Max number of question marks:", ques_col.max()),
    ("Average number of question marks:", ques_col.mean()),
    ("Median number of question marks:", ques_col.median()),
):
    print(stat_label, stat_value)

In [None]:
# Select the reviews rated exactly four stars and summarize that subset.
is_four_star = reviews_df['star_rating'] == 4
fourstar_reviews_df = reviews_df[is_four_star]
fourstar_reviews_df.info()

In [None]:
# Word, exclamation-mark and question-mark statistics for four-star reviews.
print("Stats on word, exclamation & question marks count for 4 Star Reviews")
print("-----------------------------------------")
words_col = fourstar_reviews_df['num_of_words']
excla_col = fourstar_reviews_df['num_exclamations_marks']
ques_col = fourstar_reviews_df['num_questions_marks']
for stat_label, stat_value in (
    ("Min number of words:", words_col.min()),
    ("Max number of words:", words_col.max()),
    ("Average number of words:", words_col.mean()),
    ("Median number of words:", words_col.median()),
    ("Max number of exclamation marks:", excla_col.max()),
    ("Average number of exclamation marks:", excla_col.mean()),
    ("Median number of exclamation marks:", excla_col.median()),
    ("Max number of question marks:", ques_col.max()),
    ("Average number of question marks:", ques_col.mean()),
    ("Median number of question marks:", ques_col.median()),
):
    print(stat_label, stat_value)

In [None]:
# Select the reviews rated exactly five stars and summarize that subset.
is_five_star = reviews_df['star_rating'] == 5
fivestar_reviews_df = reviews_df[is_five_star]
fivestar_reviews_df.info()

In [None]:
# Word, exclamation-mark and question-mark statistics for five-star reviews.
print("Stats on word, exclamation & question marks count for 5 Star Reviews")
print("-----------------------------------------")
words_col = fivestar_reviews_df['num_of_words']
excla_col = fivestar_reviews_df['num_exclamations_marks']
ques_col = fivestar_reviews_df['num_questions_marks']
for stat_label, stat_value in (
    ("Min number of words:", words_col.min()),
    ("Max number of words:", words_col.max()),
    ("Average number of words:", words_col.mean()),
    ("Median number of words:", words_col.median()),
    ("Max number of exclamation marks:", excla_col.max()),
    ("Average number of exclamation marks:", excla_col.mean()),
    ("Median number of exclamation marks:", excla_col.median()),
    ("Max number of question marks:", ques_col.max()),
    ("Average number of question marks:", ques_col.mean()),
    ("Median number of question marks:", ques_col.median()),
):
    print(stat_label, stat_value)

In [None]:
# All reviews with five or fewer words.
is_short_review = reviews_df['num_of_words'] <= 5
five_less_word_reviews_df = reviews_df[is_short_review]
five_less_word_reviews_df.info()

In [None]:
# Analyzing reviews with five or fewer words: per-column counts by star rating.
five_less_word_reviews_df.groupby('star_rating').count()

In [None]:
# All reviews with more than five words.
is_longer_review = reviews_df['num_of_words'] > 5
mt_five_word_reviews_df = reviews_df[is_longer_review]
mt_five_word_reviews_df.head()

In [None]:
# Word, exclamation-mark and question-mark statistics for the reviews with
# more than five words.
words_col = mt_five_word_reviews_df['num_of_words']
excla_col = mt_five_word_reviews_df['num_exclamations_marks']
ques_col = mt_five_word_reviews_df['num_questions_marks']
for stat_label, stat_value in (
    ("Min number of words:", words_col.min()),
    ("Max number of words:", words_col.max()),
    ("Average number of words:", words_col.mean()),
    ("Median number of words:", words_col.median()),
    ("Max number of exclamation marks:", excla_col.max()),
    ("Average number of exclamation marks:", excla_col.mean()),
    ("Median number of exclamation marks:", excla_col.median()),
    ("Max number of question marks:", ques_col.max()),
    ("Average number of question marks:", ques_col.mean()),
    ("Median number of question marks:", ques_col.median()),
):
    print(stat_label, stat_value)

In [None]:
# Per star rating, count the reviews that have more than five words.
grouped_review_df = mt_five_word_reviews_df.groupby('star_rating').count()
grouped_review_df.head()

In [None]:
# Bar chart of the per-rating count taken from the `num_of_words` column.
grouped_review_df.plot(kind='bar', y='num_of_words', title='Reviews with more than 5 words.')

In [None]:
# Bar chart of the per-rating count taken from the `num_exclamations_marks` column.
grouped_review_df.plot(kind='bar', y='num_exclamations_marks', title='Exclamation marks')

In [None]:
# One-star reviews with more than five words, renumbered from 0 so that row
# labels and row positions coincide.
one_star_mask = mt_five_word_reviews_df['star_rating'] == 1
mtfive_1star_reviews_df = mt_five_word_reviews_df[one_star_mask].reset_index(drop=True)
mtfive_1star_reviews_df.head()

In [None]:
# Collect the cleaned words of every one-star review (more than five words)
# into one flat list.
# The `review_body` column is iterated directly; the previous `iterrows()`
# loop re-fetched each row with `.iloc[index]`, which was only correct because
# the frame's index had been reset to 0..n-1, and never used `row`.
onestar_word_list = []
for review_text in mtfive_1star_reviews_df['review_body']:
    onestar_word_list.extend(remove_special_chars(review_text))

print(onestar_word_list)

In [None]:
# Compare the whitespace-split word total with the number of cleaned words.
raw_word_total = mtfive_1star_reviews_df['num_of_words'].sum()
cleaned_word_total = len(onestar_word_list)
print(raw_word_total)
print(cleaned_word_total)

In [None]:
# Count the occurrences of each distinct word in the one-star word list,
# then show the mapping and the number of distinct words.
onestar_uniq_words = get_uniq_words(onestar_word_list)
print(onestar_uniq_words)
print(len(onestar_uniq_words))

In [None]:
# Turn the word -> count mapping into (word, count) pairs and order them
# from the highest count to the lowest.
onestar_uniq_sorted = list(onestar_uniq_words.items())
onestar_uniq_sorted.sort(key=lambda pair: pair[1], reverse=True)


In [None]:
# printing the first 30 (word, count) pairs of the sorted list
print(onestar_uniq_sorted[:30])

In [None]:
# Stopword set for the one-star word analysis (includes the empty string).
onestar_stopwords = {
    "the", "i", "a", "and", "to", "amazon",
    "of", "this", "for", "is", "was",
    "in", "that", "on", "my", "you", "they",
    "have", "so", "as", "are", "at", "be",
    "",
}
stopword_total = len(onestar_stopwords)
print(stopword_total)

In [None]:
# Reviews with at most ten words.
at_most_ten_words = reviews_df['num_of_words'] <= 10
ten_words_df = reviews_df[at_most_ten_words]
ten_words_df.info()

In [None]:
# Reviews with at least 2000 words.
at_least_2000_words = reviews_df['num_of_words'] >= 2000
large_words_df = reviews_df[at_least_2000_words]
large_words_df.info()

In [None]:
# Series for the scatter plot: x is the word count, y is the star rating.
x = reviews_df['num_of_words']
y = reviews_df['star_rating']


In [None]:
# Scatter plot of `y` against `x` on a 15x10 inch figure.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` as a
# positional argument, so `sns.scatterplot(x, y)` raises a TypeError there.
plt.figure(figsize=(15,10))
sns.scatterplot(x=x, y=y)

Test area below

In [None]:
# Quick check of count_words on a sentence whose "?" is set off by spaces.
sample_sentence = "This is not such a long sentence ya ?"
print(count_words(sample_sentence))

In [None]:
# Inspect a single review: the whole row first, then only its text.
review_label = 19912
print(reviews_df.iloc[review_label])
print(reviews_df.loc[review_label, 'review_body'])

In [None]:
## testing re functionality
# The pattern must be a string: the previous bare `[']` was not valid Python,
# so this cell could not run (it also needs `import re` in the imports cell).
# NOTE(review): r"[']" (match each apostrophe) is assumed to be the intended
# character class; adjust the pattern if something else was meant.
words = re.findall(r"[']", "This isn't a crazy 22 test string . What exactly is it?")
print(words)
print(len(words))

In [None]:
# Check str.count on a sample string: question marks first, then exclamation marks.
ques = "?"
excla = "!"
test_str = "Wow! that! is totally awesome!! No?"
for mark in (ques, excla):
    print(test_str.count(mark))

In [None]:
# Show one review as stored and then as its list of cleaned words.
test_a = "This is a small review. Yes ^$ or no? I don't really know. Amazon !!"
sample_review = mt_five_word_reviews_df.iloc[100]['review_body']
print(sample_review)
print(remove_special_chars(sample_review))

In [None]:
# Print one review, a blank gap, then the occurrence count of each of its
# cleaned words.
picked_review = mt_five_word_reviews_df.iloc[10777]['review_body']
print(picked_review)
list_words = remove_special_chars(picked_review)
print("\n\n")

print(get_uniq_words(list_words))