# Data Science Project

### Data Cleaning

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pickle

In [2]:
# Read the `_small` train and test CSVs; header=None because the files carry
# no header row (column names are assigned in the following cells).
# To use the full training file instead, swap the first path for
# 'amazon_review_full_csv/train.csv'.
df, df_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        'amazon_review_full_csv/train_small.csv',
        'amazon_review_full_csv/test_small.csv',
    )
)

In [3]:
# Name the three header-less columns, then coerce both text columns to str the
# same way the training frame is handled. Without the cast, an empty title or
# review is read as float NaN and the string cleaning applied to this frame
# later (which calls .lower()) would raise AttributeError.
df_test.columns = ['rating', 'title', 'review']
df_test["title"] = df_test["title"].astype(str)
df_test["review"] = df_test["review"].astype(str)

In [4]:
# Label the header-less training columns, then force both text columns to str
# in one pass (missing values become the literal string "nan").
df.columns = ['rating', 'title', 'review']
df = df.astype({"title": str, "review": str})
df.head()

Unnamed: 0,rating,title,review
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


In [5]:
# Build the working corpus by dropping every 3-star review; all other ratings
# are kept. (The empty placeholder frame that used to be created first was
# overwritten immediately, so it has been removed.)
corpus = df.loc[df["rating"] != 3]
corpus.head()

Unnamed: 0,rating,title,review
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...
5,5,There's a reason for the price,"There's a reason this CD is so expensive, even..."


In [6]:
import re
import string

# English stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

def clean_text_2(text):
    """Normalise one piece of review text before word counting.

    The steps run in this order:

    1. lowercase the text;
    2. delete every character in ``string.punctuation`` (nothing is inserted
       in its place, so words separated only by punctuation are merged);
    3. delete every run of word characters that contains at least one digit.

    Whitespace is never touched, so a removed token leaves its surrounding
    spaces behind.

    Parameters
    ----------
    text : str
        Raw text. Must be a ``str``; other types (e.g. float NaN) raise
        ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    # Raw strings: '\w' and '\d' are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape warning on modern interpreters.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
    
# `f_clean_2` is kept as an alias because the test-set cell further down
# applies it by this name; the lambda wrapper added nothing over the function.
f_clean_2 = clean_text_2

# Series.apply already returns a Series, so no extra pd.Series(...) wrapping.
clean2_rev = corpus["review"].apply(f_clean_2)
clean2_tit = corpus["title"].apply(f_clean_2)

# Start from a one-column copy holding only the rating, then add a single
# "text" column made of the cleaned title and cleaned review joined by a space.
df_clean2 = corpus[["rating"]].copy()
df_clean2["text"] = clean2_tit + " " + clean2_rev
df_clean2.head(10)

Unnamed: 0,rating,text
1,5,inspiring i hope a lot of people hear this cd ...
2,5,the best soundtrack ever to anything im readin...
3,4,chrono cross ost the music of yasunori misuda ...
4,5,too good to be true probably the greatest soun...
5,5,theres a reason for the price theres a reason ...
6,1,buyer beware this is a selfpublished book and ...
7,4,errors but great story i was a dissapointed to...
8,1,the worst a complete waste of time typographic...
9,1,oh please i guess you have to be a romance nov...
10,1,awful beyond belief i feel i have to write to ...


In [7]:
# Apply the same cleaning to the test set as was applied to the training corpus.
# NOTE(review): f_clean_2 calls .lower() on every cell, so a non-string value
# (e.g. NaN from an empty CSV field) would raise here - confirm that the
# title/review columns of df_test are str before this cell runs.
test_rev = df_test["review"].apply(f_clean_2)
test_tit = df_test["title"].apply(f_clean_2)

# One-column copy holding only the rating, plus the combined cleaned text.
test_clean = df_test[["rating"]].copy()
test_clean["text"] = test_tit + " " + test_rev
test_clean.head(10)

Unnamed: 0,rating,text
0,1,mens ultrasheer this model may be ok for seden...
1,4,surprisingly delightful this is a fast read fi...
2,2,works but not as advertised i bought one of th...
3,2,oh dear i was excited to find a book ostensibl...
4,2,incorrect disc i am a big jvc fan but i do not...
5,2,incorrect disc i love the style of this but af...
6,2,dvd menu select problems i cannot scroll throu...
7,3,my yo grandson loves it this movie with all o...
8,5,a cookbook every baker should own i found a co...
9,3,good basic the book is a basic how to book for...


In [8]:
# Collapse the corpus to one row per distinct rating (in order of first
# appearance): the "text" of each row is every cleaned review with that
# rating, joined by single spaces.
ratings = df_clean2["rating"].unique()
rows = [
    df_clean2.loc[df_clean2["rating"] == rating, "text"].str.cat(sep=" ")
    for rating in ratings
]
data_clean = pd.DataFrame({"rating": ratings, "text": rows})
data_clean.head()

Unnamed: 0,rating,text
0,5,inspiring i hope a lot of people hear this cd ...
1,4,chrono cross ost the music of yasunori misuda ...
2,1,buyer beware this is a selfpublished book and ...
3,2,sizes recomended in the size chart are not rea...


In [9]:
stop_words = nltk.corpus.stopwords.words('english')
# Each per-rating document is the concatenation of every review with that
# rating, so the membership test below runs once per word of a very long
# string. A set makes each lookup O(1); `stop_words` itself stays a list so
# any other cell that relies on list behaviour is unaffected.
stop_word_set = set(stop_words)

data_clean_stop = pd.DataFrame()
data_clean_stop["rating"] = data_clean["rating"].unique()
# Split on whitespace, drop stop words, and re-join with single spaces.
rows = [
    " ".join(word for word in document.split() if word not in stop_word_set)
    for document in data_clean["text"]
]
data_clean_stop["text"] = rows
data_clean_stop.head()

Unnamed: 0,rating,text
0,5,inspiring hope lot people hear cd need strong ...
1,4,chrono cross ost music yasunori misuda without...
2,1,buyer beware selfpublished book want know whyr...
3,2,sizes recomended size chart real sizes much sm...


In [10]:
# Remove stop words from every test review (one row per review, unlike the
# per-rating training documents). `stop_words` comes from an earlier cell; a
# set is used for the membership test so each lookup is O(1) instead of a
# scan of the whole list.
stop_word_set = set(stop_words)

test = pd.DataFrame()
test["rating"] = test_clean["rating"]
# Iterate the named "text" column directly instead of positional iloc[index, 1].
rows = [
    " ".join(word for word in review_text.split() if word not in stop_word_set)
    for review_text in test_clean["text"]
]
test["text"] = rows
test.head()

Unnamed: 0,rating,text
0,1,mens ultrasheer model may ok sedentary types i...
1,4,surprisingly delightful fast read filled unexp...
2,2,works advertised bought one chargersthe instru...
3,2,oh dear excited find book ostensibly muslim fe...
4,2,incorrect disc big jvc fan like model suspisci...


In [11]:
# Test subset without the 3-star reviews, matching the filter applied to the training corpus.
test_3 = test.loc[test["rating"].ne(3)]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix: one row per rating, one column per
# vocabulary term, each cell holding the raw term count.
cv = CountVectorizer()
data_cv = cv.fit_transform(data_clean_stop.text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean_stop.rating
data_dtm


Unnamed: 0_level_0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaa,aaaaaaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaayyyyyyyyyyyyyyyiiiiiiiiiiiiiiiaaaaaaaaaahhhhhh,...,überhot,überhottie,überleet,übermensch,übermoms,übernothing,überproducer,überraschenden,überrecommended,üs
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,164,87,8,6,1,1,1,0,1,1,...,0,0,1,0,0,0,0,1,1,0
4,210,109,1,1,0,1,0,0,0,0,...,2,1,0,1,1,0,1,0,0,1
1,166,75,3,3,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,195,98,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Persist the prepared frames; the order of writes matches the original cell.
for frame, pickle_path in (
    (data_dtm, "dtm_text_1.pkl"),
    (data_clean_stop, "data_clean_1.pkl"),
    (test_3, "test_without_3.pkl"),
    (test, "test_1.pkl"),
):
    frame.to_pickle(pickle_path)

In [14]:
# # Prepare nltk objects
# # stop_words = set(stopwords.words("english"))
# stop_words =nltk.corpus.stopwords.words('english')

In [15]:
# changed the list of stop words based on the exploratory data analysis:
# extra_stop_words = ['book','movie', 'one', 'like']
# stop_words.extend(extra_stop_words)

In [16]:
# for index, review in df_clean2.text.iteritems():
#     word_tokens = word_tokenize(review)
#     print(word_tokens)
#     filtered_review = [word for word in word_tokens if not word in stop_words]
#     filtered_review.copy())
    
# for i in range(0, df_clean2.text.size):
#     word_tokens = word_tokenize(df_clean2.text[i])
#     filtered_sentence = [word for word in word_tokens if not word in stop_words]
#     filtered_sentence = []
#     for word in word_tokens:
#         if word not in stop_words:
#             filtered_sentence.append(word)
#     df_clean2.title[i] = filtered_sentence#.copy()
# df_clean2.head()

In [17]:
# for i in range(0, df_clean2.review.size):
#     word_tokens = word_tokenize(df_clean2.review[i])
#     filtered_sentence = [word for word in word_tokens if not word in stop_words]
#     filtered_sentence = []
#     for word in word_tokens:
#         if word not in stop_words:
#             filtered_sentence.append(word)
#     df_clean2.review[i] = filtered_sentence.copy()

In [18]:
# # Now create Document-Term Matrix: Step1
# df_clean3 = pd.DataFrame({'rating':[-2,-1,1,2], 'title': [[],[],[],[]], 'review':[[],[],[],[]], 'text':[[],[],[],[]]})
# df_clean3 = df_clean3.set_index('rating')
# for index, row in df_clean2.iterrows():
#     for val in row.title:
#         df_clean3.loc[row.rating].title.append(val)
#         df_clean3.loc[row.rating].text.append(val)
#     for val in row.review:
#         df_clean3.loc[row.rating].review.append(val)
#         df_clean3.loc[row.rating].text.append(val)
# df_clean3.head()

In [19]:
# # change data set to dtm
# def to_dtm(data, col):
#     for rat in df_clean3.index:
#         for val in df_clean3.loc[rat, col]:
#             if val in data.columns:
#                 data.loc[rat, val] = data.loc[rat, val]+1
#             else:
#                 data[val] = [0,0,0,0]
#                 data.loc[rat, val] = data.loc[rat, val]+1

In [20]:
# #DTM Step2 - Title
# dtm_title = pd.DataFrame({'rating':[-2,-1,1,2]})
# dtm_title = dtm_title.set_index('rating')
# to_dtm(dtm_title, "title")
# dtm_title.head()

In [21]:
# #DTM Step2 - Review
# dtm_review = pd.DataFrame({'rating':[-2,-1,1,2]})
# dtm_review = dtm_review.set_index('rating')
# to_dtm(dtm_review, "review")
# dtm_review.head()

In [22]:
# #DTM Step2 - both
# dtm_text = pd.DataFrame({'rating':[-2,-1,1,2]})
# dtm_text = dtm_text.set_index('rating')
# to_dtm(dtm_text, "text")
# dtm_text.head()

In [23]:
# dtm_text.to_pickle("dtm_text.pkl")
# dtm_title.to_pickle("dtm_title.pkl")
# dtm_review.to_pickle("dtm_review.pkl")
# corpus.to_pickle("corpus.pkl")
# df_clean2.to_pickle("data_clean.pkl")