In [1]:
# In this notebook:
# Clean the summary and review text fields of all game reviews
# Save to csv for easy access

In [25]:
import numpy as np
import pandas as pd
import json

In [11]:
pd.set_option('display.max_colwidth', 100)

In [4]:
# works for full version, however load time is slow
# method would be problematic if too many subsets...
# i.e. list in column helpful

gorig = pd.read_json('Video_Games_5.json', lines=True)

print(f'Dataset shape is {gorig.shape}')

gorig.head()

Dataset shape is (231780, 9)


Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,700099867,"[8, 12]",1,Installing the game was a struggle (because of...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,1341792000
1,700099867,"[0, 0]",4,If you like rally cars get this game you will ...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,1372550400
2,700099867,"[0, 0]",1,1st shipment received a book instead of the ga...,"06 28, 2014",A1INA0F5CWW3J4,"Amazon Shopper ""Mr.Repsol""",Wrong key,1403913600
3,700099867,"[7, 10]",3,"I got this version instead of the PS3 version,...","09 14, 2011",A1DLMTOTHQ4AST,ampgreen,"awesome game, if it did not crash frequently !!",1315958400
4,700099867,"[2, 2]",4,I had Dirt 2 on Xbox 360 and it was an okay ga...,"06 14, 2011",A361M14PU2GUEG,"Angry Ryan ""Ryan A. Forrest""",DIRT 3,1308009600


In [6]:
print(f"overall has unique values of: {gorig['overall'].unique()}")

overall has unique values of: [1 4 3 5 2]


## Preprocess text

In [8]:
from nltk.corpus import stopwords

In [9]:
gorig['cleansum'] = gorig['summary']
gorig['cleantxt'] = gorig['reviewText']

gorig.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,cleansum,cleantxt
0,700099867,"[8, 12]",1,Installing the game was a struggle (because of...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,1341792000,Pay to unlock content? I don't think so.,Installing the game was a struggle (because of...
1,700099867,"[0, 0]",4,If you like rally cars get this game you will ...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,1372550400,Good rally game,If you like rally cars get this game you will ...


In [10]:
gorig['cleansum'] = gorig['cleansum'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
gorig['cleantxt'] = gorig['cleantxt'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))

gorig[['cleansum', 'cleantxt']].head(2)

Unnamed: 0,cleansum,cleantxt
0,pay to unlock content? i don't think so.,installing the game was a struggle (because of...
1,good rally game,if you like rally cars get this game you will ...


In [13]:
def remove_sym(x):
    new_str = ''
    for c in x:
        if (c.isalnum() == False) & (c != ' '):
            pass
        else:
            new_str += c
    return new_str

In [14]:
gorig['cleansum'] = gorig['cleansum'].apply(remove_sym)
gorig['cleantxt'] = gorig['cleantxt'].apply(remove_sym)

gorig[['cleansum', 'cleantxt']].head(2)

Unnamed: 0,cleansum,cleantxt
0,pay to unlock content i dont think so,installing the game was a struggle because of games for windows live bugssome championship races...
1,good rally game,if you like rally cars get this game you will have funit is more oriented to 34european market34...


In [22]:
def remove_num(x):
    new_str = ''
    for c in x:
        if (c.isnumeric() == True):
            pass
        else:
            new_str += c
    return new_str

In [23]:
gorig['cleansum'] = gorig['cleansum'].apply(remove_num)
gorig['cleantxt'] = gorig['cleantxt'].apply(remove_num)

gorig[['cleansum', 'cleantxt']].head(2)

Unnamed: 0,cleansum,cleantxt
0,pay to unlock content i dont think so,installing the game was a struggle because of games for windows live bugssome championship races...
1,good rally game,if you like rally cars get this game you will have funit is more oriented to european market sin...


In [24]:
stop = stopwords.words('english')

gorig['cleansum'] = gorig['cleansum'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))
gorig['cleantxt'] = gorig['cleantxt'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))

gorig[['cleansum', 'cleantxt']].head(2)

Unnamed: 0,cleansum,cleantxt
0,pay unlock content dont think,installing game struggle games windows live bugssome championship races cars unlocked buying add...
1,good rally game,like rally cars get game funit oriented european market since america isnt huge rally fan party ...


In [26]:
sum_words = pd.Series(' '.join(gorig['cleansum']).split()).value_counts()
txt_words = pd.Series(' '.join(gorig['cleantxt']).split()).value_counts()

print(f'Summary unique words: {len(sum_words)}')
print(f'Review text unique words: {len(txt_words)}')

Summary unique words: 32172
Review text unique words: 541498


In [29]:
sum_rare = sum_words[(sum_words <= 3)]
txt_rare = txt_words[(txt_words <= 4)]

print(f'Rare summary word count: {len(sum_rare)}')
print(f'Rare review text word count: {len(txt_rare)}')

Rare summary word count: 23554
Rare review text word count: 471180


In [30]:
gorig['cleansum'] = gorig['cleansum'].apply(lambda x: " ".join(x for x in str(x).split() if x not in sum_rare))
gorig['cleantxt'] = gorig['cleantxt'].apply(lambda x: " ".join(x for x in str(x).split() if x not in txt_rare))

gorig[['cleansum', 'cleantxt']].head(5)

Unnamed: 0,cleansum,cleantxt
0,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...
1,good rally game,like rally cars get game funit oriented european market since america isnt huge rally fan party ...
2,wrong key,st shipment received book instead shipment got fake one game arrived wrong key inside sealed box...
3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...
4,dirt,dirt xbox okay game started playing games laptop bought new games build collection game fun play...


In [40]:
try:
    gorig.to_csv('amazon_games_all.csv',mode='x')
    print('File saved')
    
except:
    print('File exists')

File saved


In [47]:
gclean = gorig[['asin','helpful','overall','cleansum','cleantxt']]

gclean.head()

Unnamed: 0,asin,helpful,overall,cleansum,cleantxt
0,700099867,"[8, 12]",1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...
1,700099867,"[0, 0]",4,good rally game,like rally cars get game funit oriented european market since america isnt huge rally fan party ...
2,700099867,"[0, 0]",1,wrong key,st shipment received book instead shipment got fake one game arrived wrong key inside sealed box...
3,700099867,"[7, 10]",3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...
4,700099867,"[2, 2]",4,dirt,dirt xbox okay game started playing games laptop bought new games build collection game fun play...


In [71]:
def ret_first(x):
    return x[0]

def ret_second(x):
    return x[1]

In [73]:
gclean['h_first'] = gclean['helpful'].apply(ret_first)
gclean['h_second'] = gclean['helpful'].apply(ret_second)

gclean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,asin,helpful,overall,cleansum,cleantxt,h_first,h_second
0,700099867,"[8, 12]",1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...,8,12
1,700099867,"[0, 0]",4,good rally game,like rally cars get game funit oriented european market since america isnt huge rally fan party ...,0,0
2,700099867,"[0, 0]",1,wrong key,st shipment received book instead shipment got fake one game arrived wrong key inside sealed box...,0,0
3,700099867,"[7, 10]",3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...,7,10
4,700099867,"[2, 2]",4,dirt,dirt xbox okay game started playing games laptop bought new games build collection game fun play...,2,2


In [76]:
gclean = gclean[gclean['h_second'] != 0]

print(f'Removing 0 ratings, gclean shape is {gclean.shape}')

gclean.head(2)

Removing 0 ratings, gclean shape is (139855, 7)


Unnamed: 0,asin,helpful,overall,cleansum,cleantxt,h_first,h_second
0,700099867,"[8, 12]",1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...,8,12
3,700099867,"[7, 10]",3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...,7,10


In [79]:
gclean['helppercent'] = gclean['h_first'] / gclean['h_second']

gclean.head(5)

Unnamed: 0,asin,helpful,overall,cleansum,cleantxt,h_first,h_second,helppercent
0,700099867,"[8, 12]",1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...,8,12,0.666667
3,700099867,"[7, 10]",3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...,7,10,0.7
4,700099867,"[2, 2]",4,dirt,dirt xbox okay game started playing games laptop bought new games build collection game fun play...,2,2,1.0
6,700099867,"[11, 13]",5,step dirt terrific,loved playing dirt thought graphics good purchased dirt addition otherand graphics absolutely go...,11,13,0.846154
7,700099867,"[1, 4]",1,crash correct name aka microsoft,cant tell piece dog game like everything else microsoft makes doesnt work going take cue apple m...,1,4,0.25


In [80]:
gclean = gclean[gclean['helppercent'] >= 0.50]

print(f'With helful rating >= 50%, gclean shape is {gclean.shape}')

gclean.head(5)

With helful rating >= 50%, gclean shape is (98144, 8)


Unnamed: 0,asin,helpful,overall,cleansum,cleantxt,h_first,h_second,helppercent
0,700099867,"[8, 12]",1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...,8,12,0.666667
3,700099867,"[7, 10]",3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...,7,10,0.7
4,700099867,"[2, 2]",4,dirt,dirt xbox okay game started playing games laptop bought new games build collection game fun play...,2,2,1.0
6,700099867,"[11, 13]",5,step dirt terrific,loved playing dirt thought graphics good purchased dirt addition otherand graphics absolutely go...,11,13,0.846154
9,700099867,"[1, 1]",2,couldnt get one work,still havent figured one everything instructed game never installed strange since dont like rate...,1,1,1.0


In [83]:
gclean.pop('helpful')
gclean.pop('h_first')
gclean.pop('h_second')

gclean.head()

Unnamed: 0,asin,overall,cleansum,cleantxt,helppercent
0,700099867,1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...,0.666667
3,700099867,3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...,0.7
4,700099867,4,dirt,dirt xbox okay game started playing games laptop bought new games build collection game fun play...,1.0
6,700099867,5,step dirt terrific,loved playing dirt thought graphics good purchased dirt addition otherand graphics absolutely go...,0.846154
9,700099867,2,couldnt get one work,still havent figured one everything instructed game never installed strange since dont like rate...,1.0


In [85]:
gclean['cleanboth'] = gclean['cleansum'] + ' ' + gclean['cleantxt']

gclean.head(2)

Unnamed: 0,asin,overall,cleansum,cleantxt,helppercent,cleanboth
0,700099867,1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...,0.666667,pay unlock content dont think installing game struggle games windows live championship races car...
3,700099867,3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...,0.7,awesome game crash frequently got version instead ps version turned mistake console versions gam...


In [88]:
gclean = gclean[['asin', 'helppercent', 'overall', 'cleansum', 'cleantxt', 'cleanboth']]

gclean.head(2)

Unnamed: 0,asin,helppercent,overall,cleansum,cleantxt,cleanboth
0,700099867,0.666667,1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...,pay unlock content dont think installing game struggle games windows live championship races car...
3,700099867,0.7,3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...,awesome game crash frequently got version instead ps version turned mistake console versions gam...


In [90]:
gclean.reset_index(inplace=True)

gclean.head(2)

Unnamed: 0,index,asin,helppercent,overall,cleansum,cleantxt,cleanboth
0,0,700099867,0.666667,1,pay unlock content dont think,installing game struggle games windows live championship races cars unlocked buying addon game p...,pay unlock content dont think installing game struggle games windows live championship races car...
1,3,700099867,0.7,3,awesome game crash frequently,got version instead ps version turned mistake console versions games look percent good pc versio...,awesome game crash frequently got version instead ps version turned mistake console versions gam...


In [93]:
try:
    gclean.to_csv('amazon_games_clean.csv',mode='x')
    print('File saved')
    
except:
    print('File exists')

File saved


In [None]:
# this concludes pt 2