In [56]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import gzip

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

import dask.dataframe as dd
from dask.multiprocessing import get

import swifter


In [3]:
meta_data = pd.read_csv('meta_new.csv')

In [5]:
meta_data.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
meta_data.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand,stem_words
0,0528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[['Electronics', 'GPS & Navigation', 'Vehicle ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",,"['predecessor', 'intellirout', 'devic', 'navig..."
1,0972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[['Electronics', 'Accessories & Supplies', 'Au...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B000X3KOD2', 'B0074FGR74', '...",VideoSecu,"['tv', 'mount', 'solut', 'lcd', 'tv', 'mount',..."
2,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[['Electronics', 'eBook Readers & Accessories']]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['B0035CLBT4', 'B004X18N24', '...",Barnes &amp; Noble,"['eread', 'nook', 'access', 'wifi', 'brows', '..."
3,140053271X,http://ecx.images-amazon.com/images/I/51jat7CV...,Barnes & Noble Nook Simple Touch Wi-Fi ReaderI...,"[['Electronics', 'eBook Readers & Accessories'...",Barnes &amp; Noble Nook Simple Touch eBook Rea...,79.49,,"{'also_bought': ['B007UXNHNM', 'B007UXNHGY', '...",Barnes &amp; Noble,"['wifi', 'touch', 'batteri', 'life', 'ink', 'd..."
4,1400532736,http://ecx.images-amazon.com/images/I/413fSdlM...,The NOOK Simple Touch eReader allows you to re...,"[['Electronics', 'eBook Readers & Accessories'...",Nook Simple Touch eReader,62.99,{'Electronics': 4945},"{'also_bought': ['B0055ZDRI2', 'B007UXNHGY', '...",Barnes &amp; Noble,"['eread', 'display', 'devic', 'conveni', 'read..."


In [7]:
len(meta_data)

26762

In [18]:
# get the stemmed or lemmatized words of product description 
#for word_type: use 'stem' for stemming and 'lemmatize' for lemmatization
def get_desc_words(text):
    word_type='lemmatize'
    nopunc= [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    words = [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    
    tagged = nltk.pos_tag(words) 
    
    nouns = []
    for word in tagged:
        if word[1]=='NN':
            nouns.append(word[0])

    if word_type=='stem' :
        ps = PorterStemmer()
        stemmed_words = []
        for w in nouns:
            stemmed_words.append(ps.stem(w))
        
        return stemmed_words
    
    if word_type=='lemmatize':
        lem = WordNetLemmatizer()
        lem_words = []
        for w in nouns:
            lem_words.append(lem.lemmatize(w))
        
        return lem_words


In [12]:
ddata = dd.from_pandas(meta_data, npartitions=5)

In [13]:
ddata.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand,stem_words
0,0528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[['Electronics', 'GPS & Navigation', 'Vehicle ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",,"['predecessor', 'intellirout', 'devic', 'navig..."
1,0972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[['Electronics', 'Accessories & Supplies', 'Au...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B000X3KOD2', 'B0074FGR74', '...",VideoSecu,"['tv', 'mount', 'solut', 'lcd', 'tv', 'mount',..."
2,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[['Electronics', 'eBook Readers & Accessories']]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['B0035CLBT4', 'B004X18N24', '...",Barnes &amp; Noble,"['eread', 'nook', 'access', 'wifi', 'brows', '..."
3,140053271X,http://ecx.images-amazon.com/images/I/51jat7CV...,Barnes & Noble Nook Simple Touch Wi-Fi ReaderI...,"[['Electronics', 'eBook Readers & Accessories'...",Barnes &amp; Noble Nook Simple Touch eBook Rea...,79.49,,"{'also_bought': ['B007UXNHNM', 'B007UXNHGY', '...",Barnes &amp; Noble,"['wifi', 'touch', 'batteri', 'life', 'ink', 'd..."
4,1400532736,http://ecx.images-amazon.com/images/I/413fSdlM...,The NOOK Simple Touch eReader allows you to re...,"[['Electronics', 'eBook Readers & Accessories'...",Nook Simple Touch eReader,62.99,{'Electronics': 4945},"{'also_bought': ['B0055ZDRI2', 'B007UXNHGY', '...",Barnes &amp; Noble,"['eread', 'display', 'devic', 'conveni', 'read..."


In [53]:
#meta_data['lem_words'] = ddata.map_partitions(lambda df: df.apply(lambda row: get_desc_words(row['description']), axis=1)).compute(scheduler=get)

In [57]:
## Using Swifter for applying the function
meta_data['lem_words'] = meta_data['description'].swifter.apply(get_desc_words)

Pandas Apply: 100%|██████████████████████████████████████████████████████████████| 26762/26762 [37:09<00:00, 12.00it/s]


In [70]:
meta_data.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand,stem_words,lem_words
0,0528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[['Electronics', 'GPS & Navigation', 'Vehicle ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",,"['predecessor', 'intellirout', 'devic', 'navig...","[predecessor, intelliroute, device, navigation..."
1,0972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[['Electronics', 'Accessories & Supplies', 'Au...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B000X3KOD2', 'B0074FGR74', '...",VideoSecu,"['tv', 'mount', 'solut', 'lcd', 'tv', 'mount',...","[tv, mount, solution, lcd, tv, mounting, hole,..."
2,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[['Electronics', 'eBook Readers & Accessories']]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['B0035CLBT4', 'B004X18N24', '...",Barnes &amp; Noble,"['eread', 'nook', 'access', 'wifi', 'brows', '...","[ereader, nook, access, wifi, browse, ebook, c..."
3,140053271X,http://ecx.images-amazon.com/images/I/51jat7CV...,Barnes & Noble Nook Simple Touch Wi-Fi ReaderI...,"[['Electronics', 'eBook Readers & Accessories'...",Barnes &amp; Noble Nook Simple Touch eBook Rea...,79.49,,"{'also_bought': ['B007UXNHNM', 'B007UXNHGY', '...",Barnes &amp; Noble,"['wifi', 'touch', 'batteri', 'life', 'ink', 'd...","[wifi, touch, battery, life, ink, display, w, ..."
4,1400532736,http://ecx.images-amazon.com/images/I/413fSdlM...,The NOOK Simple Touch eReader allows you to re...,"[['Electronics', 'eBook Readers & Accessories'...",Nook Simple Touch eReader,62.99,{'Electronics': 4945},"{'also_bought': ['B0055ZDRI2', 'B007UXNHGY', '...",Barnes &amp; Noble,"['eread', 'display', 'devic', 'conveni', 'read...","[ereader, display, device, convenience, readin..."


In [69]:
meta_data.to_csv('meta_data_latest.csv',index=False)

In [None]:
#meta_data['lem_words'] = meta_data['description'].apply(get_desc_words)

In [None]:
4:15

In [20]:
df = pd.read_csv('Amazon_Gt_10')

In [22]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [23]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,helpfulness_score,user_deviation,review_delay
0,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010",0.8,1.399189,6652800
1,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010",0.955556,0.624437,0
2,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010",0.9,0.510375,6566400
3,A3QH8VQDE7HZCR,972683275,costaricachris,"[15, 19]",Quality was excellent. Instructions were clear...,5.0,Real value for the money,1286236800,"10 5, 2010",0.789474,0.442171,2678400
4,A38FGQVJM18OWV,972683275,"George S. Mitchell ""gsmitchell""","[8, 18]",I checked around Amazon as well as some other ...,1.0,What a piece of junk!,1291161600,"12 1, 2010",0.444444,1.616074,7603200


In [24]:
len(df)

107502

In [35]:
#get the list of pids
pids = list(meta_data['asin'])
#make function

def review_in_products(pid):
    return pid in pids

In [36]:
# Some products with Null Description values have been removed .
# Reviews corresponding to those product are removed here

df_new = df[df['asin'].apply(review_in_products) == True]

In [38]:
len(df_new)

81620

In [47]:
df_new.to_csv('Amazon_Latest_Data.csv',index=False)

In [50]:
# get the stemmed words of  review text 
def get_review_words_stem(text):
    nopunc= [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    words = [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    ps = PorterStemmer()
    stemmed_words = []
    for w in words:
        stemmed_words.append(ps.stem(w))

    return stemmed_words
    
   

In [51]:
# get the lemmatized words of  review text 
def get_review_words_lem(text):
   
    nopunc= [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    words = [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]

    lem = WordNetLemmatizer()
    lem_words = []
    for w in words:
        lem_words.append(lem.lemmatize(w))

    return lem_words


TypeError: 'float' object is not iterable

In [65]:
df_new.dropna(subset=['reviewText'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [68]:
df_new['reviewText'].isnull().values.ravel().sum()


0

In [71]:
df_new['stem_words'] = df_new['reviewText'].swifter.apply(get_review_words_stem)

Pandas Apply: 100%|████████████████████████████████████████████████████████████| 81275/81275 [2:58:02<00:00,  5.67it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [76]:
df_new['stem_words'].head()[0]

['im',
 'profession',
 'otr',
 'truck',
 'driver',
 'bought',
 'tnd',
 '700',
 'truck',
 'stop',
 'hope',
 'make',
 'life',
 'easier',
 'rand',
 'mcnalli',
 'listeningfirst',
 'thing',
 'charg',
 'connect',
 'laptop',
 'instal',
 'softwar',
 'attempt',
 'updat',
 'softwar',
 'detect',
 'problem',
 'updat',
 'want',
 'home',
 'address',
 'could',
 'sent',
 'patch',
 'sd',
 'card',
 'hello',
 'dont',
 'think',
 'im',
 'unusu',
 'home',
 'address',
 'po',
 'box',
 'friend',
 'check',
 'weekli',
 'might',
 'get',
 'check',
 'everi',
 'six',
 'month',
 'live',
 'truck',
 'truck',
 'stop',
 'need',
 'make',
 'patch',
 'avail',
 'sd',
 'card',
 'send',
 'sd',
 'card',
 'truck',
 'stop',
 'devic',
 'sold',
 'ran',
 'updat',
 'program',
 'multipl',
 'time',
 'program',
 'said',
 'tnd',
 '700',
 'complet',
 'updatedi',
 'program',
 'height',
 '136',
 'length',
 '53',
 'weight',
 '80000',
 'rig',
 'told',
 'prefer',
 'highway',
 'park',
 'truck',
 'stop',
 'cincinnati',
 'oh',
 'area',
 'next',
 

In [None]:
df_new['lem_words']= df_new['reviewText'].swifter.apply(get_review_words_lem)

Pandas Apply: 100%|███████████████████████████████████████████████████████████| 81275/81275 [19:18:17<00:00,  4.03it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [78]:
df_new.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,helpfulness_score,user_deviation,review_delay,stem_words,lem_words
0,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010",0.8,1.399189,6652800,"[im, profession, otr, truck, driver, bought, t...","[im, professional, otr, truck, driver, bought,..."
1,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010",0.955556,0.624437,0,"[well, say, ive, unit, truck, four, day, prior...","[well, say, ive, unit, truck, four, day, prior..."
2,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010",0.9,0.510375,6566400,"[go, write, long, review, even, thought, unit,...","[going, write, long, review, even, thought, un..."
3,A3QH8VQDE7HZCR,972683275,costaricachris,"[15, 19]",Quality was excellent. Instructions were clear...,5.0,Real value for the money,1286236800,"10 5, 2010",0.789474,0.442171,2678400,"[qualiti, excel, instruct, clear, clear, tilt,...","[quality, excellent, instruction, clear, clear..."
4,A38FGQVJM18OWV,972683275,"George S. Mitchell ""gsmitchell""","[8, 18]",I checked around Amazon as well as some other ...,1.0,What a piece of junk!,1291161600,"12 1, 2010",0.444444,1.616074,7603200,"[check, around, amazon, well, site, decid, nee...","[checked, around, amazon, well, site, decided,..."


In [79]:
df_new.to_csv('Amazon_Latest.csv',index=False)

In [86]:
meta_data.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand,stem_words,lem_words
0,0528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[['Electronics', 'GPS & Navigation', 'Vehicle ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",,"['predecessor', 'intellirout', 'devic', 'navig...","[predecessor, intelliroute, device, navigation..."
1,0972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[['Electronics', 'Accessories & Supplies', 'Au...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B000X3KOD2', 'B0074FGR74', '...",VideoSecu,"['tv', 'mount', 'solut', 'lcd', 'tv', 'mount',...","[tv, mount, solution, lcd, tv, mounting, hole,..."
2,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[['Electronics', 'eBook Readers & Accessories']]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['B0035CLBT4', 'B004X18N24', '...",Barnes &amp; Noble,"['eread', 'nook', 'access', 'wifi', 'brows', '...","[ereader, nook, access, wifi, browse, ebook, c..."
3,140053271X,http://ecx.images-amazon.com/images/I/51jat7CV...,Barnes & Noble Nook Simple Touch Wi-Fi ReaderI...,"[['Electronics', 'eBook Readers & Accessories'...",Barnes &amp; Noble Nook Simple Touch eBook Rea...,79.49,,"{'also_bought': ['B007UXNHNM', 'B007UXNHGY', '...",Barnes &amp; Noble,"['wifi', 'touch', 'batteri', 'life', 'ink', 'd...","[wifi, touch, battery, life, ink, display, w, ..."
4,1400532736,http://ecx.images-amazon.com/images/I/413fSdlM...,The NOOK Simple Touch eReader allows you to re...,"[['Electronics', 'eBook Readers & Accessories'...",Nook Simple Touch eReader,62.99,{'Electronics': 4945},"{'also_bought': ['B0055ZDRI2', 'B007UXNHGY', '...",Barnes &amp; Noble,"['eread', 'display', 'devic', 'conveni', 'read...","[ereader, display, device, convenience, readin..."


In [93]:
meta_data[meta_data['asin']=='0972683275']['stem_words'].iloc[0]

"['tv', 'mount', 'solut', 'lcd', 'tv', 'mount', 'hole', 'pattern', 'tv', 'gaug', 'steel', 'construct', 'safeti', 'display', 'twolink', 'arm', 'design', 'extens', 'angl', 'vesa', 'plate', 'instal', 'postinstal', 'level', 'adjust', 'tv', 'arm', 'cabl', 'manag', 'ring', 'system', 'design', 'cabl', 'hardwar', 'note', 'product']"

In [94]:
def get_stem_sim_words(pid, stem_words) :
    # Get the stem words of the product
    desc_stem=meta_data[meta_data['asin']==pid]['stem_words'].iloc[0]
    # Find the Intersection of words
    sim_words=list(set(desc_stem).intersection(stem_words))
    # return the words
    return sim_words

In [96]:
def get_lem_sim_words(pid, lem_words) :
    # Get the stem words of the product
    desc_lem=meta_data[meta_data['asin']==pid]['lem_words'].iloc[0]
    # Find the Intersection of words
    sim_words=list(set(desc_lem).intersection(lem_words))
    # return the words
    return sim_words

In [95]:
df_new['stem_sim_words'] = df_new.swifter.apply(lambda row: get_stem_sim_words(row['asin'],row['stem_words']), axis=1)

Pandas Apply: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81275/81275 [04:18<00:00, 313.82it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [97]:
df_new['lem_sim_words'] = df_new.swifter.apply(lambda row: get_lem_sim_words(row['asin'],row['lem_words']), axis=1)

Pandas Apply: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81275/81275 [04:05<00:00, 331.45it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [99]:
df_new.tail()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,helpfulness_score,user_deviation,review_delay,stem_words,lem_words,stem_sim_words,lem_sim_words
107497,APRNS6DB68LLV,B00L26YDA4,"Rob Slaven ""slavenrm@gmail. com""","[5, 10]",As usual I received this item for free in exch...,5.0,Works brilliantly once you understand what it ...,1404172800,"07 1, 2014",0.5,0.597583,432000,"[usual, receiv, item, free, exchang, review, a...","[usual, received, item, free, exchange, review...",[],"[receiver, speaker, get, wifi, need]"
107498,A3A4ZAIBQWKOZS,B00L26YDA4,Stephen M. Lerch,"[10, 15]",Want to add wireless audio streaming to your h...,5.0,Smart,1403827200,"06 27, 2014",0.666667,0.546834,86400,"[want, add, wireless, audio, stream, home, the...","[want, add, wireless, audio, streaming, home, ...",[],"[receiver, music, connection, wifi, sabrent, s..."
107499,A18R0PC372EGL,B00L3YHF6O,Blues,"[7, 11]",Wow...that's about all I have to say. Having ...,5.0,To use a technical term...Wow!,1404950400,"07 10, 2014",0.636364,0.448464,259200,"[wowthat, say, prior, experi, bt, speaker, rel...","[wowthats, say, prior, experience, bt, speaker...",[],"[fidelity, pair, performance, sound, roar]"
107500,A3OOQH73VQ97VN,B00L3YHF6O,"Jem ""Microsoft""","[5, 12]",Update 14 July 2014: they have a white paper a...,5.0,Purely Cons Review,1404777600,"07 8, 2014",0.416667,0.742179,86400,"[updat, 14, juli, 2014, white, paper, creative...","[update, 14, july, 2014, white, paper, creativ...",[g],"[sbx, control, audio, pair, power, panel, roar]"
107501,A3A4ZAIBQWKOZS,B00L3YHF6O,Stephen M. Lerch,"[18, 23]",My short review:If you have the money to spend...,5.0,Best sounding speaker at this price range,1404691200,"07 7, 2014",0.782609,0.546834,0,"[short, reviewif, money, spend, speaker, dont,...","[short, reviewif, money, spend, speaker, dont,...","[n, w]","[control, audio, pc, connection, pair, line, p..."


In [100]:
df_new.to_csv('Amazon_Latest_Data.csv',index=False)

In [112]:
def get_set_length(words) :
    set_length = len(set(words))
    if(set_length==0):
        return 1
    return set_length

In [102]:
get_set_length(['wow', 'wow', 'how'])

2

In [113]:
df_new['lem_set_length'] = df_new['lem_words'].swifter.apply(get_set_length)

Pandas Apply: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 81275/81275 [00:00<00:00, 88447.20it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [114]:
df_new.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,helpfulness_score,user_deviation,review_delay,stem_words,lem_words,stem_sim_words,lem_sim_words,lem_set_length,lem_sim_length,lem_sim
0,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010",0.8,1.399189,6652800,"[im, profession, otr, truck, driver, bought, t...","[im, professional, otr, truck, driver, bought,...",[5],"[rand, route, weight, take, time, length, gps,...",160,13,12.307692
1,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010",0.955556,0.624437,0,"[well, say, ive, unit, truck, four, day, prior...","[well, say, ive, unit, truck, four, day, prior...",[],"[dock, feature, car, determine, home, rand, pa...",241,23,10.478261
2,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010",0.9,0.510375,6566400,"[go, write, long, review, even, thought, unit,...","[going, write, long, review, even, thought, un...",[],"[screen, road, rand, route, truck, feature, tr...",159,16,9.9375
3,A3QH8VQDE7HZCR,972683275,costaricachris,"[15, 19]",Quality was excellent. Instructions were clear...,5.0,Real value for the money,1286236800,"10 5, 2010",0.789474,0.442171,2678400,"[qualiti, excel, instruct, clear, clear, tilt,...","[quality, excellent, instruction, clear, clear...",[],"[level, hardware]",31,2,15.5
4,A38FGQVJM18OWV,972683275,"George S. Mitchell ""gsmitchell""","[8, 18]",I checked around Amazon as well as some other ...,1.0,What a piece of junk!,1291161600,"12 1, 2010",0.444444,1.616074,7603200,"[check, around, amazon, well, site, decid, nee...","[checked, around, amazon, well, site, decided,...",[],"[tv, mount]",52,2,26.0


In [115]:
df_new['lem_sim_length'] = df_new['lem_sim_words'].swifter.apply(len)

Pandas Apply: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 81275/81275 [00:00<00:00, 571985.73it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [125]:
def get_lem_sim(lem_length,sim_length):
    return sim_length/lem_length

In [126]:
df_new['lem_sim'] = df_new.swifter.apply(lambda row: get_lem_sim(row['lem_set_length'],row['lem_sim_length']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [127]:
df_new.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,helpfulness_score,user_deviation,review_delay,stem_words,lem_words,stem_sim_words,lem_sim_words,lem_set_length,lem_sim_length,lem_sim
0,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010",0.8,1.399189,6652800,"[im, profession, otr, truck, driver, bought, t...","[im, professional, otr, truck, driver, bought,...",[5],"[rand, route, weight, take, time, length, gps,...",160,13,0.08125
1,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010",0.955556,0.624437,0,"[well, say, ive, unit, truck, four, day, prior...","[well, say, ive, unit, truck, four, day, prior...",[],"[dock, feature, car, determine, home, rand, pa...",241,23,0.095436
2,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010",0.9,0.510375,6566400,"[go, write, long, review, even, thought, unit,...","[going, write, long, review, even, thought, un...",[],"[screen, road, rand, route, truck, feature, tr...",159,16,0.100629
3,A3QH8VQDE7HZCR,972683275,costaricachris,"[15, 19]",Quality was excellent. Instructions were clear...,5.0,Real value for the money,1286236800,"10 5, 2010",0.789474,0.442171,2678400,"[qualiti, excel, instruct, clear, clear, tilt,...","[quality, excellent, instruction, clear, clear...",[],"[level, hardware]",31,2,0.064516
4,A38FGQVJM18OWV,972683275,"George S. Mitchell ""gsmitchell""","[8, 18]",I checked around Amazon as well as some other ...,1.0,What a piece of junk!,1291161600,"12 1, 2010",0.444444,1.616074,7603200,"[check, around, amazon, well, site, decid, nee...","[checked, around, amazon, well, site, decided,...",[],"[tv, mount]",52,2,0.038462


In [130]:
df_new['helpfulness_score'].corr(df_new['lem_sim_length'])

0.17191611968338297

In [121]:
df_new['lem_sim'].isnull().values.ravel().sum()

0

In [124]:
df_new.tail()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,helpfulness_score,user_deviation,review_delay,stem_words,lem_words,stem_sim_words,lem_sim_words,lem_set_length,lem_sim_length,lem_sim
107497,APRNS6DB68LLV,B00L26YDA4,"Rob Slaven ""slavenrm@gmail. com""","[5, 10]",As usual I received this item for free in exch...,5.0,Works brilliantly once you understand what it ...,1404172800,"07 1, 2014",0.5,0.597583,432000,"[usual, receiv, item, free, exchang, review, a...","[usual, received, item, free, exchange, review...",[],"[receiver, speaker, get, wifi, need]",82,5,16.4
107498,A3A4ZAIBQWKOZS,B00L26YDA4,Stephen M. Lerch,"[10, 15]",Want to add wireless audio streaming to your h...,5.0,Smart,1403827200,"06 27, 2014",0.666667,0.546834,86400,"[want, add, wireless, audio, stream, home, the...","[want, add, wireless, audio, streaming, home, ...",[],"[receiver, music, connection, wifi, sabrent, s...",129,9,14.333333
107499,A18R0PC372EGL,B00L3YHF6O,Blues,"[7, 11]",Wow...that's about all I have to say. Having ...,5.0,To use a technical term...Wow!,1404950400,"07 10, 2014",0.636364,0.448464,259200,"[wowthat, say, prior, experi, bt, speaker, rel...","[wowthats, say, prior, experience, bt, speaker...",[],"[fidelity, pair, performance, sound, roar]",99,5,19.8
107500,A3OOQH73VQ97VN,B00L3YHF6O,"Jem ""Microsoft""","[5, 12]",Update 14 July 2014: they have a white paper a...,5.0,Purely Cons Review,1404777600,"07 8, 2014",0.416667,0.742179,86400,"[updat, 14, juli, 2014, white, paper, creative...","[update, 14, july, 2014, white, paper, creativ...",[g],"[sbx, control, audio, pair, power, panel, roar]",414,7,59.142857
107501,A3A4ZAIBQWKOZS,B00L3YHF6O,Stephen M. Lerch,"[18, 23]",My short review:If you have the money to spend...,5.0,Best sounding speaker at this price range,1404691200,"07 7, 2014",0.782609,0.546834,0,"[short, reviewif, money, spend, speaker, dont,...","[short, reviewif, money, spend, speaker, dont,...","[n, w]","[control, audio, pc, connection, pair, line, p...",473,14,33.785714
