In [1]:
import pandas as pd 

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#this is sample data
from nltk.corpus import names  

from string import punctuation



In [2]:
file = 'datasets/women_clothing_review.csv'
clothes_na = pd.read_csv(file)

In [3]:
clothes_na.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
clothes = clothes_na.dropna(subset = ['Review Text'])

In [5]:
def clean_text(review): 
    '''get non-filler words one review at a time'''
    review = str(review)
    review = review.strip()
    review = review.lower()
    rw_tks = word_tokenize(review)
    
    for token in rw_tks: 
        if token in punctuation: 
            rw_tks.remove(token)
            
    clean_review = []
    
    stopwords_eng = stopwords.words('english')
    
    for token in rw_tks: 
        if token not in stopwords_eng: 
            clean_review.append(token)
            
    review = ' '.join(clean_review)
    return review 

In [6]:
clean_reviews = clothes['Review Text'].apply(clean_text)

In [7]:
clean_reviews.head()


0          absolutely wonderful silky sexy comfortable
1    love dress 's sooo pretty happened find store ...
2    high hopes dress really wanted work initially ...
3    love love love jumpsuit 's fun flirty fabulous...
4    shirt flattering due adjustable front tie perf...
Name: Review Text, dtype: object

In [8]:
def sentiment_val(review): 
    review = clean_text(review)
    sid = SentimentIntensityAnalyzer()
    
    #get the polarity scores
    sid_rev = sid.polarity_scores(review)
    
    #get the compound score 
    r_comp = sid_rev['compound']
    return r_comp
    

In [9]:
clean_reviews = clothes['Review Text'].apply(sentiment_val)

In [10]:
clean_reviews[:10]

0    0.8991
1    0.9710
2    0.9062
3    0.9464
4    0.9117
5    0.9153
6    0.6361
7    0.5709
8    0.7579
9    0.9643
Name: Review Text, dtype: float64

In [11]:
clothes['Sentiment Value'] = clean_reviews 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
clothes.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Sentiment Value
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,0.8991
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,0.971
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.9062
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,0.9464
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.9117


In [13]:
def sen_bins(sen_val): 
    sentiment = 0 
    if sen_val > 0.5: 
        sentiment = 'positive' 
    elif sen_val < -0.3: 
        sentiment = 'negative'
    else: 
        sentiment = 'neutral' 
    return sentiment
    

In [14]:
clothes['Sentiment Category'] = clothes['Sentiment Value'].apply(sen_bins)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
clothes.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Sentiment Value,Sentiment Category
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,0.8991,positive
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,0.971,positive
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.9062,positive
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,0.9464,positive
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.9117,positive


In [16]:
clothes['Sentiment Category'].value_counts()

positive    20024
neutral      2192
negative      425
Name: Sentiment Category, dtype: int64