# Text Analysis

In this module, we will use the Natural Language Toolkit Library (NLTK) to look at individual words and sentences in a text and clean unneccessary features from the text data to prepare for sentiment analysis. Then using the textblob library, we will analyze the sentiment of opinioned data to give a numerical value for use in a predictive model.

The NLTK library was built to separate punctuation from words when tokenizing (splitting into parts).

In [29]:
#import libraries
import pandas as pd
import numpy as np


import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#this is sample data
from nltk.corpus import names  

from string import punctuation

#if the next cell does not work
#remove number symbol on following lines and re-run this cell
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('names')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [30]:
df = pd.read_csv("./datasets/Womens Clothing E-Commerce Reviews.csv")

In [31]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [32]:
df.isnull().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [34]:
df["Review Text"].head()

0    Absolutely wonderful - silky and sexy and comf...
1    Love this dress!  it's sooo pretty.  i happene...
2    I had such high hopes for this dress and reall...
3    I love, love, love this jumpsuit. it's fun, fl...
4    This shirt is very flattering to all due to th...
Name: Review Text, dtype: object

In [None]:
#df = df.dropna(subset = df["Review Text"])  #Not working

In [39]:
df["Review Text"] = df[["Review Text"]].dropna()

In [40]:
df.isnull().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [42]:
df[df["Review Text"].isnull()].head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
92,92,861,23,,,5,1,0,General Petite,Tops,Knits
93,93,1081,31,,,5,1,0,General,Dresses,Dresses
98,98,1133,50,,,5,1,0,General,Jackets,Outerwear
135,135,861,35,,,4,1,0,General Petite,Tops,Knits
142,142,1126,35,,,5,1,0,General,Jackets,Outerwear


In [43]:
df_nonull = df[df["Review Text"].notnull()]

In [44]:
df_nonull.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [46]:
df_nonull.isnull().sum()
df = df_nonull

In [41]:
#df["Review Text"].isnull().sum()

845

In [47]:
df.tail(15)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
23469,23469,262,50,Comfy and cute,My size was not available so based on reviews ...,4,1,0,General Petite,Intimate,Lounge
23471,23471,262,31,Awkward fit for me,"Love the way these pants look in the pictures,...",4,1,0,General Petite,Intimate,Lounge
23472,23472,855,32,Perfectly drapey,I saw the shirt on the retailer website and ne...,5,1,0,General,Tops,Knits
23473,23473,1104,29,Perfect dress,Great quality and extremely flattering. bonus ...,5,1,1,General Petite,Dresses,Dresses
23474,23474,1104,32,Much better in person!,"Yes, this is a great dress! i wasn't sure abou...",5,1,0,General Petite,Dresses,Dresses
23475,23475,1104,41,Cute dress,Cute dress but not for me. the waist is too h...,3,1,0,General Petite,Dresses,Dresses
23476,23476,522,27,Cheeky!,These bottoms are very cute but defiantly chee...,4,1,0,Initmates,Intimate,Swim
23477,23477,1094,39,Entrancing,I'm so impressed with the beautiful color comb...,4,1,5,General Petite,Dresses,Dresses
23478,23478,1104,32,Unflattering,I was surprised at the positive reviews for th...,1,0,0,General Petite,Dresses,Dresses
23479,23479,1005,42,What a fun piece!,So i wasn't sure about ordering this skirt bec...,5,1,0,General Petite,Bottoms,Skirts


In [48]:
eng_stopwords = stopwords.words('english')
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [49]:
#initilize function to do sentiment analysis
sid = SentimentIntensityAnalyzer()

In [50]:
 #create a function to clean up each review
#then it will analyze and assign a sentiment polarity
def reviewSentiment(review):
    
    #make text lowercase
    review = review.lower()
    
    #tokenize the review
    tknz_review = word_tokenize(review)
    
    #remove puntuation
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
    
    clean_tokens = []
    #remove filler words
    for token in tknz_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    #put sentence back together with remaining clean words
    clean_review = ' '.join(clean_tokens)
    #clean_review = ' '.join(tknz_review)
    
    #turn into textblob
    sid_rev = sid.polarity_scores(clean_review)
    
    #get sentiment polarity
    r_comp = sid_rev['compound']
    
    return r_comp

In [51]:
df

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
6,6,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits
7,7,858,39,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...",4,1,4,General Petite,Tops,Knits
8,8,1077,24,Flattering,I love this dress. i usually get an xs but it ...,5,1,0,General,Dresses,Dresses
9,9,1077,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,1,0,General,Dresses,Dresses


In [52]:
 #create a new column to hold sentiment value from function
df['review_sentiment'] = df["Review Text"].apply(reviewSentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [54]:
#sample the first 5 items in m_names
#m_names[:5]
df[:5]

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,review_sentiment
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,0.8991
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,0.971
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.9062
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,0.9464
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.9117


In [None]:
#make a frequency distribution of names that end with a particular letter (by gender)
#cfd = nltk.ConditionalFreqDist(
#            (fileid, name[-1])
#            for fileid in names.fileids()
#            for name in names.words(fileid))

#cfd.plot()

### Sentiment Analysis

In order to understand how people feel about something, we need to do sentiment analysis on text data that contains their opinion.

In [None]:
#initilize function to do sentiment analysis
sid = SentimentIntensityAnalyzer()

In [None]:
myday = "Today is a great day, but it is boring"

In [None]:
sid.polarity_scores(myday)

In [None]:
#extract the sentiment value from the dictionary of scores
sid.polarity_scores(myday)['compound']

In [None]:
vd_comp = sid.polarity_scores(myday)['compound']
type(vd_comp)

#### Make a sentiment value column in a dataframe

Using the [Amazon Book Reviews dataset on Kaggle](https://www.kaggle.com/shrutimehta/amazon-book-reviews-webscraped), we add a new column to the dataset that will have a numerical value for the sentiment of each review.

In [None]:
import pandas as pd

#load the data from the Reviews.csv file
filepath = "Reviews.csv"
df = pd.read_csv(filepath, encoding = "latin-1") #this file is encoded differently

df.head()

In [None]:
#create a function to clean up each review
#then it will analyze and assign a sentiment polarity
def reviewSentiment(review):
    
    #make text lowercase
    review = review.lower()
    
    #tokenize the review
    tknz_review = word_tokenize(review)
    
    #remove puntuation
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
    
    clean_tokens = []
    #remove filler words
    for token in tknz_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    #put sentence back together with remaining clean words
    clean_review = ' '.join(clean_tokens)
    #clean_review = ' '.join(tknz_review)
    
    #turn into textblob
    sid_rev = sid.polarity_scores(clean_review)
    
    #get sentiment polarity
    r_comp = sid_rev['compound']
    
    return r_comp

In [None]:
#create a new column to hold sentiment value from function
df['review_sentiment'] = df['ReviewContent'].apply(reviewSentiment)

In [None]:
#erify sentiment values in new column
df.head()

In [None]:
#create a function to assign a polarity category to the sentiment
def sentimentCategory(sent_num):
    if sent_num >= 0.2:
        return "positive"
    if sent_num <= -0.2:
        return "negative"
    else:
        return "neutral"

In [None]:
#create a new column to hold sentiment category
df['sentiment_category'] = df['review_sentiment'].apply(sentimentCategory)

In [None]:
df.head()

In [None]:
df['ReviewContent'].iloc[0]

In [None]:
df['ReviewContent'].iloc[4]

In [None]:
#compare frequency of positive, negative, and neutral reviews
df['sentiment_category'].value_counts()

Overall, it seems that most readers feel so-so about the book (maybe some good parts and some bad parts) and some readers really like the book.