In [28]:
import pandas as pd
# import the Natural Language Toolkit (NLTK) and download the resources used in this notebook
import nltk 
nltk.download('wordnet') # lexical database used by WordNetLemmatizer
nltk.download('stopwords') # stopword lists read through nltk.corpus.stopwords
nltk.download('punkt') # tokenizer models used by word_tokenize
nltk.download('averaged_perceptron_tagger') # tagger model used by nltk.pos_tag

[nltk_data] Downloading package wordnet to C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Stemming & Lemmatization

Stemming and lemmatization are techniques to normalize text by reducing each word to a common base form.

reading -> read

Books -> book

Stories -> stori (from stemming) or story (from lemmatization)

More info [here](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)

## The problem with the bag-of-words approach

In [2]:
# toy training data: each review text is written next to its product category,
# then the pairs are split into parallel lists of texts and labels
X_train, y_train = map(list, zip(
    ('I love the book', 'books'),
    ('This is a great book', 'books'),
    ('The fit is great', 'clothings'),
    ('I love the shoes', 'clothings'),
))

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# learn the vocabulary from the training phrases, then count each vocabulary word per phrase
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
feature_names = (vect.get_feature_names_out() if hasattr(vect, 'get_feature_names_out')
                 else vect.get_feature_names())
# display the document-term matrix: one row per training phrase, one column per vocabulary word
# (the original referenced an undefined name `x_dtm` here, which fails on a fresh kernel)
pd.DataFrame(data=X_train_dtm.toarray(),columns=feature_names,index=X_train)

Unnamed: 0,book,fit,great,is,love,shoes,the,this
I love the book,1,0,0,0,1,0,1,0
This is a great book,1,0,1,1,0,0,0,1
The fit is great,0,1,1,1,0,0,1,0
I love the shoes,0,0,0,0,1,1,1,0


In [11]:
# train a multinomial naive Bayes model on the toy data
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB()
# X_train_dtm holds the per-phrase word counts, y_train the category label of each phrase
nb_clf.fit(X_train_dtm,y_train)

MultinomialNB()

In [13]:
# toy testing data 
X_test = ['I like the book','Shoes are alright','I love the books','I lost a shoe']
# reuse the vectorizer that was fitted on X_train (no refit), so only the training vocabulary is counted
X_test_dtm = vect.transform(X_test)
# predict the category of each test phrase
nb_clf.predict(X_test_dtm)

array(['books', 'clothings', 'clothings', 'books'], dtype='<U9')

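The predictions for 'I love the books' and 'I lost a shoe' are wrong. Why? Because the vocabulary was learned from the training data, which contains 'book' and 'shoes' but not 'books' or 'shoe', so the model has never seen those two words.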

## Stemming

In [15]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [17]:
# initialize the stemmer
stemmer = PorterStemmer()

In [18]:
stemmer.stem('reading')

'read'

In [19]:
stemmer.stem('books')

'book'

In [20]:
# organize, organizes, and organizing
stemmer.stem('organize')

'organ'

In [21]:
stemmer.stem('organizes')

'organ'

In [22]:
stemmer.stem('organizing')

'organ'

The tokenizer breaks a sentence into its individual tokens (words and punctuation marks).

In [16]:
phrase = 'I love the books.'
words = word_tokenize(phrase)
words

['I', 'love', 'the', 'books', '.']

In [23]:
# apply the stemmer to every token of the tokenized phrase
stemmed_words = list(map(stemmer.stem, words))
stemmed_words

['I', 'love', 'the', 'book', '.']

In [34]:
' '.join(stemmed_words)

'I love the book .'

## Lemmatization

In [24]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [25]:
lemmatizer = WordNetLemmatizer()

The lemmatizer needs to know each token's part of speech; by default, it treats every token as a noun.

In [26]:
lemmatizer.lemmatize('eats', pos='v')

'eat'

In [27]:
lemmatizer.lemmatize('ate', pos='v')

'eat'

In [29]:
# parts of speech tagging
pos_list = nltk.pos_tag(words)
pos_list

[('I', 'PRP'), ('love', 'VBP'), ('the', 'DT'), ('books', 'NNS'), ('.', '.')]

In [31]:
def process_pos(pos):
    """Map a POS tag string to the WordNet part-of-speech constant used by the lemmatizer.

    Tags starting with 'J' map to wordnet.ADJ, 'V' to wordnet.VERB and
    'R' to wordnet.ADV. Every other tag, nouns included, maps to wordnet.NOUN.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),   # adjectives
        ('V', wordnet.VERB),  # verbs
        ('R', wordnet.ADV),   # adverbs
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos.startswith(prefix):
            return wordnet_pos
    # nouns and all remaining tags fall back to noun
    return wordnet.NOUN

In [33]:
# lemmatize each token, passing the WordNet part of speech derived from its POS tag
lemmatized_words = []
for token, tag in nltk.pos_tag(words):
    lemmatized_words.append(lemmatizer.lemmatize(token, pos=process_pos(tag)))
lemmatized_words

['I', 'love', 'the', 'book', '.']

In [35]:
' '.join(lemmatized_words)

'I love the book .'

## Stopwords Removal

Stopwords are the most common words in English: this, that, he, it, ... They add little meaning to a sentence, so they are often removed before modeling.

In [36]:
from nltk.corpus import stopwords

In [37]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [38]:
phrase = 'here is an example sentence demostrating the removal of stopwords'
phrase

'here is an example sentence demostrating the removal of stopwords'

In [39]:
# tokenize the phrase, then keep only the tokens that are not stopwords
words = word_tokenize(phrase)
stripped_phrase = list(filter(lambda token: token not in stop_words, words))
" ".join(stripped_phrase)

'example sentence demostrating removal stopwords'

## Punctuation removal

In [40]:
import string
# string.punctuation is one string; split it into a list of single-character strings
punctuation = list(string.punctuation)
punctuation

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [41]:
phrase = 'Hello! How are you?'
words = word_tokenize(phrase)
# collect the tokens that are not punctuation marks, preserving their order
stripped_phrase = []
for token in words:
    if token not in punctuation:
        stripped_phrase.append(token)
" ".join(stripped_phrase)

'Hello How are you'

## Yelp reviews

In [42]:
# load the Yelp reviews CSV from GitHub, keeping only the review text and the star rating
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/yelp.csv'
yelp = pd.read_csv(url)[['text','stars']]
yelp.head()

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5


In [43]:
# keep only the 1-star and 5-star reviews
# `stars` is an integer column, so match against integers: matching against the strings
# '1' and '5' relies on pandas coercing them to the column dtype, which is version-dependent
# and can silently select no rows
yelp = yelp[yelp.stars.isin([1, 5])].reset_index(drop=True)
yelp

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
3,General Manager Scott Petello is a good egg!!!...,5
4,Drop what you're doing and drive here. After I...,5
...,...,...
4081,Yes I do rock the hipster joints. I dig this ...,5
4082,Only 4 stars? \n\n(A few notes: The folks that...,5
4083,I'm not normally one to jump at reviewing a ch...,5
4084,Let's see...what is there NOT to like about Su...,5


In [44]:
yelp['processed_text'] = yelp['text']

In [45]:
# tokens to discard: stopwords and punctuation marks, merged into one set so the
# per-token membership test does not scan two lists
ignored_tokens = set(stop_words) | set(punctuation)

def lemmatize_text(text):
    """Return `text` as a space-joined string of lowercased, lemmatized tokens.

    The text is tokenized and lowercased, the full token sequence is POS-tagged,
    and only then are stopwords and punctuation marks dropped; each remaining
    token is lemmatized with the WordNet part of speech given by process_pos.
    """
    tokens = [token.lower() for token in word_tokenize(text)]
    lemmas = [lemmatizer.lemmatize(token, pos=process_pos(tag))
              for token, tag in nltk.pos_tag(tokens)
              if token not in ignored_tokens]
    return ' '.join(lemmas)

# process every review in one pass instead of writing row by row with .loc,
# which also removes the dependence on the index being 0..len(yelp)-1
yelp['processed_text'] = yelp['text'].apply(lemmatize_text)

In [46]:
print(yelp.text[0])

My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I've ever had.  I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I've ever had.

Anyway, I can't wait to go back!


In [47]:
print(yelp.processed_text[0])

wife take birthday breakfast excellent weather perfect make sit outside overlook ground absolute pleasure waitress excellent food arrive quickly semi-busy saturday morning look like place fill pretty quickly early get good favor get bloody mary phenomenal simply best 've ever 'm pretty sure use ingredient garden blend fresh order amaze everything menu look excellent white truffle scramble egg vegetable skillet tasty delicious come 2 piece griddle bread amaze absolutely make meal complete best `` toast '' 've ever anyway ca n't wait go back


In [48]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

In [50]:
# target: the star rating of each review
y = yelp.stars
# features: the lemmatized review text
X = yelp.processed_text

In [51]:
# hold out part of the reviews for testing; fixing random_state makes the split,
# and therefore the scores computed from it, reproducible across runs
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [53]:
# two-step pipeline: turn the text into counts, then classify the counts
pipe = Pipeline(steps=[
    # counts of unigrams and bigrams (ngram_range=(1,2)), capped at 5000 features
    ('vect', CountVectorizer(max_features=5000,ngram_range=(1,2))), 
    # multinomial naive Bayes classifier fed with those counts
    ('clf', MultinomialNB()) 
])

In [54]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(max_features=5000, ngram_range=(1, 2))),
                ('clf', MultinomialNB())])

In [55]:
y_test_pred = pipe.predict(X_test)

In [56]:
confusion_matrix(y_test,y_test_pred)

array([[142,  31],
       [ 36, 813]], dtype=int64)

In [57]:
accuracy_score(y_test,y_test_pred)

0.9344422700587084

### How does the model choose between 5-star and 1-star ratings?

In [58]:
# store the vocabulary learned from X_train as a list of feature names
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
words = list(pipe['vect'].get_feature_names_out() if hasattr(pipe['vect'], 'get_feature_names_out')
             else pipe['vect'].get_feature_names())

In [60]:
pipe['clf'].classes_

array([1, 5], dtype=int64)

In [61]:
# rows of feature_count_ follow the order of pipe['clf'].classes_ (1-star first, then 5-star)
# number of times each word appears across all 1-star docs
bad_word_count = pipe['clf'].feature_count_[0,:]
# number of times each word appears across all 5-star docs
good_word_count = pipe['clf'].feature_count_[1,:]

In [62]:
# create a DataFrame of words with their separate 1-star and 5-star counts
# NOTE: this rebinds `words` from the vocabulary to the DataFrame, so the cell that
# stores the vocabulary must be re-run before this cell is run again
words = pd.DataFrame({'word' : words, 'bad' : bad_word_count, 'good' : good_word_count}).set_index('word')
words.head()

Unnamed: 0_level_0,bad,good
word,Unnamed: 1_level_1,Unnamed: 2_level_1
00,31.0,33.0
000,4.0,8.0
00pm,1.0,7.0
07,2.0,5.0
10,78.0,128.0


In [63]:
# add 1 to both count columns to avoid dividing by 0 when the ratios are computed
# NOTE: not idempotent: every re-run of this cell adds another 1 to both columns
words.bad = words.bad+1
words.good = words.good+1

In [64]:
# convert the counts into frequencies: each column is divided by its own total
words.bad = words.bad/words.bad.sum()
words.good = words.good/words.good.sum()
words.head()

Unnamed: 0_level_0,bad,good
word,Unnamed: 1_level_1,Unnamed: 2_level_1
00,0.000655,0.000219
000,0.000102,5.8e-05
00pm,4.1e-05,5.2e-05
07,6.1e-05,3.9e-05
10,0.001617,0.000831


In [65]:
# ratios of the two frequencies (each is the reciprocal of the other)
# bad_ratio > 1: the word is relatively more frequent in 1-star reviews than in 5-star reviews
words['bad_ratio'] = words.bad/words.good
# good_ratio > 1: the word is relatively more frequent in 5-star reviews than in 1-star reviews
words['good_ratio'] = words.good/words.bad

In [66]:
words.sort_values(by='good_ratio', ascending=False).head(20)

Unnamed: 0_level_0,bad,good,bad_ratio,good_ratio
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fantastic,4.1e-05,0.001269,0.032262,30.996308
perfect,6.1e-05,0.001617,0.037982,26.328505
one best,2e-05,0.000438,0.046732,21.398467
yum,2e-05,0.000354,0.057778,17.307583
love place,6.1e-05,0.000921,0.066667,14.999906
fruit,2e-05,0.000303,0.067613,14.790117
favorite,0.000184,0.002461,0.07487,13.356559
pasty,2e-05,0.000271,0.075662,13.2167
dentist,2e-05,0.000258,0.079445,12.587333
ca wait,4.1e-05,0.000496,0.08254,12.115308


In [67]:
words.sort_values(by='bad_ratio', ascending=False).head(20)

Unnamed: 0_level_0,bad,good,bad_ratio,good_ratio
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one star,0.000389,6e-06,60.378158,0.016562
ugh,0.000348,6e-06,54.022563,0.018511
staffperson,0.000348,6e-06,54.022563,0.018511
acknowledge,0.000287,6e-06,44.489169,0.022477
fuck,0.000246,6e-06,38.133574,0.026224
never return,0.000246,6e-06,38.133574,0.026224
rude,0.000962,2.6e-05,37.339124,0.026782
disgust,0.000676,1.9e-05,34.955776,0.028608
fuse,0.000225,6e-06,34.955776,0.028608
service horrible,0.000225,6e-06,34.955776,0.028608
