In [1]:
import pandas as pd
import numpy as np

In [2]:
## Stemming and Lemmatization

In [3]:
# install the nltk (natural language toolkit) library
!pip install nltk



In [5]:
# training data
# sentences
X_train = ['I love the book', 
           'This is a great book',
          'The fit is great',
          'I love the shoes']
# topics of sentences
y_train = ['books', 'books', 'clothings', 'clothings']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [9]:
vect = CountVectorizer()
vect.fit(X_train)
# document term matrix. Matrix with the counts
X_train_dtm = vect.transform(X_train)
pd.DataFrame(X_train_dtm.toarray(), 
             columns=vect.get_feature_names_out())

Unnamed: 0,book,fit,great,is,love,shoes,the,this
0,1,0,0,0,1,0,1,0
1,1,0,1,1,0,0,0,1
2,0,1,1,1,0,0,1,0
3,0,0,0,0,1,1,1,0


In [13]:
X_test = ['I like the book', 
          'Shoes are alright', 
          'I love the books',
         'I lost a shoe']

In [20]:
nb_clf = MultinomialNB()
nb_clf.fit(X=X_train_dtm, y=y_train)

In [22]:
X_test_dtm = vect.transform(X_test)
nb_clf.predict(X_test_dtm)

array(['books', 'clothings', 'clothings', 'books'], dtype='<U9')

## Stemming

In [50]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Derrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Derrick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Derrick\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Derrick\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Derrick\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [24]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [29]:
# initialize the stemmer
stemmer = PorterStemmer()
print(stemmer.stem('books'))
print(stemmer.stem('reading'))

book
read


In [33]:
phrase = 'I love the books'
words = word_tokenize(phrase)

In [36]:
stemmed_words = [stemmer.stem(word) for word in words] # some nice python syntax
stemmed_words

['i', 'love', 'the', 'book']

In [37]:
' '.join(stemmed_words)

'i love the book'

## Lemmatization

In [46]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [47]:
lemmatizer = WordNetLemmatizer()

In [48]:
# needs parts of speech
lemmatizer.lemmatize('eats', pos='v') # v for verb

'eat'

In [49]:
words

['I', 'love', 'the', 'books']

In [51]:
nltk.pos_tag(words) # tells if a word is a verb (VBP) or a noun (NNS)

[('I', 'PRP'), ('love', 'VBP'), ('the', 'DT'), ('books', 'NNS')]

In [55]:
# parts of speech processing function
def process_pos(pos):
    if pos.startswith('J'): # adjective
        return wordnet.ADJ
    elif pos.startswith('V'): # verb
        return wordnet.VERB
    elif pos.startswith('N'): # noun
        return wordnet.NOUN
    elif pos.startswith('R'): #adverb
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [59]:
nltk.pos_tag(words)

[('I', 'PRP'), ('love', 'VBP'), ('the', 'DT'), ('books', 'NNS')]

In [60]:
lemmatized_words = [lemmatizer.lemmatize(word, pos=process_pos(pos))
                   for word,pos in nltk.pos_tag(words)]

In [61]:
phrase = 'example of sentence removal of stopwords'

In [70]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

words = word_tokenize(phrase)
stripped_phrase = [word for word in words if word not in stop_words]
' '.join(stripped_phrase)

'Hello ! ?'

## Punctuation Removal

In [74]:

import string
punctuation = [punc for punc in string.punctuation]
punctuation

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [75]:
phrase = "Hello! how are you?"
words = word_tokenize(phrase)
stripped_phrease = [word for word in words if word not in punctuation]
' '.join(stripped_phrase)

'Hello ! ?'

## Yelp reviews


In [76]:
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/yelp.csv'
yelp = pd.read_csv(url)[['text','stars']]
yelp.head()

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5


In [77]:
yelp.stars.value_counts()

4    3526
5    3337
3    1461
2     927
1     749
Name: stars, dtype: int64

In [83]:
# Filter to 5- and 1- star reviews
yelp = yelp.loc[yelp.stars.isin([1,5])]
yelp.tail(5)

Unnamed: 0,text,stars
9990,Yes I do rock the hipster joints. I dig this ...,5
9991,Only 4 stars? \n\n(A few notes: The folks that...,5
9992,I'm not normally one to jump at reviewing a ch...,5
9994,Let's see...what is there NOT to like about Su...,5
9999,4-5 locations.. all 4.5 star average.. I think...,5


In [96]:
text = yelp.loc[yelp.stars==5].iloc[0].text
text

'My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!'

In [93]:
# Process the text
text = yelp.loc[0,'text']
words = word_tokenize(text)
words = [word.lower() for word in words]
lemmatized_words = [lemmatizer.lemmatize(word,pos=process_pos(pos))
                   for word,pos in nltk.pos_tag(words)
                   if word not in stop_words and word not in punctuation
                   ]
' '.join(lemmatized_words)

"wife take birthday breakfast excellent weather perfect make sit outside overlook ground absolute pleasure waitress excellent food arrive quickly semi-busy saturday morning look like place fill pretty quickly early get good favor get bloody mary phenomenal simply best 've ever 'm pretty sure use ingredient garden blend fresh order amaze everything menu look excellent white truffle scramble egg vegetable skillet tasty delicious come 2 piece griddle bread amaze absolutely make meal complete best `` toast '' 've ever anyway ca n't wait go back"

In [None]:
yelp['processed_text'] = yelp.text

In [97]:
def process_text(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word,pos=process_pos(pos))
                       for word,pos in nltk.pos_tag(words)
                       if word not in stop_words and word not in punctuation
                       ]
    return(' '.join(lemmatized_words))

In [98]:
process_text(text)

"wife take birthday breakfast excellent weather perfect make sit outside overlook ground absolute pleasure waitress excellent food arrive quickly semi-busy saturday morning look like place fill pretty quickly early get good favor get bloody mary phenomenal simply best 've ever 'm pretty sure use ingredient garden blend fresh order amaze everything menu look excellent white truffle scramble egg vegetable skillet tasty delicious come 2 piece griddle bread amaze absolutely make meal complete best `` toast '' 've ever anyway ca n't wait go back"

In [100]:
yelp['processed_text'] = yelp.text.apply(process_text)

In [101]:
yelp

Unnamed: 0,text,stars,processed_text
0,My wife took me here on my birthday for breakf...,5,wife take birthday breakfast excellent weather...
1,I have no idea why some people give bad review...,5,idea people give bad review place go show plea...
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5,rosie dakota love chaparral dog park 's conven...
4,General Manager Scott Petello is a good egg!!!...,5,general manager scott petello good egg go deta...
6,Drop what you're doing and drive here. After I...,5,drop 're drive eat go back next day food good ...
...,...,...,...
9990,Yes I do rock the hipster joints. I dig this ...,5,yes rock hipster joint dig place little bit sc...
9991,Only 4 stars? \n\n(A few notes: The folks that...,5,4 star note folk rat place low must isolate in...
9992,I'm not normally one to jump at reviewing a ch...,5,'m normally one jump review chain restaurant e...
9994,Let's see...what is there NOT to like about Su...,5,let 's see ... like surprise stadium well 9.50...


In [129]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB # or any other classifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


In [135]:
X = yelp.processed_text
y = yelp.stars

In [136]:
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [151]:
pipe = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(max_features = 1000, ngram_range=(1,2))), # idk what ngram range is. less features better for TfidVectorizer
    ('clf', MultinomialNB())
])

In [152]:
pipe.fit(X_train,y_train)

In [153]:
y_test_pred = pipe.predict(X_test)
confusion_matrix(y_test,y_test_pred)

array([[ 61, 138],
       [  1, 822]], dtype=int64)

## Grid search on TfidfVectorizer

In [154]:
params_dic = {'vectorizer__max_features':[500,1000,2000,4000],
             'vectorizer__ngram_range': [(1,1), (1,2)],
             'vectorizer__use_idf': [False, True], # False (CountVectorizer), True (TfidfVectorizer)
              'clf__alpha': [0.1,0.25,0.5,0.75],
             }

In [None]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, params_dic, cv=5, n_jobs=-1, scoring='roc_auc', verbose=2)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


## How does the Naive Bayes model choose 5 stars and 1 star?

In [110]:
# store the vocabulary
words = pipe['vectorizer'].get_feature_names_out()


['00',
 '000',
 '07',
 '10',
 '10 15',
 '10 min',
 '10 minute',
 '10 year',
 '100',
 '101',
 '11',
 '11am',
 '12',
 '120',
 '13',
 '14',
 '140',
 '15',
 '15 20',
 '15 minute',
 '15 year',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1st',
 '20',
 '20 minute',
 '20 year',
 '200',
 '2010',
 '2011',
 '2012',
 '21',
 '22',
 '24',
 '25',
 '2am',
 '2nd',
 '30',
 '30 min',
 '30 minute',
 '30 year',
 '30pm',
 '32',
 '35',
 '3rd',
 '40',
 '40 minute',
 '45',
 '45 minute',
 '49',
 '4pm',
 '50',
 '500',
 '5th',
 '60',
 '65',
 '6pm',
 '70',
 '75',
 '7th',
 '80',
 '90',
 '90 minute',
 '95',
 '99',
 'ability',
 'able',
 'able get',
 'able make',
 'absolute',
 'absolute favorite',
 'absolutely',
 'absolutely amazing',
 'absolutely delicious',
 'absolutely love',
 'ac',
 'accent',
 'accept',
 'accessory',
 'accommodate',
 'accomodating',
 'accompany',
 'accord',
 'account',
 'acknowledge',
 'across',
 'across street',
 'act',
 'action',
 'active',
 'activity',
 'actual',
 'actually',
 'actually get',
 'actua

In [111]:
pipe['clf'].classes_

array([1, 5], dtype=int64)

In [115]:
# number of times each word appears across all 1 star doc
bad_word_count = pipe['clf'].feature_count_[0,:]
good_word_count = pipe['clf'].feature_count_[1,:]

In [118]:
words_df = pd.DataFrame({'word':words,
                        'bad':bad_word_count,
                         'good':good_word_count}).set_index('word')
words_df

Unnamed: 0_level_0,bad,good
word,Unnamed: 1_level_1,Unnamed: 2_level_1
00,32.0,32.0
000,4.0,8.0
07,2.0,5.0
10,73.0,129.0
10 15,2.0,6.0
...,...,...
zero,13.0,6.0
zinburger,0.0,9.0
zone,1.0,6.0
zoo,0.0,7.0


In [119]:
# Add 1 to the columns
words_df = words_df + 1

In [120]:
words_df

Unnamed: 0_level_0,bad,good
word,Unnamed: 1_level_1,Unnamed: 2_level_1
00,33.0,33.0
000,5.0,9.0
07,3.0,6.0
10,74.0,130.0
10 15,3.0,7.0
...,...,...
zero,14.0,7.0
zinburger,1.0,10.0
zone,2.0,7.0
zoo,1.0,8.0


In [121]:
# convert counts into frequencies
words_df.bad = words_df.bad/words_df.bad.sum()
words_df.good = words_df.good/words_df.good.sum()

In [123]:
# ratios
words_df['bad_ratio'] = words_df.bad/words_df.good
words_df['good_ratio'] = words_df.good/words_df.bad

In [126]:
words_df.sort_values(by='good_ratio', ascending=False).head(20)

Unnamed: 0_level_0,bad,good,bad_ratio,good_ratio
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fantastic,4e-05,0.001303,0.030478,32.810409
perfect,7.9e-05,0.001565,0.050755,19.702329
yum,2e-05,0.000358,0.055514,18.013558
outstanding,2e-05,0.000313,0.063444,15.761863
mozzarella,2e-05,0.000275,0.072297,13.831839
pasty,2e-05,0.000262,0.075824,13.188498
lamb,2e-05,0.000262,0.075824,13.188498
gem,2e-05,0.000256,0.077719,12.866827
favorite,0.000199,0.002498,0.079508,12.577324
bianco,2e-05,0.000249,0.079712,12.545157


In [127]:
# find some text with 'mozzerella'
yelp.loc[yelp.processed_text.str.contains('mozzarella')]

Unnamed: 0,text,stars,processed_text
30,"Disclaimer: Like many of you, I am a sucker fo...",5,disclaimer like many sucker charm little home ...
138,I'm from Chicago so I'm picky with my pizza--t...,5,'m chicago 'm picky pizza -- place right ny st...
205,"On one of my many visits to see mi amore, he t...",5,one many visit see mi amore take fantastic lit...
452,WOW this place is good! SO good! And not jus...,5,wow place good good yummy good intrinsically g...
816,I love this place! I love that it's to-go only...,5,love place love 's to-go 3 hour wait like pizz...
913,"The new Harkins Cine Capri, one of the first t...",5,new harkins cine capri one first thing open te...
1189,I have been to this place many times and the f...,5,place many time food service always great exci...
1222,"Honestly, this is the best pizza that I've had...",5,honestly best pizza 've arizona 'm sucker wood...
1611,Great product! I was on a mission to make home...,5,great product mission make homemade mozzarella...
1650,"Delicious food, amazing martini's, and wonderf...",5,delicious food amaze martini 's wonderful dess...
