In [None]:
import pandas as pd
# import NLTK (natural language toolkit)
import nltk 
# Download the NLTK resources used by the cells below.
nltk.download('wordnet')  # WordNet database used by WordNetLemmatizer
nltk.download('stopwords')  # stop-word lists used in the stop-word section
nltk.download('punkt')  # tokenizer models used by word_tokenize
# nltk.pos_tag is called later in this notebook and needs the perceptron
# tagger; without this download those cells fail on a fresh environment.
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases look the tokenizer and tagger up under these names.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4') # open multilingual wordnet library

# Stemming & Lemmatization

Stemming and lemmatization are techniques to normalize text.

reading -> read

Books -> book

Stories -> stori (from stemming) or story (from lemmatization)

More info [here](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)

## The problem with the bag-of-words approach

In [None]:
# toy training data, written as (review text, product category) pairs
labelled_reviews = [
    ('I love the book', 'books'),
    ('This is a great book', 'books'),
    ('The fit is great', 'clothings'),
    ('I love the shoes', 'clothings'),
]
# split the pairs into the feature list and the label list
X_train = [review for review, _ in labelled_reviews]
y_train = [category for _, category in labelled_reviews]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# learn the vocabulary of the toy corpus and count the words of each sentence
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
# show the document-term matrix: one row per sentence, one column per word
pd.DataFrame(X_train_dtm.toarray(),
             index=X_train,
             columns=vect.get_feature_names_out())

In [None]:
# fit a multinomial naive Bayes classifier on the toy document-term matrix
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB().fit(X_train_dtm, y_train)
nb_clf

In [None]:
# toy testing data
X_test = [
    'I like the book',
    'Shoes are alright',
    'I love the books',
    'I lost a shoe',
]

# encode the test sentences with the already-fitted vectorizer, then classify
X_test_dtm = vect.transform(X_test)
nb_clf.predict(X_test_dtm)

The predictions for 'I love the books' and 'I lost a shoe' are wrong. Why? Because the model hasn't seen the words 'books' and 'shoe': they are not in the vocabulary learned from the training data, so the vectorizer ignores them.

## Stemming

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
# initialize the Porter stemmer used by the stemming examples below
stemmer = PorterStemmer()

In [None]:
# stem of 'reading' (the introduction lists reading -> read)
stemmer.stem('reading')

In [None]:
# stem of the plural 'books' (the introduction lists Books -> book)
stemmer.stem('books')

In [None]:
# stem three inflections of the same verb (organize, organizes, organizing),
# one per cell, so their stems can be compared
stemmer.stem('organize')

In [None]:
# second inflection: compare its stem with the one for 'organize'
stemmer.stem('organizes')

In [None]:
# third inflection: compare its stem with the two previous results
stemmer.stem('organizing')

The tokenizer breaks a sentence into its individual tokens (words and punctuation marks).

In [None]:
# sample sentence; its tokens are reused by the stemming and lemmatization cells
phrase = 'I love the books.'
# split the sentence into a list of tokens
words = word_tokenize(phrase)
words

In [None]:
# stem every token of the sample sentence
stemmed_words = list(map(stemmer.stem, words))
stemmed_words

In [None]:
# rebuild a single string from the stemmed tokens, separated by spaces
' '.join(stemmed_words)

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
# WordNet-based lemmatizer used by the lemmatization examples and the Yelp preprocessing
lemmatizer = WordNetLemmatizer()

The lemmatizer expects the part of speech of each token; by default, it treats every token as a noun.

In [None]:
# pos='v' tells the lemmatizer to treat 'eats' as a verb
lemmatizer.lemmatize('eats', pos='v')

In [None]:
# same call with an irregular past tense, again tagged as a verb
lemmatizer.lemmatize('ate', pos='v')

In [None]:
# part-of-speech tagging: pair every token of the sample sentence with its tag
# (the result is iterated as (word, tag) pairs in the cells below)
pos_list = nltk.pos_tag(words)
pos_list

In [None]:
# map a part-of-speech tag to the matching WordNet constant
def process_pos(pos):
    """Translate a part-of-speech tag into a WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: 'J' gives
    ``wordnet.ADJ``, 'V' gives ``wordnet.VERB`` and 'R' gives
    ``wordnet.ADV``. Every other tag, including tags starting with 'N' and
    the empty string, gives ``wordnet.NOUN``.

    Parameters
    ----------
    pos : str
        Tag of a single token.

    Returns
    -------
    The WordNet constant to pass as ``pos`` to the lemmatizer.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),   # adjectives
        ('V', wordnet.VERB),  # verbs
        ('R', wordnet.ADV),   # adverbs
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos.startswith(prefix):
            return wordnet_pos
    # nouns and every unrecognised tag are treated as nouns
    return wordnet.NOUN

In [None]:
# tokens that the next cell tags and lemmatizes
words

In [None]:
# tag each token, convert its tag to a WordNet part of speech, then lemmatize it
lemmatized_words = [
    lemmatizer.lemmatize(token, pos=process_pos(tag))
    for token, tag in nltk.pos_tag(words)
]
lemmatized_words

In [None]:
# rebuild a single string from the lemmatized tokens, separated by spaces
' '.join(lemmatized_words)

## Stopwords Removal

Stop words are the most common words in English: this, that, he, it, ... They don't add much meaning to a sentence.

In [None]:
from nltk.corpus import stopwords

In [None]:
# English stop words from the NLTK stopwords corpus; reused by the filters below
stop_words = stopwords.words('english')
stop_words

In [None]:
# example sentence whose stop words are removed in the next cell
phrase = 'here is an example sentence demostrating the removal of stopwords'
phrase

In [None]:
# tokenize the sentence and keep only the tokens that are not stop words
words = word_tokenize(phrase)
stripped_phrase = list(filter(lambda token: token not in stop_words, words))
" ".join(stripped_phrase)

## Punctuation removal

In [None]:
import string
# one list entry per character of string.punctuation
punctuation = list(string.punctuation)
punctuation

In [None]:
# tokenize a short sentence and drop the tokens that are punctuation characters
phrase = 'Hello! How are you?'
words = word_tokenize(phrase)
stripped_phrase = [token for token in words if not (token in punctuation)]
" ".join(stripped_phrase)

## Yelp reviews

In [None]:
# download the Yelp reviews and keep only the review text and its star rating
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/yelp.csv'
yelp = pd.read_csv(url).loc[:, ['text', 'stars']]
yelp.head()

In [None]:
# star rating column of the reviews
yelp.stars

In [None]:
# keep only the 1-star and 5-star reviews and renumber the rows from 0
is_one_or_five_stars = yelp['stars'].isin([1, 5])
yelp = yelp.loc[is_one_or_five_stars].reset_index(drop=True)
yelp

In [None]:
# raw text of the first review, before any preprocessing
print(yelp.loc[0,'text'])

In [None]:
# preprocess the first review: tokenize, lower-case, tag, drop stop words and
# punctuation, then lemmatize what is left
text = yelp.loc[0,'text']
words = [token.lower() for token in word_tokenize(text)]
lemmatized_words = [
    lemmatizer.lemmatize(token, pos=process_pos(tag))
    for token, tag in nltk.pos_tag(words)
    if not (token in stop_words or token in punctuation)
]
print(' '.join(lemmatized_words))

In [None]:
# Tokens to discard. A set is built once and gives constant-time membership
# tests instead of scanning the stop-word list for every token.
excluded_tokens = set(stop_words).union(punctuation)


def preprocess_review(text):
    """Return the cleaned, lemmatized form of one review.

    The review is tokenized, lower-cased and part-of-speech tagged. Stop
    words and punctuation tokens are dropped, and each remaining token is
    lemmatized with the WordNet part of speech derived from its tag.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces.
    """
    tokens = [token.lower() for token in word_tokenize(text)]
    # tag the full token list first: the filter is applied after tagging
    lemmas = [lemmatizer.lemmatize(token, pos=process_pos(tag))
              for token, tag in nltk.pos_tag(tokens)
              if token not in excluded_tokens]
    return ' '.join(lemmas)


# Apply the same preprocessing as the previous cell to every review and create
# the column in one assignment, which works whatever the index labels are.
yelp['processed_text'] = yelp['text'].apply(preprocess_review)

In [None]:
# reviews with the new processed_text column
yelp

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# features are the preprocessed review texts; the target is the star rating
X, y = yelp.processed_text, yelp.stars

In [None]:
# Fix the seed so the split, and every metric computed from it below, is the
# same after a kernel restart. The default train/test proportions are kept.
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=42)

In [None]:
# two-step pipeline: word and bigram counts limited to 5000 features,
# followed by a multinomial naive Bayes classifier
pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', MultinomialNB()),
])

In [None]:
# fit the vectorizer and the classifier on the training split
pipe.fit(X_train,y_train)

In [None]:
# predicted star ratings for the held-out reviews
y_test_pred = pipe.predict(X_test)

In [None]:
# confusion matrix of true vs. predicted star ratings on the test split
confusion_matrix(y_test,y_test_pred)

In [None]:
# accuracy of the predictions on the test split
accuracy_score(y_test,y_test_pred)

### How does the model choose between 5-star and 1-star ratings?

In [None]:
# vocabulary (feature names) learned by the fitted vectorizer step
fitted_vectorizer = pipe.named_steps['vect']
words = fitted_vectorizer.get_feature_names_out()

In [None]:
# class labels of the classifier; the next cell indexes feature_count_ rows in this order
pipe['clf'].classes_

In [None]:
# per-class feature counts of the fitted classifier: one row per class
class_word_counts = pipe['clf'].feature_count_
# row 0: number of times each word appears across all 1-star docs
bad_word_count = class_word_counts[0]
# row 1: number of times each word appears across all 5-stars docs
good_word_count = class_word_counts[1]

In [None]:
# create a DataFrame of words with their separate 1-star and 5-stars counts
# The vocabulary is read from the fitted vectorizer rather than from the
# `words` variable: this cell rebinds `words` to the DataFrame, so reading
# `words` here would make the cell fail when it is run a second time.
words = pd.DataFrame({'word' : pipe['vect'].get_feature_names_out(),
                      'bad' : bad_word_count,
                      'good' : good_word_count}).set_index('word')
words.head()

In [None]:
# add 1 to the counts to avoid dividing by 0
# Start from the raw count arrays so that re-running this cell always gives
# count + 1 instead of adding another 1 to the columns on every run.
words['bad'] = bad_word_count + 1
words['good'] = good_word_count + 1

In [None]:
# turn each column of counts into relative frequencies that sum to 1
words['bad'] = words['bad'].div(words['bad'].sum())
words['good'] = words['good'].div(words['good'].sum())
words.head()

In [None]:
# ratio of the two frequencies, in both directions: a large bad_ratio means
# the word is relatively more frequent in 1-star reviews, and vice versa
words['bad_ratio'] = words['bad'].div(words['good'])
words['good_ratio'] = words['good'].div(words['bad'])

In [None]:
# the 20 words with the highest good_ratio (most characteristic of 5-star reviews)
words.sort_values(by='good_ratio', ascending=False).head(20)

In [None]:
# the 20 words with the highest bad_ratio (most characteristic of 1-star reviews)
words.sort_values(by='bad_ratio', ascending=False).head(20)

In [None]:
# raw text of the first review whose processed text contains 'fedex'
yelp[yelp.processed_text.str.contains('fedex')].iloc[0].text

In [None]:
# raw text of the fourth review (position 3) whose processed text contains 'mozzarella'
print(yelp[yelp.processed_text.str.contains('mozzarella')].iloc[3].text)