In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
import scipy.stats as st

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yixin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yixin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yixin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

A. Using the **McDonalds Yelp Review CSV file**, **process the reviews**.
This means you should think briefly about:
* what stopwords to remove (should you add any custom stopwords to the set? Remove any stopwords?)
* what regex cleaning you may need to perform (for example, are there different ways of saying `hamburger` that you need to account for?)
* stemming/lemmatization (explain in your notebook why you used stemming versus lemmatization). 

Next, **count-vectorize the dataset**. Use the **`sklearn.feature_extraction.text.CountVectorizer`** examples from `Linear Algebra, Distance and Similarity (Completed).ipynb` and `Text Preprocessing Techniques (Completed).ipynb` (read the last section, `Vectorization Techniques`).

I do not want redundant features - for instance, I do not want `hamburgers` and `hamburger` to be two distinct columns in your document-term matrix. Therefore, I'll be taking a look to make sure you've properly performed your cleaning, stopword removal, etc. to reduce the number of dimensions in your dataset. 

In [2]:
stopword_list = stopwords.words('english')

In [3]:
negative_review_df = pd.read_csv('mcdonalds-yelp-negative-reviews.csv', encoding = 'latin-1')
negative_review_df['lower_case'] = negative_review_df['review'].str.lower()
negative_review_df['timestamp'] = negative_review_df['lower_case'].str.replace(
    r'(?:[0-1][0-9]:[0-5][0-9])|(?:[0-1]?[0-9]?:?[0-5]?[0-9](?:ish)?\s?(?:am|pm))','TIMESTAMP_TOKEN')
negative_review_df['stopword'] = negative_review_df['timestamp'].str.replace(
    r'\b('+'|'.join(stopword_list)+r')\b','')
negative_review_df['word_list'] = negative_review_df['stopword'].apply(word_tokenize)

In [4]:
punctuation_list = set(negative_review_df['stopword'].str.findall(r'['+string.punctuation+r']+').explode())

In [6]:
stemmer = PorterStemmer()
negative_review_df['stem'] = negative_review_df['word_list'].apply(lambda x: [stemmer.stem(word) for word in x if word not in punctuation_list])
negative_review_df['join'] = negative_review_df['stem'].apply(lambda x: ' '.join(x))
negative_review_df

Unnamed: 0,_unit_id,city,review,lower_case,timestamp,stopword,word_list,stem,join
0,679455653,Atlanta,"I'm not a huge mcds lover, but I've been to be...","i'm not a huge mcds lover, but i've been to be...","i'm not a huge mcds lover, but i've been to be...","' huge mcds lover, ' better ones. far ...","[', huge, mcds, lover, ,, ', better, ones, ., ...","[huge, mcd, lover, better, one, far, worst, on...",huge mcd lover better one far worst one ever f...
1,679455654,Atlanta,Terrible customer service. I came in at 9:30pm...,terrible customer service. i came in at 9:30pm...,terrible customer service. i came in at TIMEST...,terrible customer service. came TIMESTAMP_T...,"[terrible, customer, service, ., came, TIMESTA...","[terribl, custom, servic, came, timestamp_toke...",terribl custom servic came timestamp_token sto...
2,679455655,Atlanta,"First they ""lost"" my order, actually they gave...","first they ""lost"" my order, actually they gave...","first they ""lost"" my order, actually they gave...","first ""lost"" order, actually gave someone...","[first, ``, lost, '', order, ,, actually, gave...","[first, ``, lost, order, actual, gave, someon,...",first `` lost order actual gave someon one els...
3,679455656,Atlanta,I see I'm not the only one giving 1 star. Only...,i see i'm not the only one giving 1 star. only...,i see i'm not the only one giving 1 star. only...,see ' one giving 1 star. -25 star!!!...,"[see, ', one, giving, 1, star, ., -25, star, !...","[see, one, give, 1, star, -25, star, need, say]",see one give 1 star -25 star need say
4,679455657,Atlanta,"Well, it's McDonald's, so you know what the fo...","well, it's mcdonald's, so you know what the fo...","well, it's mcdonald's, so you know what the fo...","well, ' mcdonald', know food . review ref...","[well, ,, ', mcdonald, ', ,, know, food, ., re...","[well, mcdonald, know, food, review, reflect, ...",well mcdonald know food review reflect sole po...
...,...,...,...,...,...,...,...,...,...
1520,679500008,Portland,I enjoyed the part where I repeatedly asked if...,i enjoyed the part where i repeatedly asked if...,i enjoyed the part where i repeatedly asked if...,enjoyed part repeatedly asked right sa...,"[enjoyed, part, repeatedly, asked, right, sauc...","[enjoy, part, repeatedli, ask, right, sauc, 4,...",enjoy part repeatedli ask right sauc 4 time fu...
1521,679500224,Houston,Worst McDonalds I've been in in a long time! D...,worst mcdonalds i've been in in a long time! d...,worst mcdonalds i've been in in a long time! d...,worst mcdonalds ' long time! dirt everywhe...,"[worst, mcdonalds, ', long, time, !, dirt, eve...","[worst, mcdonald, long, time, dirt, everywher,...",worst mcdonald long time dirt everywher food b...
1522,679500608,New York,"When I am really craving for McDonald's, this ...","when i am really craving for mcdonald's, this ...","when i am really craving for mcdonald's, this ...","really craving mcdonald', seems closes...","[really, craving, mcdonald, ', ,, seems, close...","[realli, crave, mcdonald, seem, closest, big, ...",realli crave mcdonald seem closest big fan fas...
1523,679501257,Chicago,Two points right out of the gate: 1. Thuggery ...,two points right out of the gate: 1. thuggery ...,two points right out of the gate: 1. thuggery ...,two points right gate: 1. thuggery knows r...,"[two, points, right, gate, :, 1., thuggery, kn...","[two, point, right, gate, 1., thuggeri, know, ...",two point right gate 1. thuggeri know race lil...


In [7]:
vectorizer = CountVectorizer(min_df=2)
X = vectorizer.fit_transform(negative_review_df['join'])
X = X.toarray()
corpus_df = pd.DataFrame(X, columns=vectorizer.get_feature_names())
corpus_df

Unnamed: 0,00,05,08,09,10,100,10th,11,12,13,...,york,young,younger,yuck,yum,yummi,yup,zero,zombi,î_
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


**Answer:**

1. Read the data and lowercase 
2. Replace timestamp value by Regular Expression
3. Stopword removal
4. Stemming because I want to lower the number of features more.
5. Vectorization

B. **Stopwords, Stemming, Lemmatization Practice**

Using the `tale-of-two-cities.txt` file from Week 1:
* Count-vectorize the corpus. Treat each sentence as a document.

How many features (dimensions) do you get when you:
* Perform **stemming and then count-vectorization
* Perform **lemmatization** and then **count-vectorization**.
* Perform **lemmatization**, remove **stopwords**, and then perform **count-vectorization**?

In [8]:
text = open('tale-of-two-cities.txt', "r", encoding='utf8').read().replace('\n',' ')
sent_text = nltk.sent_tokenize(text) # this gives us a list of sentences
word_list_sent = [word_tokenize(sent) for sent in sent_text]

In [9]:
stemmer = PorterStemmer()
stem_only = []
for sent in word_list_sent:
    stem_only.append([stemmer.stem(word) for word in sent])

In [10]:
lemmatizer = WordNetLemmatizer()
lemma_only = []
for sent in word_list_sent:
    lemma_only.append([lemmatizer.lemmatize(word) for word in sent])

In [11]:
stopword_list = stopwords.words('english')
lemma_stop = []
for sent in word_list_sent:
    lemma_stop.append([lemmatizer.lemmatize(word) for word in sent if word not in stopword_list])

In [12]:
def get_num_features(word_list_sent):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([' '.join(sent) for sent in word_list_sent])
    return len(vectorizer.get_feature_names())

In [13]:
get_num_features(stem_only)

6659

In [14]:
get_num_features(lemma_only)

8910

In [15]:
get_num_features(lemma_stop)

8897

**Answer: As we can see here, stemming have the lowest number of features, while lemmatization have much more. Also removing stopword will decrease the number a little bit**