# Welcome to the practice notebook
---

Importing modules

In [1]:
import pandas as pd 
import string 

# Importing Natural Language Processing toolkit 
import nltk

# Downloading the NLTK english stop words
nltk.download('stopwords')

# Importing the NLTK english stop words 
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



### Task Description:

In this practical task, we will work with a dataset containing customer reviews for a product. The goal is to load the dataset, clean and preprocess the reviews, and then tokenize the reviews while removing stop words.

Good luck! :)

Let's start from loading the dataset:

In [3]:
data = pd.read_csv("reviews.csv")
data.head()

Unnamed: 0,review
0,Our experience at Rancho Valencia was absolute...
1,Amazing place. Everyone was extremely warm and...
2,We booked a 3 night stay at Rancho Valencia to...
3,Currently in bed writing this for the past hr ...
4,I live in Md and the Aloft is my Home away fro...


Convert all the reviews to lower case <br>

In [4]:
data.review = data.review.str.lower()
data.head()

Unnamed: 0,review
0,our experience at rancho valencia was absolute...
1,amazing place. everyone was extremely warm and...
2,we booked a 3 night stay at rancho valencia to...
3,currently in bed writing this for the past hr ...
4,i live in md and the aloft is my home away fro...


Remove all the punctuations

In [5]:
data.review = data.review.str.translate(str.maketrans('','',string.punctuation))
data.head()

Unnamed: 0,review
0,our experience at rancho valencia was absolute...
1,amazing place everyone was extremely warm and ...
2,we booked a 3 night stay at rancho valencia to...
3,currently in bed writing this for the past hr ...
4,i live in md and the aloft is my home away fro...


Tokenization and removing the stopwords

In [7]:
english_stopwords = stopwords.words('english')
english_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
def tokenize(review):
    review = str(review)
    tokens = nltk.word_tokenize(review)
    tokens = [t for t in tokens if t not in english_stopwords]
    return tokens

data['tokens'] = data.review.apply(tokenize)
data.head()

Unnamed: 0,review,tokens
0,our experience at rancho valencia was absolute...,"[experience, rancho, valencia, absolutely, per..."
1,amazing place everyone was extremely warm and ...,"[amazing, place, everyone, extremely, warm, we..."
2,we booked a 3 night stay at rancho valencia to...,"[booked, 3, night, stay, rancho, valencia, pla..."
3,currently in bed writing this for the past hr ...,"[currently, bed, writing, past, hr, 12, dogs, ..."
4,i live in md and the aloft is my home away fro...,"[live, md, aloft, home, away, homewe, stayed, ..."
