In [1]:
import numpy as np
import pandas as pd
import re
import string
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer

In [2]:
df = pd.read_csv('IMDB_Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df['review'].str.lower()

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [4]:

def remove_html_tags(reviews):
    return re.sub(r'<[^<]+?>', '', reviews)

df['review'] = df['review'].apply(remove_html_tags)

In [5]:
df['review'].str.contains('html')

0        False
1        False
2        False
3        False
4        False
         ...  
49995    False
49996    False
49997    False
49998    False
49999    False
Name: review, Length: 50000, dtype: bool

In [6]:
def remove_url(text):
    return re.sub(r'http[s]?://\S+|www\.\S+', '', text)

In [7]:
df['review'] = df['review'].apply(remove_url)

In [8]:
#Remove punctuation¶
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
exclude  = string.punctuation

In [10]:
def remove_punctuation(text):
    for char in exclude:
        text = text.replace(char, '')
    return text 

In [11]:
text = "string. with. punctuation?"
print(remove_punctuation(text))

string with punctuation


In [12]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

remove_punctuation("string. with. punctuation?")

'string with punctuation'

In [13]:
df['review'] = df['review'].apply(remove_punctuation)
df['review'][1]

'A wonderful little production The filming technique is very unassuming very oldtimeBBC fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece The actors are extremely well chosen Michael Sheen not only has got all the polari but he has all the voices down pat too You can truly see the seamless editing guided by the references to Williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece A masterful production about one of the great masters of comedy and his life The realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears It plays on our knowledge and our senses particularly with the scenes concerning Orton and Halliwell and the sets particularly of their flat with Halliwells murals decorating every surface are terribly well done'

Spelling correction

In [14]:
def spelling_correction(text):
    return TextBlob(text).correct()

incorrect_text = 'certains contion durinig sevveral ggeneration aree modifyy in the sammee mannere'

print(spelling_correction(incorrect_text))

certains condition during several generation are modify in the same manner


Removing stop words

In [15]:
stop_words = set(stopwords.words('english'))
len(stop_words)
# stop_words

179

In [16]:
def remove_stopwords(text):
    return [word for word in text.split() if word not in stop_words]

# def remove_stopwords(text):
#     lst = []
#     for word in text.split():
#         if word not in stop_words:
#             lst.append(word)
#     return " ".join(lst)

text = 'Natural language processing is an interdisciplinary subfield of computer science and information retrieval. It is primarily concerned with giving computers the ability to support and manipulate human language'
remove_stopwords(text)

['Natural',
 'language',
 'processing',
 'interdisciplinary',
 'subfield',
 'computer',
 'science',
 'information',
 'retrieval.',
 'It',
 'primarily',
 'concerned',
 'giving',
 'computers',
 'ability',
 'support',
 'manipulate',
 'human',
 'language']

In [17]:
df['review'] = df['review'].apply(remove_stopwords)
df['review'][1]


# df['review'].apply(lambda x: len(x.split()) - len(remove_stopwords(x).split()))

['A',
 'wonderful',
 'little',
 'production',
 'The',
 'filming',
 'technique',
 'unassuming',
 'oldtimeBBC',
 'fashion',
 'gives',
 'comforting',
 'sometimes',
 'discomforting',
 'sense',
 'realism',
 'entire',
 'piece',
 'The',
 'actors',
 'extremely',
 'well',
 'chosen',
 'Michael',
 'Sheen',
 'got',
 'polari',
 'voices',
 'pat',
 'You',
 'truly',
 'see',
 'seamless',
 'editing',
 'guided',
 'references',
 'Williams',
 'diary',
 'entries',
 'well',
 'worth',
 'watching',
 'terrificly',
 'written',
 'performed',
 'piece',
 'A',
 'masterful',
 'production',
 'one',
 'great',
 'masters',
 'comedy',
 'life',
 'The',
 'realism',
 'really',
 'comes',
 'home',
 'little',
 'things',
 'fantasy',
 'guard',
 'rather',
 'use',
 'traditional',
 'dream',
 'techniques',
 'remains',
 'solid',
 'disappears',
 'It',
 'plays',
 'knowledge',
 'senses',
 'particularly',
 'scenes',
 'concerning',
 'Orton',
 'Halliwell',
 'sets',
 'particularly',
 'flat',
 'Halliwells',
 'murals',
 'decorating',
 'every',

In [18]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               # emoticons
                               u"\U0001F300-\U0001F5FF"
                               # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"
                               # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"
                               # flags (105)
                               u"\U00002702-\U00002780"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [19]:
text_with_emoji = "Hello! 😀 This is a sample text with emoji. "
text_without_emoji = remove_emoji(text_with_emoji)
print(text_without_emoji) 

Hello!  This is a sample text with emoji. . Today i am happy 


In [22]:
# Tokenization
sent1 = 'Lower casing: Converting a word to lower case (NLP -> nlp). Words like Book and book mean the same but when not converted to the lower case those two are represented as two different words in the vector space model (resulting in more dimensions). Output: books are on the table.'
print(sent_tokenize(sent1))
print(word_tokenize(sent1))

['Lower casing: Converting a word to lower case (NLP -> nlp).', 'Words like Book and book mean the same but when not converted to the lower case those two are represented as two different words in the vector space model (resulting in more dimensions).', 'Output: books are on the table.']
['Lower', 'casing', ':', 'Converting', 'a', 'word', 'to', 'lower', 'case', '(', 'NLP', '-', '>', 'nlp', ')', '.', 'Words', 'like', 'Book', 'and', 'book', 'mean', 'the', 'same', 'but', 'when', 'not', 'converted', 'to', 'the', 'lower', 'case', 'those', 'two', 'are', 'represented', 'as', 'two', 'different', 'words', 'in', 'the', 'vector', 'space', 'model', '(', 'resulting', 'in', 'more', 'dimensions', ')', '.', 'Output', ':', 'books', 'are', 'on', 'the', 'table', '.']


In [21]:
# Stemming
ps = PorterStemmer()
def stem_words(text):
    
    return " ".join([ps.stem(word) for word in word_tokenize(text)])
sample = 'walk walking walked'
stem_words(sample)

'walk walk walk'