

# Text Preprocessing

In [13]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [14]:
# Fetch the dataset zip with the Kaggle CLI; in the recorded run the archive
# was written to /content.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:01<00:00, 33.4MB/s]
100% 25.7M/25.7M [00:01<00:00, 21.7MB/s]


In [15]:

import zipfile
# Unpack the downloaded archive into /content. The context manager closes the
# zip file even if extraction raises part-way through.
with zipfile.ZipFile('/content/imdb-dataset-of-50k-movie-reviews.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [16]:
import pandas as pd

In [17]:
# Read the extracted CSV into a DataFrame and show its first rows.
df = pd.read_csv("/content/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [18]:
df["review"][5]   # review at index label 5 (the sixth row, because the index starts at 0)

'Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. It just never gets old, despite my having seen it some 15 or more times in the last 25 years. Paul Lukas\' performance brings tears to my eyes, and Bette Davis, in one of her very few truly sympathetic roles, is a delight. The kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. And the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. If I had a dozen thumbs, they\'d all be "up" for this movie.'

## 1. Lower-Casing

In [19]:
df["review"][5].lower()   # str.lower() gives a lower-cased version of this single review

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [20]:
df["review"].str.lower()      # preview: lower-case every review (result is not assigned back here)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

### 2. Remove HTML Tags

In [21]:
# Store the lower-cased text back into the "review" column.
df["review"] = df["review"].str.lower()

In [22]:
import re

In [23]:
# Matches HTML comments and real tags only: a tag must start with a letter
# (optionally preceded by "/"), so plain text such as "1 < 2 and 3 > 2" or
# "<3" is left untouched. [^>]* also spans newlines, so multi-line tags match.
# Compiled once instead of on every call.
_HTML_TAG_PATTERN = re.compile(r'<!--.*?-->|</?[A-Za-z][^>]*>', re.DOTALL)


def remove_html_tags(text):
    """Replace every HTML tag or comment in ``text`` with a single space.

    Parameters
    ----------
    text : str
        Input string, possibly containing markup such as ``<br />``.

    Returns
    -------
    str
        ``text`` with each tag/comment substituted by one space. A space
        (rather than an empty string) is used so that words separated only
        by a tag do not get glued together.
    """
    return _HTML_TAG_PATTERN.sub(" ", text)

In [24]:
# Preview only: apply remove_html_tags to every review without assigning the result back.
df["review"].apply(remove_html_tags)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production.   the filming t...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

### Remove URLs

In [25]:
# Overwrite the "review" column with the tag-stripped text.
df["review"] = df["review"].apply(remove_html_tags)

In [26]:
# Raw string: the original non-raw literal contained "\(" and "\)", which are
# invalid string escapes (a SyntaxWarning on recent Python versions). The regex
# itself is unchanged. Note that "[$-_@.&+]" contains the character *range*
# '$'..'_', which is what lets '/', ':', '?' and '=' inside a URL match.
# Compiled once instead of on every call.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def remove_urls(text):
    """Replace every ``http://`` / ``https://`` URL in ``text`` with one space.

    Parameters
    ----------
    text : str
        Input string. The scheme is matched case-sensitively (lower-case
        ``http``/``https`` only).

    Returns
    -------
    str
        ``text`` with each matched URL substituted by a single space.
    """
    return _URL_PATTERN.sub(" ", text)

In [27]:
# Preview only: apply remove_urls to every review without assigning the result back.
df["review"].apply(remove_urls)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production.   the filming t...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [28]:
# Overwrite the "review" column with the URL-free text.
df["review"] = df["review"].apply(remove_urls)

### Remove Punctuation

In [29]:

def remove_punctuation(text):
    """Delete every character of ``text`` that is neither a word character nor whitespace.

    "Word character" follows the regex ``\\w`` class, so letters, digits and
    the underscore are kept. Removed characters are dropped outright (not
    replaced by a space), e.g. ``"it's"`` becomes ``"its"``.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    stripped = not_word_or_space.sub('', text)
    return stripped

In [30]:
# Preview only: apply remove_punctuation to every review without assigning the result back.
df["review"].apply(remove_punctuation)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production   the filming te...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [31]:
# Overwrite the "review" column with the punctuation-free text.
df["review"] = df["review"].apply(remove_punctuation)

### Spelling Checker

In [None]:
from textblob import TextBlob

def correct_spelling(text):
    """Return ``text`` after running it through TextBlob's ``correct()``.

    The input is wrapped in a ``TextBlob``, ``correct()`` is called on it and
    the result is converted back to a plain ``str``.
    """
    return str(TextBlob(text).correct())

In [None]:
# Preview on the first few reviews only. correct_spelling runs once per row,
# and the assignment cell below already performs the full pass over the column,
# so previewing every row here would do the whole job twice.
df["review"].head().apply(correct_spelling)

In [None]:
# Overwrite the "review" column with the spell-corrected text.
df["review"] = df["review"].apply(correct_spelling)

### Remove Stop Words

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# nltk.word_tokenize also needs the Punkt tokenizer data, which the original
# cell never fetched (word_tokenize raises LookupError without it). Recent NLTK
# releases look for 'punkt_tab', older ones for 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Built once when the cell runs instead of on every call: the function is
# applied per review, and the stop-word list does not change between calls.
ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))


def remove_stop_words(text):
    """Drop English stop words from ``text``.

    Parameters
    ----------
    text : str
        Input string; it is split into tokens with ``nltk.word_tokenize``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Tokens are compared
        case-insensitively (via ``str.lower``) against NLTK's English
        stop-word list.
    """
    # NOTE(review): if punctuation was stripped earlier, contractions such as
    # "dont" presumably no longer match apostrophe forms in the stop-word list
    # -- confirm whether that matters for the downstream model.
    words = nltk.word_tokenize(text)
    return " ".join(word for word in words if word.lower() not in ENGLISH_STOP_WORDS)

In [None]:
# Overwrite the "review" column with the stop-word-free text.
df["review"] = df["review"].apply(remove_stop_words)