## Data Preprocessing

#### Sample Dataset used 
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
import pandas as pd

# Load the IMDB 50k movie-review CSV (relative path: the file must sit in the
# notebook's working directory) and preview the first rows.
df = pd.read_csv("IMDB Dataset.csv")
df.head()

### 1. Lower Case

In [None]:
# Lower-case every review in place, then preview the result.
df["review"] = df["review"].str.lower()
df.head()

In [None]:
# Display the full text of the review stored under index label 2.
df["review"][2]

#### 2. Remove HTML Tags

In [None]:
import re

def remove_html_tag(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is deleted on its own and the text
    between tags is kept. ``.`` does not match a newline here, so a tag that
    is split across lines is left untouched.
    """
    return re.sub(r"<.*?>", "", text)

In [None]:
# Strip tag-like markup from every review, then preview the result.
df["review"] = df["review"].apply(remove_html_tag)
df.head()

#### 3. Remove URL

In [None]:
def remove_url(text):
    """Return ``text`` with URLs deleted.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character. The surrounding whitespace
    is kept as-is.
    """
    url_regex = re.compile(r"https?://\S+|www\.\S+")
    return url_regex.sub("", text)

In [None]:
# Drop URLs from every review, then preview the result.
df["review"] = df["review"].apply(remove_url)
df.head()

#### 4. Remove Punctuation

In [None]:
import string

# Characters to strip: the standard library's punctuation constant. The
# punctuation-removal helpers in this notebook delete every character in it.
exclude = string.punctuation
exclude

In [None]:
# method 1: Time consuming
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Runs one ``str.replace`` pass over the whole string per punctuation
    character, which is why this variant is the slow one.
    """
    cleaned = text
    for symbol in exclude:
        cleaned = cleaned.replace(symbol, "")
    return cleaned

In [None]:
# method 2: Preferred
def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Builds a deletion table with ``str.maketrans`` and applies it in a single
    ``str.translate`` call.
    """
    deletion_table = str.maketrans("", "", exclude)
    return text.translate(deletion_table)

In [None]:
# Strip punctuation with the translate-based helper (method 2). It returns the
# same text as remove_punctuation but avoids one full replace pass per
# punctuation character on every review.
df['review'] = df['review'].apply(remove_punc)
df.head()

#### 5. Chat Conversion Handling

In [None]:
# Lookup table: upper-case chat abbreviation -> expanded phrase.
# Each key appears exactly once; a dict literal silently keeps only the last
# value for a repeated key, so duplicates would just hide typos.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "FYI": "For Your Information",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
}

In [None]:
def chat_conversion(text):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace; each word whose upper-cased form is a key
    of ``chat_words`` is replaced by the mapped phrase, every other word is
    kept unchanged. Words are re-joined with single spaces.
    """
    return " ".join(
        chat_words.get(token.upper(), token) for token in text.split()
    )

In [None]:
# Expand chat abbreviations in every review, then preview the result.
df["review"] = df["review"].apply(chat_conversion)
df.head()

#### 6. Spelling Correction/Incorrect text handling

In [None]:
from textblob import TextBlob

In [None]:
# Deliberately misspelled sample sentence used to demonstrate spelling correction.
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

# Wrap the sample in a TextBlob so its correct() method can be used.
textBlb = TextBlob(incorrect_text)

# Show the corrected text as a plain string (notebook display).
textBlb.correct().string

#### 7. Remove Stopwords

In [None]:
!pip install nltk

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

In [None]:
# Cache for stop-word lists so the NLTK corpus is read once, not once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    Args:
        text: Input string; it is split on whitespace.
        stop_words: Optional collection of words to drop. Defaults to NLTK's
            English stop-word list (requires the ``stopwords`` corpus to be
            downloaded).

    Returns:
        The remaining words joined by single spaces. Stop words are dropped
        entirely rather than replaced by empty strings, so no doubled or
        leading spaces are left behind. Matching is case-sensitive.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
# Series.apply returns a new Series; assign it back, otherwise the stop-word
# removal is computed and thrown away.
df['review'] = df['review'].apply(remove_stopwords)
df.head()

#### 8. Remove emoji

In [None]:
# method 1:
import re

# Compiled once at definition time instead of on every call.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# non-emoji characters such as CJK ideographs (U+4E00..U+9FFF); narrow it if
# the input may contain such text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with every run of characters in the emoji ranges removed.

    Fixes the original NameError: the pattern was bound to ``pattern`` but the
    return statement referenced an undefined ``emoji_pattern``.
    """
    return _EMOJI_PATTERN.sub("", text)

In [None]:
# Remove emoji characters from every review.
df["review"] = df["review"].apply(remove_emoji)

In [None]:
# method 2:

!pip install emoji

In [None]:
import emoji
# Alternative to deleting emojis: emoji.demojize() rewrites them as text
# (see the emoji package documentation for the exact output format).
print(emoji.demojize('Python is 🔥'))

### 9. Tokenization

#### a. Using Split Function

In [None]:
# word tokenization
# Store the tokens in their own column. Overwriting 'review' with lists would
# break every later step that uses the .str accessor on it (for example the
# sentence split, which would then yield NaN for every row).
df['word_tokens'] = df['review'].str.split()
df.head()

In [None]:
# sentence tokenization
# Requires df['review'] to still hold plain strings. The result goes into its
# own column so the review text is not destroyed.
# NOTE: if punctuation was already stripped from the reviews, no '.' remains
# and each review comes back as a single-element list.
df['sentence_tokens'] = df['review'].str.split('.')
df.head()

#### b. Using Regular Expression

In [None]:
# word tokenization
import re
sent3 = 'I am going to delhi!'
# Raw string: "\w" in a normal string literal is an invalid escape sequence
# and triggers a warning on current Python versions.
# A token is a run of word characters and apostrophes, so punctuation such as
# the trailing "!" is dropped.
tokens = re.findall(r"[\w']+", sent3)
tokens

In [None]:
# sentence tokenization
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
# Split after a sentence terminator followed by ANY whitespace. The sample's
# first sentence ends with "?" and a newline, which a literal-space pattern
# ('[.!?] ') never matches, so the whole text came back as one element.
# The terminator and the whitespace are consumed by the split.
sentences = re.split(r"[.!?]\s+", text)
sentences

#### c. NLTK

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# word tokenization
# Tokenize a sample sentence with NLTK's word_tokenize (notebook display).
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

In [None]:
# sentence tokenization
# Split a sample paragraph into sentences with NLTK's sent_tokenize
# (notebook display).
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sent_tokenize(text)

#### d. Spacy

In [None]:
import spacy

# Load spaCy's 'en_core_web_sm' pipeline and run it over `text`, which must
# already be defined by an earlier cell.
nlp=spacy.load('en_core_web_sm')
doc=nlp(text)
# Plain list of the token strings.
token_list=[token.text for token in doc]

# Print each token with its pos_ and dep_ attributes (part-of-speech and
# dependency labels; see the spaCy Token documentation).
for token in doc:
    print(token.text, token.pos_, token.dep_)

#### e. Transformer-based

In [None]:
# BertTokenizer comes from the Hugging Face `transformers` package; import it
# here so the cell does not fail with NameError.
from transformers import BertTokenizer

# Load the tokenizer for 'bert-base-uncased' and tokenize a sample string.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.tokenize("Text")

In [None]:
# XLNetTokenizer comes from the Hugging Face `transformers` package; import it
# here so the cell does not fail with NameError.
# NOTE(review): this tokenizer presumably also needs the `sentencepiece`
# package installed - confirm against the transformers documentation.
from transformers import XLNetTokenizer

# Load the tokenizer for 'xlnet-base-cased' and tokenize a sample string.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
tokenizer.tokenize("Text")

### 10. Stemmer

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
# Single shared Porter stemmer instance used by stem_words.
ps = PorterStemmer()


def stem_words(text):
    """Return ``text`` with every whitespace-separated word replaced by its Porter stem.

    Words are re-joined with single spaces.
    """
    stems = map(ps.stem, text.split())
    return " ".join(stems)

In [None]:
# Call the helper that is actually defined (stem_words) on the variable that is
# actually assigned (text). The original stem_word(sample) referenced two
# undefined names and raised NameError.
text = "Text"
stem_words(text)

### 11. Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

wordnet_lemmatizer = WordNetLemmatizer()
text = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"
tokens = nltk.word_tokenize(text)

# Build a filtered list instead of calling tokens.remove() while looping over
# tokens: mutating a list during iteration skips the element after each
# removal. This also defines sentence_words, which the print loop reads but
# the original never assigned (NameError).
sentence_words = [word for word in tokens if word not in punctuations]

# Two left-aligned, 20-character-wide columns: the token and its lemma when
# treated as a verb (pos='v').
print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))