In [1]:
import numpy as np
import pandas as pd

In [2]:
imdb=pd.read_csv('/content/IMDB Dataset.csv')

In [3]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
#1 change it to lower case
imdb['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [5]:
imdb['review'].str.lower()

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [6]:
#2 Remove HTML tag
import re
# Compiled once instead of on every call. `[^>]*` (unlike `.*?`) also matches
# newlines, so a tag that wraps onto a second line is still recognised.
_HTML_TAG_RE=re.compile(r'<[^>]*>')

def remove_html_tag(text, replacement=""):
  """Strip HTML/XML-style tags (anything from '<' up to the next '>') from text.

  Args:
    text: The string to clean.
    replacement: Literal text substituted for each tag. Defaults to "" so
      existing callers get the same result as before. Pass " " to keep words
      on either side of a tag apart (e.g. "time.<br /><br />this" would
      otherwise collapse to "time.this").

  Returns:
    A copy of `text` with every tag replaced by `replacement`.
  """
  # A callable is used so that backslashes in `replacement` are inserted
  # literally rather than being interpreted as regex escape sequences.
  return _HTML_TAG_RE.sub(lambda _match: replacement, text)

In [7]:
text='<p>dcdcbdshbsdhc <br></br>ziadncc <br>'
remove_html_tag(text)

'dcdcbdshbsdhc ziadncc '

In [8]:
imdb['review']=imdb['review'].apply(remove_html_tag)

In [9]:
#3 Remove URL links
import re
# Compiled once instead of on every call.
# - IGNORECASE: "HTTP://..." and "WWW...." are URLs too.
# - \b before "www": do not chop the tail off ordinary words such as "awww.that".
_URL_RE=re.compile(r'https?://\S+|\bwww\.\S+', flags=re.IGNORECASE)

def remove_url(text):
  """Remove URLs from text.

  A URL is taken to be either "http://" / "https://" followed by a run of
  non-whitespace characters, or a word starting with "www." followed by a run
  of non-whitespace characters. Matching is case-insensitive. Punctuation
  directly attached to the end of a URL is removed together with it, because
  the match runs up to the next whitespace.

  Args:
    text: The string to clean.

  Returns:
    A copy of `text` with every matched URL deleted (surrounding whitespace
    is left untouched).
  """
  return _URL_RE.sub("", text)


In [10]:
text1="Check out my website https://www.w3schools.com/python/python_regex.asp"

In [11]:
remove_url(text1)

'Check out my website '

In [12]:
#4 removing punctuation
import string
exclude=string.punctuation

In [13]:

def remove_punc(text):
  """Return `text` with every character listed in the global `exclude` deleted."""
  # A mapping whose values are all None tells str.translate to drop those characters.
  deletion_table=str.maketrans(dict.fromkeys(exclude))
  return text.translate(deletion_table)

In [14]:
imdb['review'].apply(remove_punc)

0        One of the other reviewers has mentioned that ...
1        A wonderful little production The filming tech...
2        I thought this was a wonderful way to spend ti...
3        Basically theres a family where a little boy J...
4        Petter Matteis Love in the Time of Money is a ...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    I am a Catholic taught in parochial elementary...
49998    Im going to have to disagree with the previous...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [15]:
#5 Chat-word treatment: expand abbreviations, e.g. HRU -> how are you
chatwords={'ASAP':'as soon as possible','HRU':'How are you'}

In [19]:
def chat_conversion(text):
  """Expand chat abbreviations in `text` using the global `chatwords` lookup.

  The text is split on whitespace. Each token is upper-cased for the lookup:
  when the upper-cased token is a key of `chatwords` it is replaced by the
  mapped expansion, otherwise the token is kept exactly as written. The
  resulting tokens are joined back together with single spaces.
  """
  expanded=(chatwords.get(token.upper(), token) for token in text.split())
  return " ".join(expanded)

In [20]:
chat_conversion('HRU man')

'How are you man'

In [21]:
from textblob import TextBlob

In [22]:
incorrecttext='certaain conditioons duirring seveal ggenration aree moodified'
textblb=TextBlob(incorrecttext)
textblb.correct().string

'certain conditions during several generation are modified'

In [37]:
#6 Stopwords
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

len(STOP_WORDS)

326

In [38]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("We just opened our wings, the flying part is coming soon")

for token in doc:
    if token.is_stop:
        print(token)

We
just
our
the
part
is


In [39]:
def preprocess(text):
    """Drop stop words from `text`.

    The text is run through the global spaCy pipeline `nlp`; tokens whose
    `is_stop` flag is set are discarded and the text of the remaining tokens
    is joined with single spaces.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)

In [40]:
preprocess("Musk wants time to prepare for a trial over his")

'Musk wants time prepare trial'

In [41]:
print(STOP_WORDS)

{'name', 'otherwise', 'hers', 'both', 'seeming', 'before', 'every', 'under', 'it', 'nor', 'also', 'been', 'seemed', 'of', 'them', 'last', 'until', 'anyone', 'one', 'hereafter', 'themselves', 'elsewhere', 'few', 'on', '‘ll', 'does', 'than', 'then', 'hence', 'call', 'but', 'first', 'n‘t', 'his', 'after', 'much', 'many', 'your', 'except', 'was', 'whoever', 'or', 'once', 'per', 'yourself', 'its', 'front', 'wherever', 'serious', 'now', 'nine', 'the', 'almost', 'yourselves', 'be', 'back', 'sixty', 'doing', 'for', 'beyond', 'will', 'among', 'somehow', 'across', 'whatever', 'me', 'therein', 'five', 'full', 'less', '’ve', 'us', 'seem', 'everywhere', 'cannot', 'please', 'twelve', 'anyhow', 'various', 'third', 'this', 'thence', 'might', 'to', 'can', '’m', 'so', 'give', 'afterwards', 'during', 'done', 'have', '‘re', 'those', 'least', 'were', 'we', 'beside', 'unless', 'in', 'side', 'something', 'others', 'nevertheless', 'has', 'into', 'when', 'am', 'my', 'around', 'move', 'between', 'itself', 'alwa

In [42]:
#7 Handling Emojis
import re
# Compiled once instead of on every call. Besides the three original blocks this
# also covers flags, the newer pictograph blocks and the BMP symbol/dingbat
# blocks. A variation selector (U+FE0F) or zero-width joiner (U+200D) is consumed
# only when it directly follows an emoji, so these characters are left alone in
# ordinary text.
_EMOJI_RE=re.compile(
  "(?:["
  "\U0001F600-\U0001F64F"  # emoticons
  "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes skin-tone modifiers)
  "\U0001F680-\U0001F6FF"  # transport & map symbols
  "\U0001F1E6-\U0001F1FF"  # regional indicators (flags)
  "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
  "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
  "\u2600-\u26FF"          # miscellaneous symbols
  "\u2700-\u27BF"          # dingbats
  "][\uFE0F\u200D]*)+"
)

def remove_emoji(text):
  """Remove emoji characters from text.

  Args:
    text: The string to clean.

  Returns:
    A copy of `text` with every run of emoji deleted, together with any
    variation selectors / zero-width joiners attached to them. Surrounding
    whitespace is left untouched. Note that the two BMP blocks also contain
    non-emoji symbols (e.g. check marks, card suits), which are removed too.
  """
  return _EMOJI_RE.sub('', text)

In [43]:
remove_emoji('Loved the movie.It was 😘')

'Loved the movie.It was '

In [47]:
pip install emoji --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=f9e6a6c428773167738d710d0fb55879aa8439482158c3e5c0e046b3effd3ac5
  Stored in directory: /root/.cache/pip/wheels/86/62/9e/a6b27a681abcde69970dbc0326ff51955f3beac72f15696984
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.2.0


In [48]:
import emoji
print(emoji.demojize('python is 😘'))

python is :face_blowing_a_kiss:


In [49]:
# Tokenization
#word tokenization
sent1='I am going to Peshawar'
sent1.split()

['I', 'am', 'going', 'to', 'Peshawar']

In [50]:
#Sentence tokenization
sent2='I am going to Peshawar.I will stay there for 3 days.let see what happens'
sent2.split('.')

['I am going to Peshawar',
 'I will stay there for 3 days',
 'let see what happens']

In [53]:
# use library for tokenization
import nltk
from nltk.tokenize  import word_tokenize,sent_tokenize

In [55]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [56]:
sent3='I am going to visit Peshawar!'
word_tokenize(sent3)

['I', 'am', 'going', 'to', 'visit', 'Peshawar', '!']

In [57]:
text="Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged."

In [58]:
sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry.',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.",
 'It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.']

In [61]:
sent6='I have a Phd in AI'


In [59]:
# Best Library for tokenization is Spacy
import spacy
nlp=spacy.load('en_core_web_sm')

In [62]:
doc1=nlp(sent6)

In [63]:
for token in doc1:
  print(token)

I
have
a
Phd
in
AI


In [64]:
#Stemming reduces each word to its root (stem) form
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
def stem_words(text):
  """Stem every whitespace-separated token of `text` with the global stemmer `ps`.

  Tokens are produced by `str.split()`, so punctuation attached to a word is
  passed to the stemmer as part of that word. The stemmed tokens are joined
  back together with single spaces.
  """
  stemmed_tokens=map(ps.stem, text.split())
  return " ".join(stemmed_tokens)

In [65]:
sample='walk walks walking walked'
stem_words(sample)

'walk walk walk walk'

In [66]:
s2='Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.'
print(s2)

Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.


In [67]:
stem_words(s2)

'lorem ipsum is simpli dummi text of the print and typeset industry. lorem ipsum ha been the industri standard dummi text ever sinc the 1500s, when an unknown printer took a galley of type and scrambl it to make a type specimen book.'

In [76]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [78]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnetlemmitizer=WordNetLemmatizer()
sentence="Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book."
# NOTE(review): '.' is listed twice and ',' is absent, so comma tokens are kept;
# possibly ',' was intended - confirm before changing.
punctuations='?:!..;'
# Build a filtered list rather than calling .remove() on the list being iterated:
# removing during iteration shifts the remaining items left, so the token right
# after each removed one is skipped (e.g. the '!' in "...?!" would survive).
sentence_words=[word for word in nltk.word_tokenize(sentence) if word not in punctuations]
print("{0:20}{1:20}".format('Word',"Lemma"))
for word in sentence_words:
  # pos='v' makes the lemmatizer treat every token as a verb.
  print("{0:20}{1:20}".format(word,wordnetlemmitizer.lemmatize(word,pos='v')))

Word                Lemma               
Lorem               Lorem               
Ipsum               Ipsum               
is                  be                  
simply              simply              
dummy               dummy               
text                text                
of                  of                  
the                 the                 
printing            print               
and                 and                 
typesetting         typeset             
industry            industry            
Lorem               Lorem               
Ipsum               Ipsum               
has                 have                
been                be                  
the                 the                 
industry            industry            
's                  's                  
standard            standard            
dummy               dummy               
text                text                
ever                ever                
since           