## NLP Text Preprocessing

#### Data ingestion from API

In [17]:
import requests
import pandas as pd
import re
import os

In [2]:
# Genre id -> genre name lookup. Starts empty; get_movie_data reads it at call time.
movie_genres = dict()

In [3]:
def get_genre_data(api_url, timeout=30):
    """Fetch the genre payload from the given API URL.

    Args:
        api_url: Fully-formed URL of the genre endpoint (query string included).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(api_url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload to the caller.
    response.raise_for_status()
    return response.json()

In [4]:
def get_movie_data(api_url, timeout=30):
    """Collect title, overview, genres and popularity for every listed movie.

    Pages are requested one after another (``page=1, 2, ...``) until a page
    comes back without any results. Genre ids are translated to names through
    the module-level ``movie_genres`` mapping, which is read at call time.

    Args:
        api_url: URL of a paginated movie endpoint; the page number is sent
            as the ``page`` query parameter.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``overview``, ``genres``
        (comma-separated genre names) and ``popularity``.

    Raises:
        requests.HTTPError: If the very first page has no results and the
            server answered with an HTTP error status.
    """
    all_movie_data = []
    page = 1

    while True:
        response = requests.get(api_url, params={'page': page}, timeout=timeout)
        payload = response.json()

        # An error payload carries no 'results' key; treat it like an empty page
        # instead of raising KeyError and losing everything collected so far.
        results = payload.get('results') if isinstance(payload, dict) else None
        if not results:
            if page == 1:
                # Nothing fetched at all: surface a bad key / bad URL loudly.
                response.raise_for_status()
            break

        for single_movie in results:
            # Skip ids missing from the lookup; a None entry would make
            # str.join raise TypeError.
            genre_names = [
                movie_genres[genre_id]
                for genre_id in single_movie['genre_ids']
                if genre_id in movie_genres
            ]
            all_movie_data.append({
                'title': single_movie['title'],
                'overview': single_movie['overview'],
                'genres': ', '.join(genre_names),
                'popularity': single_movie['popularity'],
            })
        page += 1

    return all_movie_data

In [5]:
# Fetch movie genre data from API
# The API key is read from the TMDB_API_KEY environment variable so it does not
# have to live in the notebook; the literal is only a fallback for existing setups.
# NOTE(review): the fallback key is committed in plain text -- consider rotating and removing it.
tmdb_api_key = os.environ.get('TMDB_API_KEY', '8265bd1679663a7ea12ac168da84d2e8')
genre_api_url = f'https://api.themoviedb.org/3/genre/movie/list?api_key={tmdb_api_key}&language=en-US'

In [6]:
# Download the genre list and turn it into an id -> name lookup, then display it.
genre_data = get_genre_data(genre_api_url)
genre_list = genre_data['genres']
movie_genres = dict((entry['id'], entry['name']) for entry in genre_list)
movie_genres

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [7]:
# Fetch movie data from API
# The API key is read from the TMDB_API_KEY environment variable so it does not
# have to live in the notebook; the literal is only a fallback for existing setups.
# NOTE(review): the fallback key is committed in plain text -- consider rotating and removing it.
tmdb_api_key = os.environ.get('TMDB_API_KEY', '8265bd1679663a7ea12ac168da84d2e8')
# No page number here: get_movie_data sends `page` via `params` on every request,
# so hard-coding page=1 in the URL would put the parameter in the query twice.
movie_api_url = f'https://api.themoviedb.org/3/movie/top_rated?api_key={tmdb_api_key}&language=en-US'

In [8]:
# Download every page, tabulate the records and cache them on disk.
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs('artifacts', exist_ok=True)
movie_data = pd.DataFrame(get_movie_data(movie_api_url))
movie_data.to_csv('artifacts/movie_data.csv', index=False)

#### Load Data

In [40]:
df = pd.read_csv('artifacts/movie_data.csv')
df.head()

Unnamed: 0,title,overview,genres,popularity
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama, Crime",146.944
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime",121.87
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime",82.27
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War",66.67
4,12 Angry Men,The defense and the prosecution have rested an...,Drama,59.231


In [41]:
# Shape
df.shape

(9101, 4)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9101 entries, 0 to 9100
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       9101 non-null   object 
 1   overview    9100 non-null   object 
 2   genres      9100 non-null   object 
 3   popularity  9101 non-null   float64
dtypes: float64(1), object(3)
memory usage: 284.5+ KB


#### 1. Remove Null Values

In [43]:
# Null Values
df.isnull().sum()

title         0
overview      1
genres        1
popularity    0
dtype: int64

In [44]:
# Remove Null Values
df = df.dropna()
df.isnull().sum()

title         0
overview      0
genres        0
popularity    0
dtype: int64

#### 2. Remove Duplicate Values

In [45]:
# Duplicate Values
df.duplicated().sum()

1

In [46]:
# Removed Duplicate
df = df.drop_duplicates()
df.duplicated().sum()

0

In [47]:
df['overview'][0]

'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.'

#### 3. Lowercasing

In [48]:
# Lowercasing: normalise the case of every text column.
for text_column in ('title', 'overview', 'genres'):
    df[text_column] = df[text_column].str.lower()

In [49]:
df['genres'][0]

'drama, crime'

In [50]:
# Removing Whitespace: trim leading/trailing whitespace in every text column.
for text_column in ('title', 'overview', 'genres'):
    df[text_column] = df[text_column].str.strip()

In [51]:
df['overview'][0]

'framed in the 1940s for the double murder of his wife and her lover, upstanding banker andy dufresne begins a new life at the shawshank prison, where he puts his accounting skills to work for an amoral warden. during his long stretch in prison, dufresne comes to be admired by the other inmates -- including an older prisoner named red -- for his integrity and unquenchable sense of hope.'

#### 4. Remove HTML Tags

In [52]:
# Remove HTML tags
# regex=True is spelled out on purpose: since pandas 2.0 Series.str.replace
# treats the pattern as a literal string by default, which would leave every
# tag in place because the literal text '<.*?>' never occurs.
df['title'] = df['title'].str.replace(r'<.*?>', '', regex=True)
df['overview'] = df['overview'].str.replace(r'<.*?>', '', regex=True)
df['genres'] = df['genres'].str.replace(r'<.*?>', '', regex=True)


#### 5. Remove URLs

In [53]:
# Remove URL
# regex=True is spelled out on purpose: since pandas 2.0 Series.str.replace
# treats the pattern as a literal string by default, so no URL would be removed.
# Raw strings keep `\S` and `\.` from being parsed as (invalid) Python escapes.
df['title'] = df['title'].str.replace(r'https?://\S+|www\.\S+', '', regex=True)
df['overview'] = df['overview'].str.replace(r'https?://\S+|www\.\S+', '', regex=True)
df['genres'] = df['genres'].str.replace(r'https?://\S+|www\.\S+', '', regex=True)

#### 6. Remove Punctuation

In [54]:
# Remove Punctuation
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [55]:
# Translation table that maps every ASCII punctuation character to None (i.e. deletes it).
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [56]:
# Delete every punctuation character from the three text columns.
for text_column in ('title', 'overview', 'genres'):
    df[text_column] = df[text_column].str.translate(translator)

#### 7. Chat Word Treatment (Expanding Contractions)

In [57]:
# Chat word treatment
# (pattern, replacement) pairs, applied in this order. Matching is case-sensitive,
# so each capitalisation that should be handled has its own entry. The
# "\x89Ûª" / "å«" variants are mis-encoded apostrophes.
_CONTRACTION_RULES = [
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"She's", "She is"),
    (r"there's", "there is"),
    (r"We're", "We are"),
    (r"That's", "That is"),
    (r"won't", "will not"),
    (r"they're", "they are"),
    (r"Can't", "Cannot"),
    (r"wasn't", "was not"),
    (r"don\x89Ûªt", "do not"),
    (r"aren't", "are not"),
    (r"isn't", "is not"),
    (r"What's", "What is"),
    (r"haven't", "have not"),
    (r"hasn't", "has not"),
    (r"There's", "There is"),
    (r"He's", "He is"),
    (r"It's", "It is"),
    (r"You're", "You are"),
    (r"I'M", "I am"),
    (r"shouldn't", "should not"),
    (r"wouldn't", "would not"),
    (r"i'm", "I am"),
    (r"I\x89Ûªm", "I am"),
    (r"I'm", "I am"),
    (r"Isn't", "Is not"),
    (r"Here's", "Here is"),
    (r"you've", "you have"),
    (r"you\x89Ûªve", "you have"),
    (r"we're", "we are"),
    (r"what's", "what is"),
    (r"couldn't", "could not"),
    (r"we've", "we have"),
    (r"it\x89Ûªs", "it is"),
    (r"doesn\x89Ûªt", "does not"),
    (r"It\x89Ûªs", "It is"),
    (r"Here\x89Ûªs", "Here is"),
    (r"who's", "who is"),
    (r"I\x89Ûªve", "I have"),
    (r"y'all", "you all"),
    (r"can\x89Ûªt", "cannot"),
    (r"would've", "would have"),
    (r"it'll", "it will"),
    (r"we'll", "we will"),
    (r"wouldn\x89Ûªt", "would not"),
    (r"We've", "We have"),
    (r"he'll", "he will"),
    (r"she'll", "she will"),
    (r"She'll", "She will"),
    (r"Y'all", "You all"),
    (r"Weren't", "Were not"),
    (r"Didn't", "Did not"),
    (r"they'll", "they will"),
    (r"they'd", "they would"),
    (r"DON'T", "DO NOT"),
    (r"That\x89Ûªs", "That is"),
    (r"they've", "they have"),
    (r"i'd", "I would"),
    (r"should've", "should have"),
    (r"You\x89Ûªre", "You are"),
    (r"where's", "where is"),
    (r"Don\x89Ûªt", "Do not"),
    (r"we'd", "we would"),
    (r"i'll", "I will"),
    (r"weren't", "were not"),
    (r"They're", "They are"),
    (r"Can\x89Ûªt", "Cannot"),
    (r"you\x89Ûªll", "you will"),
    (r"I\x89Ûªd", "I would"),
    (r"let's", "let us"),
    (r"it's", "it is"),
    (r"can't", "cannot"),
    (r"don't", "do not"),
    (r"you're", "you are"),
    (r"i've", "I have"),
    (r"that's", "that is"),
    (r"doesn't", "does not"),
    (r"didn't", "did not"),
    (r"ain't", "am not"),
    (r"you'll", "you will"),
    (r"I've", "I have"),
    (r"Don't", "Do not"),
    (r"I'll", "I will"),
    (r"I'd", "I would"),
    (r"Let's", "Let us"),
    (r"you'd", "you would"),
    (r"Ain't", "Am not"),
    (r"Haven't", "Have not"),
    (r"Could've", "Could have"),
    (r"youve", "you have"),
    (r"donå«t", "do not"),
]

# Compiled once. The \b anchors make a rule match whole words only, so a
# possessive such as "rabbit's" or "hamlet's" is no longer rewritten by the
# "it's" / "let's" rules. "she's" / "she'll" get their own entries because they
# used to be expanded only through the unanchored "he's" / "he'll" rules.
_CONTRACTION_PATTERNS = [
    (re.compile(r"\b" + pattern + r"\b"), replacement)
    for pattern, replacement in _CONTRACTION_RULES
]


def remove_abb(data):
    """Expand English contractions in ``data`` (e.g. "can't" -> "cannot").

    Args:
        data: The text to process.

    Returns:
        The text with every whole-word contraction listed in
        ``_CONTRACTION_RULES`` replaced by its expanded form; everything else
        is returned unchanged.
    """
    for pattern, replacement in _CONTRACTION_PATTERNS:
        data = pattern.sub(replacement, data)
    return data

In [58]:
# Expand contractions in the overview text.
# NOTE(review): by this point the text has already been lowercased (step 3) and
# stripped of punctuation, apostrophes included (step 6), so every remove_abb
# pattern that contains an ASCII apostrophe or an uppercase letter can no longer
# match. Consider running this step before lowercasing / punctuation removal.
df['overview'] = df['overview'].apply(remove_abb)

In [59]:
df.head()

Unnamed: 0,title,overview,genres,popularity
0,the shawshank redemption,framed in the 1940s for the double murder of h...,drama crime,146.944
1,the godfather,spanning the years 1945 to 1955 a chronicle of...,drama crime,121.87
2,the godfather part ii,in the continuing saga of the corleone crime f...,drama crime,82.27
3,schindlers list,the true story of how businessman oskar schind...,drama history war,66.67
4,12 angry men,the defense and the prosecution have rested an...,drama,59.231


#### 8. Spell Correction

In [60]:
# Spell Correction
from textblob import TextBlob

# Example of textblob
text = 'I want to et rce'
TextBlob(text).correct().string

'I want to et re'

In [61]:
def spelling_correction(text):
    """Return ``text`` after running it through TextBlob's spelling corrector."""
    blob = TextBlob(text)
    corrected_blob = blob.correct()
    return corrected_blob.string

In [62]:
# df['overview'] = df['overview'].apply(spelling_correction)

#### 9. Remove Stopwords

In [63]:
# Remove Stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [74]:
# Stop-word sets keyed by language, filled on first use.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the NLTK stop words for ``language`` as a set, loading them only once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def remove_stopwords(text, language='english'):
    """Drop stop words from ``text``.

    The stop-word list used to be rebuilt, and scanned as a list, for every
    single token; it is now loaded once per language and kept as a set.

    Args:
        text: The text to filter.
        language: Name of the NLTK stop-word list to use.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_set = _stopword_set(language)
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_set]
    return ' '.join(kept_tokens).strip()

In [78]:
# Filter stop words out of each of the three text columns.
for text_column in ('title', 'overview', 'genres'):
    df[text_column] = df[text_column].apply(remove_stopwords)

In [79]:
df['overview']

0       framed 1940s double murder wife lover upstandi...
1       spanning years 1945 1955 chronicle fictional i...
2       continuing saga corleone crime family young vi...
3       true story businessman oskar schindler saved t...
4       defense prosecution rested jury filing jury ro...
                              ...                        
9096    filmmaking team behind hits scary movie date m...
9097    year 3000 man match psychlos greedy manipulati...
9098    set island coast techno rave party attracts di...
9099    18th birthday goku receives mystical dragonbal...
9100    platoon eagles vultures attacks residents smal...
Name: overview, Length: 9098, dtype: object

#### 10. Handling Emojis

In [14]:
import emoji

text = "I love Python! 😊🐍 #programming"

# Extract emojis
emojis = [c for c in text if c in emoji.EMOJI_DATA]
print(emojis)


['😊', '🐍']


In [15]:
# Remove Emojis
clean_text = ''.join([c for c in text if c not in emoji.EMOJI_DATA])
print(clean_text)

I love Python!  #programming


In [16]:
# Replace emojis with textual representations
text_with_replacements = emoji.demojize(text)
text_with_replacements

'I love Python! :smiling_face_with_smiling_eyes::snake: #programming'

In [28]:
# Apply on dataFrame
df_emoji = pd.read_csv(os.path.join('dataset','Emoji','Movies_to_Emojis.csv'))
df_emoji.head()

Unnamed: 0,Movies,Emojis
0,Titanic,⛴️🌊❄️💔👫
1,The Lion King,🦁👑🎶
2,Avatar,🌍🌳💙👽
3,The Dark Knight,🌑🦇🌆🃏
4,The Shawshank Redemption,🗝️🤝👨⚖️


In [29]:
df_emoji['Emoji to text'] = df_emoji['Emojis'].apply(emoji.demojize)
df_emoji.head()

Unnamed: 0,Movies,Emojis,Emoji to text
0,Titanic,⛴️🌊❄️💔👫,:ferry::water_wave::snowflake::broken_heart::w...
1,The Lion King,🦁👑🎶,:lion::crown::musical_notes:
2,Avatar,🌍🌳💙👽,:globe_showing_Europe-Africa::deciduous_tree::...
3,The Dark Knight,🌑🦇🌆🃏,:new_moon::bat::cityscape_at_dusk::joker:
4,The Shawshank Redemption,🗝️🤝👨⚖️,:old_key::handshake::man::balance_scale:


#### 11. Tokenization

In [31]:
# NLTK Tokenization
from nltk.tokenize import word_tokenize
word_tokenize(text)

['I', 'love', 'Python', '!', '😊🐍', '#', 'programming']

In [33]:
# Spacy Tokenization
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(text) 
tokens = [token.text for token in doc]
print(tokens)

['I', 'love', 'Python', '!', '😊', '🐍', '#', 'programming']


In [35]:
# Sentence-wise tokenization
from nltk.tokenize import sent_tokenize

text = "Tokenization is important. It helps break text into smaller units. Sentences are a common choice for tokenization in NLP."
sentences = sent_tokenize(text)
sentence_tokens = [word_tokenize(sentence) for sentence in sentences]

print(sentences)
print(sentence_tokens)

['Tokenization is important.', 'It helps break text into smaller units.', 'Sentences are a common choice for tokenization in NLP.']
[['Tokenization', 'is', 'important', '.'], ['It', 'helps', 'break', 'text', 'into', 'smaller', 'units', '.'], ['Sentences', 'are', 'a', 'common', 'choice', 'for', 'tokenization', 'in', 'NLP', '.']]


#### 12. Stemming

In [38]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Demo sentence for the Porter stemmer.
text = "Stemming is a process of reducing words to their base form."

porter_stemmer = PorterStemmer()
words = word_tokenize(text)

# Stem every token, punctuation included.
stemmed_words = list(map(porter_stemmer.stem, words))

print('Original words', words)
print('Stemmed words', stemmed_words)

Original words ['Stemming', 'is', 'a', 'process', 'of', 'reducing', 'words', 'to', 'their', 'base', 'form', '.']
Stemmed words ['stem', 'is', 'a', 'process', 'of', 'reduc', 'word', 'to', 'their', 'base', 'form', '.']


#### 13. Lemmatization

In [43]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Demo sentence for the WordNet lemmatizer.
text = "Lemmatization is a process of reducing words to their base form."

words = word_tokenize(text)
lemmatizer = WordNetLemmatizer()

# Every token is lemmatized with pos='v', i.e. looked up as a verb.
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]

print('Original words', words)
# Label corrected: this cell prints lemmas, not stems.
print('Lemmatized words', lemmatized_words)

Original words ['Lemmatization', 'is', 'a', 'process', 'of', 'reducing', 'words', 'to', 'their', 'base', 'form', '.']
Stemmed words ['Lemmatization', 'be', 'a', 'process', 'of', 'reduce', 'word', 'to', 'their', 'base', 'form', '.']
