<a href="https://colab.research.google.com/github/torquerxf/campusx-nlp-follow/blob/main/multiclass_classification_textpreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
1. Create data
2. Text-preprocessing

data:
name | description | genre
--- | --- | --- |

create data using the IMDb top-250 movies endpoint (imdb236 on RapidAPI)
"""

In [None]:
import os

import requests

url = "https://imdb236.p.rapidapi.com/api/imdb/top250-movies"

# Read the RapidAPI key from the environment rather than committing it with
# the notebook. The key that used to be hardcoded here has been published
# and should be rotated.
api_key = os.environ.get("RAPIDAPI_KEY")
if not api_key:
    raise RuntimeError(
        "Set the RAPIDAPI_KEY environment variable to your RapidAPI key "
        "before running this cell."
    )

headers = {
    "x-rapidapi-key": api_key,
    "x-rapidapi-host": "imdb236.p.rapidapi.com"
}

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# failure when the DataFrame is built later.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

print(response.json())

[{'id': 'tt0111161', 'url': 'https://www.imdb.com/title/tt0111161/', 'primaryTitle': 'The Shawshank Redemption', 'originalTitle': 'The Shawshank Redemption', 'type': 'movie', 'description': 'A banker convicted of uxoricide forms a friendship over a quarter century with a hardened convict, while maintaining his innocence and trying to remain hopeful through simple compassion.', 'primaryImage': 'https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@.jpg', 'thumbnails': [{'url': 'https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_QL75_UX100_CR0,0,100,148_.jpg', 'width': 100, 'height': 148}, {'url': 'https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_QL75_UX280_CR0,0,280,414_.jpg', 'width': 280, 'height': 414}, {'url': 'https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_QL75_UX380_CR0,

In [None]:
import pandas as pd

In [None]:
# Keep only the three fields used downstream: the title, the free-text
# description (model input) and the genre list (labels).
columns_to_keep = ['primaryTitle', 'description', 'genres']
df = pd.DataFrame(response.json())[columns_to_keep]

In [None]:
# Sanity check: (rows, columns) of the frame built from the API response.
df.shape

(250, 3)

In [None]:
# Preview the first rows: title, raw description text and genre list.
df.head()

Unnamed: 0,primaryTitle,description,genres
0,The Shawshank Redemption,A banker convicted of uxoricide forms a friend...,[Drama]
1,The Godfather,The aging patriarch of an organized crime dyna...,"[Crime, Drama]"
2,The Dark Knight,When a menace known as the Joker wreaks havoc ...,"[Action, Crime, Drama]"
3,The Godfather Part II,The early life and career of Vito Corleone in ...,"[Crime, Drama]"
4,12 Angry Men,The jury in a New York City murder trial is fr...,"[Crime, Drama]"


In [None]:
# lower case the text
# Start the preprocessing chain from the raw description column; the result
# is kept in a separate Series so `df` itself is not modified at this point.
processed_description = df['description'].str.lower()

In [None]:
# remove punctuations
import string
# Translation table mapping every character in string.punctuation to None,
# so that translate() deletes those characters from each description.
translator = str.maketrans(dict.fromkeys(string.punctuation))
processed_description = processed_description.str.translate(translator)

In [None]:
# spelling checker
from textblob import TextBlob

def correct_spelling(text):
  """Return `text` after running TextBlob's spelling correction on it.

  NOTE(review): the corrector also rewrites valid words and proper nouns.
  In this dataset "aging" became "raging", "joker" became "joke" and
  "vito" became "veto". Check whether this step helps before relying on it.
  """
  return str(TextBlob(text).correct())

processed_description = processed_description.apply(correct_spelling)

In [None]:
# removing stop words
import nltk
# Download the NLTK stop-word corpus into the local nltk_data directory so
# that stopwords.words('english') can be loaded below.
nltk.download('stopwords')
from nltk.corpus import stopwords

_ENGLISH_STOPWORDS = None  # filled on first use so the corpus is read only once


def remove_stopwords(text, stop_words=None):
  """Return `text` with stop words removed.

  The text is split on whitespace, every word found in the stop-word set is
  dropped, and the remaining words are joined with single spaces. Dropping
  the words (instead of replacing them with empty strings) avoids runs of
  spaces in the result, which would otherwise become whitespace tokens in a
  later tokenization step.

  Args:
    text: The string to filter.
    stop_words: Optional collection of words to remove. When omitted, the
      NLTK English stop-word list is used; it is loaded once and cached.

  Returns:
    The filtered string; empty if every word was a stop word.
  """
  global _ENGLISH_STOPWORDS
  if stop_words is None:
    if _ENGLISH_STOPWORDS is None:
      # A frozenset gives constant-time membership tests and avoids
      # re-reading the corpus for every single word.
      _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    stop_words = _ENGLISH_STOPWORDS
  return " ".join(word for word in text.split() if word not in stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Filter stop words out of every description; the function is passed
# directly since it already takes a single string argument.
processed_description = processed_description.apply(remove_stopwords)

In [None]:
# tokenization using spacy
import spacy
# Load the small English pipeline once at module level; tokenization()
# below reuses this `nlp` object for every description.
nlp = spacy.load('en_core_web_sm')

def tokenization(text):
  """Split `text` into a list of token strings using the spaCy pipeline `nlp`.

  Whitespace-only tokens are dropped. Runs of extra spaces in the input come
  back from spaCy as tokens of their own, and they carry no information for
  the later stemming and lemmatization steps.

  Args:
    text: The string to tokenize.

  Returns:
    A list with the text of every non-whitespace token, in document order.
  """
  doc = nlp(text)
  return [token.text for token in doc if not token.is_space]

In [None]:
# Tokenize every description; from here on each element of the Series is a
# list of token strings rather than a single string.
processed_description = processed_description.apply(tokenization)

In [None]:
# Inspect the tokenized descriptions.
processed_description

Unnamed: 0,description
0,"[ , banker, convicted, , uxoricide, forms, ,..."
1,"[ , raging, patriarch, , organized, crime, d..."
2,"[ , menace, known, , joke, breaks, havoc, ..."
3,"[ , early, life, , career, , veto, corleone,..."
4,"[ , jury, , new, york, city, murder, trial, ..."
...,...
245,"[ , oklahoma, family, driven, , farm, , po..."
246,"[ , german, occupation, , poland, , acting,..."
247,"[ , clash, , sultan, , said, khan, leads, ..."
248,"[desperate, measures, , taken, , man, , tr..."


In [None]:
# stemming
from nltk.stem import PorterStemmer

# Single shared stemmer instance used by stem_text() below.
stemmer = PorterStemmer()
def stem_text(text):
  """Stem every token in `text` with the module-level `stemmer`.

  Args:
    text: An iterable of token strings.

  Returns:
    A new list holding the stem of each token, in the original order.
  """
  stems = []
  for token in text:
    stems.append(stemmer.stem(token))
  return stems

In [None]:
# Replace each token list with the list of its stems.
processed_description = processed_description.apply(stem_text)

In [None]:
# Inspect the stemmed tokens of the first description.
processed_description[0]

[' ',
 'banker',
 'convict',
 ' ',
 'uxoricid',
 'form',
 ' ',
 'friendship',
 '  ',
 'quarter',
 'centuri',
 '  ',
 'harden',
 'convict',
 ' ',
 'maintain',
 ' ',
 'innoc',
 ' ',
 'tri',
 ' ',
 'remain',
 'hope',
 ' ',
 'simpl',
 'compass']

In [None]:
# lemmatization
from nltk.stem import WordNetLemmatizer
# Download the WordNet data that WordNetLemmatizer relies on.
nltk.download('wordnet')
# Single shared lemmatizer instance used by lemmatize() below.
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
  """Lemmatize every token in `text` with the module-level `lemmatizer`.

  Each token is looked up with pos='n', i.e. it is treated as a noun.

  Args:
    text: An iterable of token strings.

  Returns:
    A new list holding the lemma of each token, in the original order.
  """
  lemmas = []
  for token in text:
    lemmas.append(lemmatizer.lemmatize(token, pos='n'))
  return lemmas

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Lemmatize every token list.
# NOTE(review): the tokens reaching this step are already Porter stems
# (e.g. 'centuri', 'simpl'), and the displayed results show them unchanged
# afterwards. Consider lemmatizing instead of stemming rather than after it.
processed_description = processed_description.apply(lemmatize)

In [None]:
# Store the processed token lists back on the frame.
# NOTE(review): this overwrites the raw text in df['description'], which the
# lower-casing cell reads. Re-running that cell after this one would then
# operate on token lists instead of the original strings.
df['description'] = processed_description

In [None]:
# Show the final frame: title, processed description tokens and genres.
df

Unnamed: 0,primaryTitle,description,genres
0,The Shawshank Redemption,"[ , banker, convict, , uxoricid, form, , fri...",[Drama]
1,The Godfather,"[ , rage, patriarch, , organ, crime, dynasti...","[Crime, Drama]"
2,The Dark Knight,"[ , menac, known, , joke, break, havoc, , ...","[Action, Crime, Drama]"
3,The Godfather Part II,"[ , earli, life, , career, , veto, corleon, ...","[Crime, Drama]"
4,12 Angry Men,"[ , juri, , new, york, citi, murder, trial, ...","[Crime, Drama]"
...,...,...,...
245,The Grapes of Wrath,"[ , oklahoma, famili, driven, , farm, , po...",[Drama]
246,To Be or Not to Be,"[ , german, occup, , poland, , act, troup, ...","[Comedy, Romance, War]"
247,Gangs of Wasseypur,"[ , clash, , sultan, , said, khan, lead, ,...","[Action, Comedy, Crime]"
248,Drishyam,"[desper, measur, , taken, , man, , tri, ,...","[Crime, Drama, Mystery]"
