# Data Preprocessing For Movie Recommendation System Model

# Importing Libraries

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import ast
import pickle

# Load Dataset

In [2]:
# read dataset
movies = pd.read_csv("files/tmdb_movies.csv")
credits = pd.read_csv("files/tmdb_credits.csv")

In [3]:
# verifying data load
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
# verifying data load
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## Data Load Confirmation
Get a quick sense of data volume by ensuring dataset were loaded correctly

In [5]:
# dataset shape
print("Shape of the Movie Dataset:", movies.shape)
print("Shape of the Credits Dataset:", credits.shape)

Shape of the Movie Dataset: (4803, 20)
Shape of the Credits Dataset: (4803, 4)


In [6]:
# Datasets Validation Check
assert movies.shape[0] == credits.shape[0], "Row counts do not match!"

## Merging Datasets
Combining relevant details into a single table for easier exploration and analysis.

In [7]:
# merges movies and credits where values in the "title" column match.
df = movies.merge(credits, on='title')

In [8]:
df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [9]:
print("Shape of the Merged Dataset:", df.shape)

Shape of the Merged Dataset: (4809, 23)


# Sanity Check
- Remove Unnecessary Columns
- Formatting Columns Name
- Remove Duplicates
- Remove Nulls
- Remove Unwanted Data From Columns

In [10]:
# check columns
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

## Remove Unuseful Columns
Filters the DataFrame to retain only the essential columns needed for further analysis and discarding unnecessary data so that we can keep our dataset focused and manageable.

**Columns to keep:** **`genres`**, **`id`**, **`keywords`**, **`title`**, **`overview`**, **`release_date`**, **`cast`**, **`crew`**
rest of the columns are no longer needed to make recommendation

In [11]:
# keeping essential columns
df = df[["genres", "id", "keywords", "title", "overview", "release_date", "cast", "crew"]].copy()

In [12]:
# check new dataset
df.head(2)

Unnamed: 0,genres,id,keywords,title,overview,release_date,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",2007-05-19,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## Renaming and Formatting Column Names
Cleaning and formatting column names to make them more readable and consistent.

To clean and standardize column names in the DataFrame by:
   - Removing extra whitespace
   - Replacing underscores with spaces
   - Capitalizing each word

In [13]:
# check columns
df.columns

Index(['genres', 'id', 'keywords', 'title', 'overview', 'release_date', 'cast',
       'crew'],
      dtype='object')

In [14]:
# formatting column names
df.columns = df.columns.str.strip().str.replace("_", " ").str.title()
df.columns

Index(['Genres', 'Id', 'Keywords', 'Title', 'Overview', 'Release Date', 'Cast',
       'Crew'],
      dtype='object')

In [15]:
# standardize column name
df.rename(columns={"Id": "ID"}, inplace=True)
df.columns

Index(['Genres', 'ID', 'Keywords', 'Title', 'Overview', 'Release Date', 'Cast',
       'Crew'],
      dtype='object')

## Remove Duplicates and Nulls
- Duplicate rows can skew analysis and statistical summaries. They may result from data collection or merging errors.
    - **duplicated().sum():** identify the number of completely duplicate rows — where all column values in a row are identical to another row. <br/><br/>
- Missing values can break algorithms or cause misleading analysis.
    - **isnull().sum():** identify how many missing (null/NaN) values exist in each column of the DataFrame.

In [16]:
# checking for duplicate rows
df.duplicated().sum()

0

In [17]:
# check for missing values
df.isnull().sum()

Genres          0
ID              0
Keywords        0
Title           0
Overview        3
Release Date    1
Cast            0
Crew            0
dtype: int64

**Note:** Since we do not have mass number null rows, we can remove them to ensure the dataset is fully complete

In [18]:
# remove all rows from the DataFrame that contain any missing values (NaN)
df.dropna(inplace=True)

In [19]:
# checking update
df.isnull().sum()

Genres          0
ID              0
Keywords        0
Title           0
Overview        0
Release Date    0
Cast            0
Crew            0
dtype: int64

## Remove Unwanted Data From Columns and Retrieve the Necessaries only
It’s important to simplify and clean column contents so they contain only the data that’s relevant for analysis or modeling. This helps reduce complexity and focus only on features that matter.

In [20]:
# checking dataset
df.head(1)

Unnamed: 0,Genres,ID,Keywords,Title,Overview,Release Date,Cast,Crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Check Contents

In [21]:
# genres values
print("Genres Data:")
for i in range(5):
    print(df['Genres'].values[i])
    print()

Genres Data:
[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]

[{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}]

[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 80, "name": "Crime"}]

[{"id": 28, "name": "Action"}, {"id": 80, "name": "Crime"}, {"id": 18, "name": "Drama"}, {"id": 53, "name": "Thriller"}]

[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 878, "name": "Science Fiction"}]



In [22]:
# keywords values
print("KeyWords Data:")
for i in range(5):
    print(df['Keywords'].values[i])
    print()

KeyWords Data:
[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]

[{"id": 270, "name": "ocean"}, {"id": 726, "name": "drug abuse"}, {"id": 911, "name": "exotic island"}, {"id": 1319, "name": "east india trading company"}, {"id": 2038, "name": "love of one's life"}, {"id": 2052, "name": "traitor"}, {"id": 2580, "n

In [23]:
# cast values
print("Cast Data:")
for i in range(5):
    print(df['Cast'].values[i])
    print()

Cast Data:
[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", 

In [24]:
# crew values
print("Crew Data:")
for i in range(5):
    print(df['Crew'].values[i])
    print()

Crew Data:
[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James 

### Convert Complex String Into Clean List and Extract Necessary Values

In [25]:
# function to extract corresponded genres from the columns
def extract_names(data):
    names = []
    for d in ast.literal_eval(data):
        names.append(d['name'])
    
    return names

In [26]:
# extract names from genres and replace it with the older values
df['Genres'] = df['Genres'].apply(extract_names)
df['Genres'].head(2)

0    [Action, Adventure, Fantasy, Science Fiction]
1                     [Adventure, Fantasy, Action]
Name: Genres, dtype: object

In [27]:
# extract names from keywords and replace it with the older values
df['Keywords'] = df['Keywords'].apply(extract_names)
df['Keywords'].head(2)

0    [culture clash, future, space war, space colon...
1    [ocean, drug abuse, exotic island, east india ...
Name: Keywords, dtype: object

In [28]:
# function extract corresponded actual names of the main actors
def extract_characters(data):
    names = []
    count_actor = 0
    for d in ast.literal_eval(data):
        if count_actor != 3:
            names.append(d['name'])
            count_actor += 1
        
        else:
            break
    
    return names

In [29]:
# extract names from cast and replace it with the older values
df['Cast'] = df['Cast'].apply(extract_characters)
df['Cast'].head(2)

0    [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1       [Johnny Depp, Orlando Bloom, Keira Knightley]
Name: Cast, dtype: object

In [30]:
# function to extract directors
def extract_directors(data):
    names = []
    for director in ast.literal_eval(data):
        if director['job'] == 'Director':
            names.append(director['name'])
            break
    
    return names

In [31]:
# extract directors names from crew and replace it with the older values
df['Crew'] = df['Crew'].apply(extract_directors)
df['Crew'].head(2)

0     [James Cameron]
1    [Gore Verbinski]
Name: Crew, dtype: object

In [32]:
# dataset after extracting necessary value
df.head()

Unnamed: 0,Genres,ID,Keywords,Title,Overview,Release Date,Cast,Crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",2007-05-19,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,2015-10-26,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,"[Action, Crime, Drama, Thriller]",49026,"[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,2012-07-16,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,"[Action, Adventure, Science Fiction]",49529,"[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...",2012-03-07,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


## Data Transformation

In [33]:
# converting each movie's overview from a full text string into a list of individual words
df['Overview Token'] = df['Overview'].apply(lambda x: x.split())

In [34]:
df.head(2)

Unnamed: 0,Genres,ID,Keywords,Title,Overview,Release Date,Cast,Crew,Overview Token
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",2007-05-19,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[Captain, Barbossa,, long, believed, to, be, d..."


### Removing Spaces To Clarify The Clarify The Tags

In [35]:
cols = ["Keywords", "Genres", "Cast", "Crew"]

for item in cols:
    df[item] = df[item].apply(lambda data: [name.replace(" ", "") for name in data])

df.head(2)

Unnamed: 0,Genres,ID,Keywords,Title,Overview,Release Date,Cast,Crew,Overview Token
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,"[Adventure, Fantasy, Action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",2007-05-19,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."


### Creating New Tag Column

In [36]:
# creating new tag column
df["Tags"] = df['Overview Token'] + df['Keywords'] + df['Genres'] + df['Cast'] + df['Crew']

df.head(2)

Unnamed: 0,Genres,ID,Keywords,Title,Overview,Release Date,Cast,Crew,Overview Token,Tags
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin...","[In, the, 22nd, century,, a, paraplegic, Marin..."
1,"[Adventure, Fantasy, Action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",2007-05-19,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d...","[Captain, Barbossa,, long, believed, to, be, d..."


### Keeping Final Columns
**Columns to keep:** **`ID`**, **`Title`**, **`Overview`**, **`Release Date`**, **`Tags`**

In [37]:
new_df = df.drop(['Genres', 'Keywords', 'Overview Token', 'Cast', 'Crew'], axis = 1)
new_df.head()

Unnamed: 0,ID,Title,Overview,Release Date,Tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",2007-05-19,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,2015-10-26,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,2012-07-16,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...",2012-03-07,"[John, Carter, is, a, war-weary,, former, mili..."


In [38]:
# converting the tag's list into string for effective prediction
new_df['Tags'] = new_df['Tags'].apply(lambda data: " ".join(data).lower())
new_df.head()

Unnamed: 0,ID,Title,Overview,Release Date,Tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",2007-05-19,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,2015-10-26,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,2012-07-16,following the death of district attorney harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca...",2012-03-07,"john carter is a war-weary, former military ca..."


In [39]:
# checking final data to create tags
new_df['Tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d action adventure fantasy sciencefiction samworthington zoesaldana sigourneyweaver jamescameron'

## Store the preproecessed data for further operations

In [40]:
# store as csv
new_df.to_csv("files/movie_cleaned.csv", index=False)

> # This dataset is prepared by MD. TUSHAR SHIHAB, Dept. of CSE