In [1]:
# importing all the necessary libraries

import pandas as pd
import ast
import numpy as np

## Preprocessing movies_df dataset

In [2]:
# Reading the csv
movies_df = pd.read_csv('Resources/tmdb_5000_movies.csv')
movies_df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [3]:
#Exploring the dataset
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

# Languages

In [4]:
movies_df['original_language'].unique()

array(['en', 'ja', 'fr', 'zh', 'es', 'de', 'hi', 'ru', 'ko', 'te', 'cn',
       'it', 'nl', 'ta', 'sv', 'th', 'da', 'xx', 'hu', 'cs', 'pt', 'is',
       'tr', 'nb', 'af', 'pl', 'he', 'ar', 'vi', 'ky', 'id', 'ro', 'fa',
       'no', 'sl', 'ps', 'el'], dtype=object)

In [5]:
language_mapping = {
    'en': 'English', 'ja': 'Japanese', 'fr': 'French', 'zh': 'Chinese',
    'es': 'Spanish', 'de': 'German', 'hi': 'Hindi', 'ru': 'Russian',
    'ko': 'Korean', 'te': 'Telugu', 'cn': 'Chinese', 'it': 'Italian',
    'nl': 'Dutch', 'ta': 'Tamil', 'sv': 'Swedish', 'th': 'Thai',
    'da': 'Danish', 'xx': 'Unknown', 'hu': 'Hungarian', 'cs': 'Czech',
    'pt': 'Portuguese', 'is': 'Icelandic', 'tr': 'Turkish', 'nb': 'Norwegian',
    'af': 'Afrikaans', 'pl': 'Polish', 'he': 'Hebrew', 'ar': 'Arabic',
    'vi': 'Vietnamese', 'ky': 'Kyrgyz', 'id': 'Indonesian', 'ro': 'Romanian',
    'fa': 'Farsi', 'no': 'Norwegian', 'sl': 'Slovenian', 'ps': 'Pashto',
    'el': 'Greek'
}

In [6]:
movies_df['Language'] = movies_df['original_language'].map(language_mapping)

# Extracting Release Year

In [7]:
movies_df['Year'] = movies_df['release_date'].str[:4]

# Extracting Release Month

In [8]:
movies_df['Month'] = movies_df['release_date'].str[5:7]

# Binning Runtime

In [9]:
bins = [0, 75, 120, 180, float('inf')]

bin_labels = ['Short', 'Medium', 'Long', 'Very Long']

In [10]:
movies_df['Length'] = pd.cut(movies_df['runtime'], bins=bins, labels=bin_labels)

# Genres Extraction

In [11]:
# Using lamba function to change cell data into python objects
movies_df['genres'] = movies_df['genres'].map(lambda x: ast.literal_eval(x))

In [12]:
# Extracting the genres from the list of dictionaries using for loops

def genresList(x):
    genre = []
   
    st = " "
    for i in x:
        if i.get('name') == 'Science Fiction':
            scifi = 'Sci-Fi'
            genre.append(scifi)
        else:
            genre.append(i.get('name'))
    if genre == []:
        return np.NaN
    else:
        return (st.join(genre))


In [13]:
movies_df['genres_list'] = movies_df['genres'].map(lambda x: genresList(x))
movies_df['genres_list']

0       Action Adventure Fantasy Sci-Fi
1              Adventure Fantasy Action
2                Action Adventure Crime
3           Action Crime Drama Thriller
4               Action Adventure Sci-Fi
                     ...               
4798              Action Crime Thriller
4799                     Comedy Romance
4800      Comedy Drama Romance TV Movie
4801                                NaN
4802                        Documentary
Name: genres_list, Length: 4803, dtype: object

# Production Countries Extraction 

In [14]:
# Extracting Countries
def production_countries1(x):
    prod_countries = []
    x = ast.literal_eval(x)  # Convert the string to a dictionary
    for i in x:
        prod_countries.append(i.get('name'))
    if prod_countries == []:
        return np.NaN
    else:
        return ",".join(prod_countries)

movies_df['Production Countries'] = movies_df['production_countries'].map(lambda x: production_countries1(x))

In [15]:
movies_df['Production Countries']

0       United States of America,United Kingdom
1                      United States of America
2       United Kingdom,United States of America
3                      United States of America
4                      United States of America
                         ...                   
4798            Mexico,United States of America
4799                                        NaN
4800                   United States of America
4801             United States of America,China
4802                   United States of America
Name: Production Countries, Length: 4803, dtype: object

In [16]:
clean_df = pd.DataFrame(movies_df[['id', 'title', 'vote_average', 'vote_count', 'revenue', 'genres_list', 
                                   'Month', 'Year', 'Production Countries', 'Language', 'Length']])

clean_df.head()

Unnamed: 0,id,title,vote_average,vote_count,revenue,genres_list,Month,Year,Production Countries,Language,Length
0,19995,Avatar,7.2,11800,2787965087,Action Adventure Fantasy Sci-Fi,12,2009,"United States of America,United Kingdom",English,Long
1,285,Pirates of the Caribbean: At World's End,6.9,4500,961000000,Adventure Fantasy Action,5,2007,United States of America,English,Long
2,206647,Spectre,6.3,4466,880674609,Action Adventure Crime,10,2015,"United Kingdom,United States of America",English,Long
3,49026,The Dark Knight Rises,7.6,9106,1084939099,Action Crime Drama Thriller,7,2012,United States of America,English,Long
4,49529,John Carter,6.1,2124,284139100,Action Adventure Sci-Fi,3,2012,United States of America,English,Long


# Creating Dummies for Country

In [17]:
country_dummies = clean_df['Production Countries'].str.get_dummies(sep=',')
country_dummies.head(1)

Unnamed: 0,Afghanistan,Algeria,Angola,Argentina,Aruba,Australia,Austria,Bahamas,Belgium,Bhutan,...,Sweden,Switzerland,Taiwan,Thailand,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States of America
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [18]:
country_df = pd.concat([clean_df, country_dummies], axis=1)
country_df.drop(columns='Production Countries', inplace=True)
country_df.head(1)

Unnamed: 0,id,title,vote_average,vote_count,revenue,genres_list,Month,Year,Language,Length,...,Sweden,Switzerland,Taiwan,Thailand,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States of America
0,19995,Avatar,7.2,11800,2787965087,Action Adventure Fantasy Sci-Fi,12,2009,English,Long,...,0,0,0,0,0,0,0,0,1,1


# Creating Dummies for Genre

In [19]:
genres_dummies = country_df['genres_list'].str.get_dummies(sep=' ')
genres_dummies.head(1)

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,...,Horror,Movie,Music,Mystery,Romance,Sci-Fi,TV,Thriller,War,Western
0,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [20]:
genre_df = pd.concat([country_df, genres_dummies], axis=1)
genre_df.drop(columns = ['genres_list'],inplace=True)
genre_df.rename(columns={'vote_average': 'Average Rating', 'vote_count':'Vote Count', 'revenue': 'Revenue','title':'Movie', 'overview': 'Description'},inplace=True)
genre_df.head(1)

Unnamed: 0,id,Movie,Average Rating,Vote Count,Revenue,Month,Year,Language,Length,Afghanistan,...,Horror,Movie.1,Music,Mystery,Romance,Sci-Fi,TV,Thriller,War,Western
0,19995,Avatar,7.2,11800,2787965087,12,2009,English,Long,0,...,0,0,0,0,0,1,0,0,0,0


# Creating Dummies for Language

In [21]:
lang_dummies = pd.get_dummies(genre_df['Language']).astype('int')
lang_dummies.head(1)

Unnamed: 0,Afrikaans,Arabic,Chinese,Czech,Danish,Dutch,English,Farsi,French,German,...,Russian,Slovenian,Spanish,Swedish,Tamil,Telugu,Thai,Turkish,Unknown,Vietnamese
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
lang_df = pd.concat([genre_df,lang_dummies], axis=1)
lang_df.drop(columns='Language',inplace=True)
lang_df.head(1)


Unnamed: 0,id,Movie,Average Rating,Vote Count,Revenue,Month,Year,Length,Afghanistan,Algeria,...,Russian,Slovenian,Spanish,Swedish,Tamil,Telugu,Thai,Turkish,Unknown,Vietnamese
0,19995,Avatar,7.2,11800,2787965087,12,2009,Long,0,0,...,0,0,0,0,0,0,0,0,0,0


# Creating Dummies for Length

In [23]:
length_dummies = pd.get_dummies(lang_df['Length']).astype('int')
length_dummies.head(1)

Unnamed: 0,Short,Medium,Long,Very Long
0,0,0,1,0


In [24]:
length_df =pd.concat([lang_df,length_dummies], axis=1)
length_df.drop(columns='Length', inplace=True)
length_df.head(1)

Unnamed: 0,id,Movie,Average Rating,Vote Count,Revenue,Month,Year,Afghanistan,Algeria,Angola,...,Tamil,Telugu,Thai,Turkish,Unknown,Vietnamese,Short,Medium,Long,Very Long
0,19995,Avatar,7.2,11800,2787965087,12,2009,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [27]:
dummy_df = length_df.copy()
dummy_df = dummy_df.rename(columns={'id':'movie_id'})
dummy_df.head()

Unnamed: 0,movie_id,Movie,Average Rating,Vote Count,Revenue,Month,Year,Afghanistan,Algeria,Angola,...,Tamil,Telugu,Thai,Turkish,Unknown,Vietnamese,Short,Medium,Long,Very Long
0,19995,Avatar,7.2,11800,2787965087,12,2009,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,285,Pirates of the Caribbean: At World's End,6.9,4500,961000000,5,2007,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,206647,Spectre,6.3,4466,880674609,10,2015,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,49026,The Dark Knight Rises,7.6,9106,1084939099,7,2012,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,49529,John Carter,6.1,2124,284139100,3,2012,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [28]:
dummy_df.to_csv('Resources/dummy_moviedata_df.csv')