# ENSF 444 Project Group 31



### Step 0: Import Libraries

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
import warnings
warnings.filterwarnings('ignore') #ignoring some deprication warnings

### Step 1: Data Input

The data used for this task is the TMDB 5000 Movie Dataset on Kaggle

In [27]:
data = pd.read_csv('tmdb_5000_movies.csv')
data.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995.0,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965000.0,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800.0
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285.0,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000.0,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500.0
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647.0,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674600.0,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466.0
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026.0,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939000.0,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106.0
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529.0,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100.0,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124.0


### Step 2: Data Processing

In [28]:
print(data.isnull().sum()) #checking for null values in the dataset

budget                     0
genres                     9
homepage                3103
id                         9
keywords                   9
original_language          9
original_title             9
overview                  12
popularity                 9
production_companies       9
production_countries       9
release_date              10
revenue                    9
runtime                   11
spoken_languages           9
status                     9
tagline                  854
title                      9
vote_average               9
vote_count                 9
dtype: int64


In [29]:
# Columns with minimal missing values (<20) that we want to keep:
# For numerical columns, fill with median (less sensitive to outliers)
data['runtime'].fillna(data['runtime'].median(), inplace=True)
data['release_date'].fillna(data['release_date'].mode()[0], inplace=True)  # Fill with most common date

# For text columns, fill with empty string
data['overview'].fillna('', inplace=True)

# Drop columns with many missing values or not relevant for popularity prediction
data.drop(['homepage', 'tagline', 'keywords', 'production_companies', 
           'production_countries', 'spoken_languages'], axis=1, inplace=True)

# Drop remaining rows with missing values
data.dropna(inplace=True)

# Verify no more missing values
print("\nMissing values after cleaning:")
print(data.isnull().sum())


Missing values after cleaning:
budget               0
genres               0
id                   0
original_language    0
original_title       0
overview             0
popularity           0
release_date         0
revenue              0
runtime              0
status               0
title                0
vote_average         0
vote_count           0
dtype: int64


#### Parse Data for JSON Columns

In [30]:
import ast
def parse_json(json_str):
    try:
        return [item['name'] for item in ast.literal_eval(json_str)]
    except:
        return []

data['genres_list'] = data['genres'].apply(parse_json)

#### Parse JSON columns

In [31]:
import ast

def parse_json(json_str):
    try:
        return [item['name'] for item in ast.literal_eval(json_str)]
    except:
        return []

data['genres_list'] = data['genres'].apply(parse_json)

# Create binary columns for top genres
top_genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Drama', 
              'Horror', 'Romance', 'Science Fiction', 'Thriller']
for genre in top_genres:
    data[genre] = data['genres_list'].apply(lambda x: 1 if genre in x else 0)

# Convert release_date to datetime and extract year/month features
data['release_date'] = pd.to_datetime(data['release_date'])
data['release_year'] = data['release_date'].dt.year
data['release_month'] = data['release_date'].dt.month

# Select features
features = ['budget', 'runtime', 'release_year', 'release_month'] + top_genres
target = 'popularity'
data.head()

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,release_date,revenue,runtime,...,Adventure,Animation,Comedy,Drama,Horror,Romance,Science Fiction,Thriller,release_year,release_month
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995.0,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,2009-12-10,2787965000.0,162.0,...,1,0,0,0,0,0,1,0,2009,12
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285.0,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,2007-05-19,961000000.0,169.0,...,1,0,0,0,0,0,0,0,2007,5
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647.0,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,2015-10-26,880674600.0,148.0,...,1,0,0,0,0,0,0,0,2015,10
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026.0,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,2012-07-16,1084939000.0,165.0,...,0,0,0,1,0,0,0,1,2012,7
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529.0,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,2012-03-07,284139100.0,132.0,...,1,0,0,0,0,0,1,0,2012,3


#### Preprocessing Setup

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

# After cleaning your data as previously discussed, define preprocessing:
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns (scale them)
        ('num', StandardScaler(), ['budget', 'runtime']),
        
        # Categorical columns (one-hot encode)
        ('cat', OneHotEncoder(handle_unknown='ignore'), 
         ['original_language']),
    ],
    remainder='passthrough'  # Keeps other columns unchanged
)

#### Separate data into feature matrix and target vector

In [33]:
X = data[features]
y = data[target]

### Step 3: Implement Machine Learning Models

In [34]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

#### Model 1: Linear Regression

In [36]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Create a pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Evaluate the model using cross-validation
cv_results = cross_validate(pipeline, X_train, y_train, cv=5, scoring='r2')
print("Training Accuracy:", cv_results['train_score'].mean())
print("Testing Accuracy:", cv_results['test_score'].mean())

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'original_language'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\utils\_indexing.py", line 361, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'original_language'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\compose\_column_transformer.py", line 968, in fit_transform
    self._validate_column_callables(X)
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\compose\_column_transformer.py", line 536, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\zacha\anaconda3\envs\ensf-ml\Lib\site-packages\sklearn\utils\_indexing.py", line 369, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


#### Model 2: Random Forest

#### Model 3: SVM