In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import ast
import json

from scipy.stats import randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
source = 'data/projdata'
metadata = pd.read_csv(os.path.join(source, 'movies_metadata.csv'), low_memory=False)

# transforming features to numeric values
metadata['release_date'] = pd.to_datetime(metadata['release_date'], errors='coerce')
metadata['budget'] = pd.to_numeric(metadata['budget'], errors='coerce')
metadata['revenue'] = pd.to_numeric(metadata['revenue'], errors='coerce')
metadata['runtime'] = pd.to_numeric(metadata['runtime'], errors='coerce')

# modifying adult column
metadata['adult'] = metadata['adult'].map({'True':True, 'False':False, True:True, False:False})
# filling NaN values with false, remove later if needed
metadata['adult'].fillna(False, inplace=True)
metadata['adult'] = metadata['adult'].astype(int)

# removing null values
metadata = metadata[pd.notnull(metadata['title'])]
metadata = metadata[pd.notnull(metadata['budget'])]
metadata = metadata[pd.notnull(metadata['revenue'])]
metadata = metadata[pd.notnull(metadata['runtime'])]
metadata = metadata[pd.notnull(metadata['release_date'])]

metadata['year'] = pd.to_datetime(metadata['release_date'], errors='coerce').dt.year.astype('Int64')
metadata['genre_list'] = metadata['genres'].apply(lambda x: [genre['name'] for genre in ast.literal_eval(x)])

metadata.info()
metadata.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  metadata['adult'].fillna(False, inplace=True)
  metadata['adult'].fillna(False, inplace=True)


<class 'pandas.core.frame.DataFrame'>
Index: 45130 entries, 0 to 45465
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   adult                  45130 non-null  int64         
 1   belongs_to_collection  4481 non-null   object        
 2   budget                 45130 non-null  float64       
 3   genres                 45130 non-null  object        
 4   homepage               7766 non-null   object        
 5   id                     45130 non-null  object        
 6   imdb_id                45118 non-null  object        
 7   original_language      45119 non-null  object        
 8   original_title         45130 non-null  object        
 9   overview               44435 non-null  object        
 10  popularity             45130 non-null  object        
 11  poster_path            44808 non-null  object        
 12  production_companies   45130 non-null  object        
 13  produc

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre_list
0,0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,"[Animation, Comedy, Family]"
1,0,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[Adventure, Fantasy, Family]"
2,0,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,"[Romance, Comedy]"
3,0,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"[Comedy, Drama, Romance]"
4,0,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,[Comedy]


In [3]:
metadata_target = metadata[['original_title', 'adult', 'tagline', 'genre_list']]
metadata_target.dropna(subset = ['original_title', 'adult', 'tagline', 'genre_list'])

title_encoder = LabelEncoder()
metadata_target['original_title_numeric'] = LabelEncoder().fit_transform(metadata_target['original_title'])

company_binarizer = MultiLabelBinarizer()

# def company_helper(lst):
#   if isinstance(lst, list):
#     return [item['name'] for item in lst]
#   else:
#     return []
  
# metadata_target['production_companies'] = metadata_target['production_companies'].apply(company_helper)

# pd_companies = pd.DataFrame(company_binarizer.fit_transform(metadata_target['production_companies']), 
#                             columns=company_binarizer.classes_)

# tf-idf vectorization for tagline
# takes taglines and identifies most important words in each tagline
metadata_target['tagline'].fillna("", inplace=True)
tfidf_stop_words = TfidfVectorizer(stop_words='english') # removes stop words (e.g 'and', 'the', etc.)
tagline_modified = tfidf_stop_words.fit_transform(metadata_target['tagline'])

print(tagline_modified)
metadata_target.info()
metadata_target.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_target['original_title_numeric'] = LabelEncoder().fit_transform(metadata_target['original_title'])
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  metadata_target['tagline'].fillna("", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

  (1, 3950)	0.4418052503381264
  (1, 11815)	0.5063009061885739
  (1, 3241)	0.5847696387354109
  (1, 9389)	0.454435895126641
  (2, 6721)	0.2898529449583409
  (2, 8973)	0.45260360550530565
  (2, 4260)	0.46940490383266564
  (2, 12548)	0.7005671151577276
  (3, 4437)	0.39746905919771003
  (3, 6513)	0.7570321347412877
  (3, 8208)	0.35401296483918093
  (3, 4545)	0.3789399882241365
  (4, 6550)	0.3214970520590683
  (4, 10896)	0.5871254746637069
  (4, 7697)	0.564975293294245
  (4, 12468)	0.3297337626035932
  (4, 6137)	0.3521390155671543
  (5, 9547)	0.46765167128814794
  (5, 2731)	0.36824455058145517
  (5, 620)	0.5870344807913896
  (5, 6698)	0.5487152118088651
  (6, 12545)	0.3110977985956287
  (6, 7142)	0.5201592197436667
  (6, 10899)	0.4874541133916173
  (6, 5920)	0.4191834191625736
  :	:
  (45103, 613)	0.628418729076455
  (45103, 7428)	0.39422090032323137
  (45104, 8934)	0.6231089461914238
  (45104, 1105)	0.47196062230024993
  (45104, 4610)	0.4552150754339999
  (45104, 1121)	0.4263421715849747


Unnamed: 0,original_title,adult,tagline,genre_list,original_title_numeric
0,Toy Story,0,,"[Animation, Comedy, Family]",36641
1,Jumanji,0,Roll the dice and unleash the excitement!,"[Adventure, Fantasy, Family]",16110
2,Grumpier Old Men,0,Still Yelling. Still Fighting. Still Ready for...,"[Romance, Comedy]",12696
3,Waiting to Exhale,0,Friends are the people who let you be yourself...,"[Comedy, Drama, Romance]",38243
4,Father of the Bride Part II,0,Just When His World Is Back To Normal... He's ...,[Comedy],10746


In [4]:
# create our features and labels (X & y)
adult = metadata_target['adult'].values.reshape(-1, 1)
titles = metadata_target['original_title_numeric'].values.reshape(-1, 1)

# create X
if tagline_modified.shape[1] > 0:
    # Concatenate the features
    features_X = np.hstack([titles, adult, tagline_modified.toarray()])
else:
    print("Some feature arrays are empty!")


# create X
# features_X = np.hstack([adult, titles, tagline_modified.toarray(), pd_companies.values])

# create y
def genre_helper(genre_string):
    if isinstance(genre_string, str):
        try:
            return json.loads(genre_string)
        except json.JSONDecodeError:
            return []
    elif isinstance(genre_string, list):
        # This is to handle cases where the 'genre_string' is already a list
        return genre_string
    else:
        print(f"Unexpected type found: {type(genre_string)} - Value: {genre_string}")
        return []

metadata_target['genre_list_modified'] = metadata_target['genre_list'].apply(genre_helper)

genre_mlb = MultiLabelBinarizer()
labels_y = genre_mlb.fit_transform(metadata_target['genre_list_modified'])

# create train and test
x_train, x_test, y_train, y_test = train_test_split(features_X, labels_y, test_size = 0.25, random_state = 45)

# create decision tree, tune hyperparameters
final_d_tree = DecisionTreeClassifier(random_state = 45)
final_d_tree.fit(x_train, y_train)

# find best hyperparameters
# parameters = {
#     'criterion': ['gini', 'entropy'],
#     'max_depth': randint(1, 50),
#     'min_samples_split': randint(2, 11),
#     'min_samples_leaf': randint(1, 5)
# }
# randomized_search = RandomizedSearchCV(estimator=final_d_tree, param_distributions=parameters, n_iter=100, cv=5, random_state=45)
# randomized_search = randomized_search.fit(x_train, y_train)

# # find best parameters and best model
# final_params = randomized_search.best_params_
# best_rand_model = randomized_search.best_estimator_

# predict best random model on test set
preds = final_d_tree.predict(x_test)

# test accuracy
acc = accuracy_score(y_test, preds)

print(f'Accuracy : {acc * 100:.2f}%')

print(classification_report(y_test, preds, target_names=genre_mlb.classes_))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_target['genre_list_modified'] = metadata_target['genre_list'].apply(genre_helper)


Accuracy : 7.61%
                 precision    recall  f1-score   support

         Action       0.31      0.27      0.29      1671
      Adventure       0.21      0.18      0.20       888
      Animation       0.22      0.20      0.21       505
         Comedy       0.39      0.35      0.37      3299
          Crime       0.21      0.17      0.19      1096
    Documentary       0.16      0.17      0.17       973
          Drama       0.51      0.50      0.51      5074
         Family       0.23      0.19      0.20       721
        Fantasy       0.15      0.13      0.14       580
        Foreign       0.07      0.08      0.08       408
        History       0.06      0.05      0.06       336
         Horror       0.35      0.32      0.33      1176
          Music       0.14      0.11      0.13       409
        Mystery       0.12      0.11      0.11       584
        Romance       0.25      0.22      0.24      1686
Science Fiction       0.25      0.23      0.24       751
       TV Mov

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
