https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import datetime

pd.set_option('display.max_rows', None)

In [2]:
#Filtered columns are what will be used by the ML models as well
col = ["budget","revenue","vote_average","vote_count","release_date"]
baseDF = pd.read_csv("tmdb_5000_movies.csv",usecols=col)

In [3]:
baseDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   budget        4803 non-null   int64  
 1   release_date  4802 non-null   object 
 2   revenue       4803 non-null   int64  
 3   vote_average  4803 non-null   float64
 4   vote_count    4803 non-null   int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 187.7+ KB


In [5]:
#Conversion of Dataframe to original Dataframe used for the project
baseDF["release_date"] = pd.to_datetime(baseDF.release_date)
baseDF["year"]= baseDF["release_date"].dt.strftime("%Y")
baseDF.drop(columns="release_date")
baseDF=baseDF[baseDF["revenue"]!=0]
baseDF=baseDF[baseDF["budget"]!=0]

succCOL = pd.DataFrame((baseDF["revenue"]-baseDF["budget"]*1.5)>0)
baseDF["Success"] = succCOL

#Shuffling of the rows to simulate random
baseDF = baseDF.sample(frac = 1)

In [6]:
baseDF.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3229 entries, 1136 to 240
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   budget        3229 non-null   int64         
 1   release_date  3229 non-null   datetime64[ns]
 2   revenue       3229 non-null   int64         
 3   vote_average  3229 non-null   float64       
 4   vote_count    3229 non-null   int64         
 5   year          3229 non-null   object        
 6   Success       3229 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 179.7+ KB


The process below separates the Dataframe into 3 Dataframes which contain 1000 rows each.

In [7]:
dataOne=baseDF.head(1000)

In [8]:
dataSec=baseDF[~baseDF.index.isin(dataOne.index)]

In [9]:
dataTwo=dataSec.head(1000)

In [10]:
dataSec=dataSec[~dataSec.index.isin(dataTwo.index)]

In [11]:
dataThree=dataSec.head(1000)

In [12]:
dataOne.head()

Unnamed: 0,budget,release_date,revenue,vote_average,vote_count,year,Success
1136,31000000,2002-12-19,177394432,7.2,635,2002,True
633,65000000,2012-11-09,275293450,6.7,1429,2012,True
1544,31000000,2010-01-21,15134293,6.0,89,2010,False
364,90000000,2003-11-25,182290266,5.2,466,2003,True
1832,25000000,2000-12-14,152500343,6.8,951,2000,True


In [13]:
dataTwo.head()

Unnamed: 0,budget,release_date,revenue,vote_average,vote_count,year,Success
1654,25000000,2004-04-30,30114487,4.7,114,2004,False
3512,5300000,2002-05-17,3897799,7.0,266,2002,False
3,250000000,2012-07-16,1084939099,7.6,9106,2012,True
2979,28000000,2000-08-25,90449929,5.8,568,2000,True
651,65000000,2014-12-19,133821816,6.0,466,2014,True


In [14]:
dataThree.head()

Unnamed: 0,budget,release_date,revenue,vote_average,vote_count,year,Success
2913,11500000,2016-01-21,94073028,5.7,1406,2016,True
1233,40000000,2005-11-09,46442528,6.3,186,2005,False
4082,1000000,2011-09-30,34522221,6.8,77,2011,True
712,55000000,1999-12-16,100230832,6.8,422,1999,True
1946,25000000,2009-09-11,10589102,6.0,326,2009,False


In [15]:
#Export of the Dataframe
dataOne.to_csv("testDataOne.csv",index=False)
dataTwo.to_csv("testDataTwo.csv",index=False)
dataThree.to_csv("testDataThree.csv",index=False)