# Your goal is to create a predictive model that will predict the critic_rating. Using the full data set that you originally read into Python (with the erroneous years fixed):

In [1]:
import pandas as pd
import numpy as np

movie = pd.read_excel('movie_reviews.xlsx')
df = movie.copy()
df.head()

Unnamed: 0,movie_title,movie_info,critics_consensus,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,critic_rating,critic_count,audience_rating,audience_count
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,20th Century Fox,49,144,53.0,254287.0
1,Please Give,Kate has a lot on her mind. There's the ethics...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,Sony Pictures Classics,86,140,64.0,11567.0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,Waner Bros.,68,22,53.0,14670.0
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2001-03-06,95.0,Criterion Collection,100,51,97.0,105000.0
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2003-05-20,127.0,Disney,89,27,74.0,68860.0


In [2]:
# view only rows with missing in_theaters_date
df[df['in_theaters_date'].isna()]
# there are 815 rows with missing date, we may consider drop it
df = df.dropna(subset=['in_theaters_date'])
df[df['in_theaters_date'].isna()]
# no missing NA
df['in_theaters_date'] = pd.to_datetime(df['in_theaters_date'], errors='coerce')
df['in_theatre_year'] = df['in_theaters_date'].dt.year.astype(int)

In [3]:
df['movie_title'].value_counts()

movie_title
Hamlet                  5
Home                    5
Gloria                  4
Robin Hood              4
Little Women            4
                       ..
Eraserhead              1
Erik the Viking         1
Erin Brockovich         1
Ernest Scared Stupid    1
Zulu Dawn               1
Name: count, Length: 15333, dtype: int64

In [4]:
df[df['movie_title']=='Home']

Unnamed: 0,movie_title,movie_info,critics_consensus,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,critic_rating,critic_count,audience_rating,audience_count,in_theatre_year
1505,Home,A single mother struggling to raise her six ch...,,NR,"Comedy, Documentary, Drama, Special Interest","Jeffrey Togman, Jeffrey M. Togman",,"Mary Rigby-Abernathy, Sheree Farmer, Mary Aber...",2005-03-10,2006-07-25,91.0,Kikker Arts,38,8,40.0,367.0,2005
1734,Home,Inga (Academy award winner Marcia Gay Harden) ...,"Disconcerting and sometimes eerie, this enigma...",PG-13,"Art House & International, Documentary, Drama",Mary Haverstick,Mary Haverstick,"Marcia Gay Harden, Marian Seldes, Michael Gast...",2009-05-01,2009-06-22,118.0,Diaphana Films,94,35,73.0,2425.0,2009
1738,Home,"More than a film, HOME was conceived as a gift...",,NR,"Documentary, Drama, Kids & Family",Yann Arthus-Bertrand,,"Yann Arthus-Bertrand, Glenn Close, Jacques Gam...",2011-02-04,2009-06-05,120.0,FilmBuff,0,5,86.0,1884.0,2011
7398,Home,We first meet 33 year old Jack Hall in the mid...,,NR,Drama,Jono Oliver,Jono Oliver,"Gbenga Akinnagbe, Tawny Cypress, Danny Hoch, J...",2013-11-22,2014-03-25,112.0,eOne,88,8,82.0,90.0,2013
7399,Home,"When Oh, a loveable misfit from another planet...","Colorful, silly, and utterly benign, Home is a...",PG,"Action & Adventure, Animation, Comedy, Kids & ...",Tim Johnson,"Matt Ember, Tom J. Astle","Jim Parsons, Rihanna, Jennifer Lopez, Steve Ma...",2015-03-26,2015-07-28,93.0,DreamWorks Animation,50,131,64.0,56104.0,2015


In [5]:
df = df[~df.duplicated(subset=['movie_title', 'in_theaters_date', 'runtime_in_minutes', 'studio_name'], keep=False)]

In [6]:
# drop column of critics_consensus
df = df.drop('critics_consensus', axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15821 entries, 0 to 16637
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   movie_title         15821 non-null  object        
 1   movie_info          15813 non-null  object        
 2   rating              15821 non-null  object        
 3   genre               15815 non-null  object        
 4   directors           15738 non-null  object        
 5   writers             14627 non-null  object        
 6   cast                15581 non-null  object        
 7   in_theaters_date    15821 non-null  datetime64[ns]
 8   on_streaming_date   15821 non-null  datetime64[ns]
 9   runtime_in_minutes  15696 non-null  float64       
 10  studio_name         15533 non-null  object        
 11  critic_rating       15821 non-null  int64         
 12  critic_count        15821 non-null  int64         
 13  audience_rating     15665 non-null  float64       


In [7]:
# drop all rows with NA
df = df.dropna()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14090 entries, 0 to 16637
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   movie_title         14090 non-null  object        
 1   movie_info          14090 non-null  object        
 2   rating              14090 non-null  object        
 3   genre               14090 non-null  object        
 4   directors           14090 non-null  object        
 5   writers             14090 non-null  object        
 6   cast                14090 non-null  object        
 7   in_theaters_date    14090 non-null  datetime64[ns]
 8   on_streaming_date   14090 non-null  datetime64[ns]
 9   runtime_in_minutes  14090 non-null  float64       
 10  studio_name         14090 non-null  object        
 11  critic_rating       14090 non-null  int64         
 12  critic_count        14090 non-null  int64         
 13  audience_rating     14090 non-null  float64       


### 7. Split the data into a training and test set, with the training data including movies released in theatres before 2010 and the test data including movies released in theatres in 2010 and after.

In [9]:
# Split the data based on the release year
train_data = df[df['in_theatre_year'] < 2010]  # Movies released before 2010
train_data.head(5)

Unnamed: 0,movie_title,movie_info,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,critic_rating,critic_count,audience_rating,audience_count,in_theatre_year
2,10,Blake Edwards' 10 stars Dudley Moore as George...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,Waner Bros.,68,22,53.0,14670.0,1979
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2001-03-06,95.0,Criterion Collection,100,51,97.0,105000.0,1957
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2003-05-20,127.0,Disney,89,27,74.0,68860.0,1954
5,"10,000 B.C.",A young outcast from a primitive tribe is forc...,PG-13,"Action & Adventure, Classics, Drama",Roland Emmerich,"Roland Emmerich, Harald Kloser","Steven Strait, Camilla Belle, Cliff Curtis, Jo...",2008-03-07,2008-06-24,109.0,Warner Bros. Pictures,8,148,37.0,411087.0,2008
6,The 39 Steps,A man in London tries to help a counterespiona...,NR,"Action & Adventure, Classics, Mystery & Suspense",Alfred Hitchcock,"Alma Reville, Charles Bennett, Ian Hay","Robert Donat, Madeleine Carroll, Godfrey Tearl...",1935-08-01,1935-06-06,87.0,Gaumont British Distributors,96,47,86.0,23827.0,1935


In [10]:
test_data = df[df['in_theatre_year'] >= 2010]  # Movies released in 2010 and after
test_data.head(5)

Unnamed: 0,movie_title,movie_info,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,critic_rating,critic_count,audience_rating,audience_count,in_theatre_year
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,20th Century Fox,49,144,53.0,254287.0,2010
1,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,Sony Pictures Classics,86,140,64.0,11567.0,2010
97,Fireflies in the Garden,"To an outsider, the Taylors are the very pictu...",R,Drama,Dennis Lee (III),Dennis Lee (III),"Emily Watson, Julia Roberts, Ryan Reynolds, Wi...",2011-10-14,2012-02-07,98.0,Senator Entertainment,22,54,45.0,45150.0,2011
130,The Oxford Murders,When an elderly woman is viciously murdered in...,R,"Art House & International, Drama, Horror, Myst...",√Ålex de la Iglesia,"√Ålex de la Iglesia, Jorge Guerricaechevarr√≠a","Elijah Wood, John Hurt, Julie Cox, Leonor Watl...",2010-08-06,2010-10-05,110.0,Magnolia Pictures,9,11,33.0,14548.0,2010
155,Priest,"Priest, a western-fused post-apocalyptic thril...",PG-13,"Action & Adventure, Horror, Mystery & Suspense...",Scott Stewart,"Scott Stewart, Cory Goodman","Paul Bettany, Maggie Q, Cam Gigandet, Lily Col...",2011-05-13,2011-08-16,83.0,Screen Gems/SONY PICTURES,15,99,46.0,86566.0,2011


### 8. If your goal is to predict the critic_rating before the first critic or audience rating gets posted for a movie, which columns in the data should you NOT use to create features? Update your training and test data sets to NOT include these columns.

Since our goal is to predict the critic_rating before the first critic or audience rating gets posted for a movie, we should not use columns that provide information that would not be available before the first ratings. 

**critics_consensus**: This column might contain information based on critic reviews, so it should not be used if we want to predict critic ratings before any critic ratings are available.

**audience_rating**: Similarly, we should not use audience ratings as a feature for predicting critic ratings before the first audience ratings are available.

**audience_count**: This column provides information about the audience count, which would not be available before the first audience rating is posted.

In [34]:
# Columns to exclude
exclude_columns = ['audience_rating', 'audience_count', 'critic_count']
 
# Create features by excluding the specified columns
data = df.drop(columns=exclude_columns)

### 9. Using only the training data, create a new DataFrame containing the following ID column and features:
• movie_title

• runtime_in_minutes

• NEW: kid_friendly (1 if G or PG, 0 if other ratings)

• NEW: dummy variable columns for each genre


In [35]:
data['movie_title']

0        Percy Jackson & the Olympians: The Lightning T...
1                                              Please Give
2                                                       10
3                          12 Angry Men (Twelve Angry Men)
4                             20,000 Leagues Under The Sea
                               ...                        
16632                                                 Zoom
16633                                            Zoot Suit
16634                                             Zootopia
16636                                                 Zulu
16637                                            Zulu Dawn
Name: movie_title, Length: 14090, dtype: object

In [36]:
# Create a binary array based on the condition
kid_array = np.where((data['rating'] == 'G') | (data['rating'] == 'PG'), 1, 0)
kid_array

array([1, 0, 0, ..., 1, 1, 1])

In [37]:
# add to training data
data.loc[:, 'kid_friendly'] = kid_array
data.head(3)

Unnamed: 0,movie_title,movie_info,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,critic_rating,in_theatre_year,kid_friendly
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,20th Century Fox,49,2010,1
1,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,Sony Pictures Classics,86,2010,0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,Waner Bros.,68,1979,0


In [38]:
# one-hot encoding
data['genre'].value_counts()

genre
Drama                                                                            1518
Comedy                                                                           1052
Comedy, Drama                                                                     775
Drama, Mystery & Suspense                                                         634
Art House & International, Drama                                                  500
                                                                                 ... 
Comedy, Kids & Family, Science Fiction & Fantasy, Romance                           1
Drama, Musical & Performing Arts, Science Fiction & Fantasy, Special Interest       1
Art House & International, Horror, Special Interest                                 1
Action & Adventure, Documentary, Sports & Fitness                                   1
Animation, Drama, Romance                                                           1
Name: count, Length: 979, dtype: int64

In [39]:
# Split genres into individual genres and explode the DataFrame
# 'explode' is used to transform the list-like values into separate rows.
df_exploded = data.assign(genre=data['genre'].str.split(', ')).explode('genre')
# keep only one genre
df_exploded = df_exploded[~df_exploded.duplicated(subset=['movie_title', 'in_theaters_date', 'runtime_in_minutes', 'studio_name'])]

In [19]:
# Group by the 'genre' column and count occurrences
genre_counts = df_exploded.groupby('genre').size().reset_index(name='count')
# Display the grouped DataFrame
print(genre_counts)

                        genre  count
0          Action & Adventure   2123
1                   Animation    171
2   Art House & International   1093
3                    Classics    982
4                      Comedy   2111
5                 Cult Movies     15
6                 Documentary    269
7                       Drama   1620
8                      Horror    402
9               Kids & Family     25
10  Musical & Performing Arts     11
11         Mystery & Suspense     50
12                    Romance      2
13  Science Fiction & Fantasy     12
14                    Western      2


In [40]:
data = pd.merge(df_exploded, genre_counts, on='genre', how='left')
data['genre'].value_counts()

genre
Comedy                       3189
Drama                        3152
Action & Adventure           3058
Art House & International    1643
Classics                     1017
Horror                        710
Documentary                   694
Animation                     287
Mystery & Suspense            230
Kids & Family                  35
Science Fiction & Fantasy      26
Cult Movies                    17
Musical & Performing Arts      16
Romance                        10
Western                         6
Name: count, dtype: int64

In [41]:
# make the column numeric using dummy variables
genre_dummies = pd.get_dummies(data['genre'], prefix='genre')
data = pd.concat([data, genre_dummies], axis=1)
data = data.astype({col: int for col in data.select_dtypes(include='bool').columns})
data.head(3)

Unnamed: 0,movie_title,movie_info,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,...,genre_Cult Movies,genre_Documentary,genre_Drama,genre_Horror,genre_Kids & Family,genre_Musical & Performing Arts,genre_Mystery & Suspense,genre_Romance,genre_Science Fiction & Fantasy,genre_Western
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,Action & Adventure,Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,...,0,0,0,0,0,0,0,0,0,0
1,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,...,0,0,0,0,0,0,0,0,0,0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,R,Comedy,Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,...,0,0,0,0,0,0,0,0,0,0


### 10. Create 3 new features that you think will do a good job predicting the critic_rating. Each new feature should use various combinations of the columns from your training data.

Create a new binary variable '**prior_to_1970**': 1970s as a divider in the history of movies

In [42]:
data['in_theaters_date'] = pd.to_datetime(data['in_theaters_date'], errors='coerce')
data['in_theatre_year'] = data['in_theaters_date'].dt.year.astype(int)
data['in_theatre_year'].min() 

1930

In [43]:
data['in_theatre_year'].max()

2029

In [44]:
data['prior_to_1970'] = data['in_theatre_year'].apply(lambda x: 0 if x < 1970 else 1)
data.head(3)

Unnamed: 0,movie_title,movie_info,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,...,genre_Documentary,genre_Drama,genre_Horror,genre_Kids & Family,genre_Musical & Performing Arts,genre_Mystery & Suspense,genre_Romance,genre_Science Fiction & Fantasy,genre_Western,prior_to_1970
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,Action & Adventure,Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,...,0,0,0,0,0,0,0,0,0,1
1,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,...,0,0,0,0,0,0,0,0,0,1
2,10,Blake Edwards' 10 stars Dudley Moore as George...,R,Comedy,Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,...,0,0,0,0,0,0,0,0,0,1


Create a new dummy variable'**release_season**': Different seasons might have varying trends in movie ratings.

In [45]:
data['release_season'] = pd.to_datetime(data['in_theaters_date']).dt.month
data['release_season'] = data['release_season'].apply(lambda month: 'Winter' if month in [12, 1, 2] else ('Spring' if month in [3, 4, 5] else ('Summer' if month in [6, 7, 8] else 'Fall')))

In [46]:
one_hot_release_season = pd.get_dummies(data['release_season'], prefix='release_season')

# Concatenate the one-hot encoded columns to the existing DataFrame
data = pd.concat([data, one_hot_release_season], axis=1)
data = data.astype({col: int for col in data.select_dtypes(include='bool').columns})
data.head(3)

Unnamed: 0,movie_title,movie_info,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,...,genre_Mystery & Suspense,genre_Romance,genre_Science Fiction & Fantasy,genre_Western,prior_to_1970,release_season,release_season_Fall,release_season_Spring,release_season_Summer,release_season_Winter
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,Action & Adventure,Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,...,0,0,0,0,1,Winter,0,0,0,1
1,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,...,0,0,0,0,1,Spring,0,1,0,0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,R,Comedy,Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,...,0,0,0,0,1,Fall,1,0,0,0


Create a new dummy variable '**streaming_lag_years**': might capture the delay in years between a movie's theatrical release and its availability on streaming platforms.

In [47]:
data['streaming_lag'] = pd.to_datetime(data['on_streaming_date']) - pd.to_datetime(data['in_theaters_date'])
data['streaming_lag_years'] = data['streaming_lag'].dt.total_seconds() / (365.25 * 24 * 3600)  # Average seconds in a year

# Convert to integer if needed
data['streaming_lag_years'] = data['streaming_lag_years'].astype(int)

In [48]:
data.head(5)

Unnamed: 0,movie_title,movie_info,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,...,genre_Science Fiction & Fantasy,genre_Western,prior_to_1970,release_season,release_season_Fall,release_season_Spring,release_season_Summer,release_season_Winter,streaming_lag,streaming_lag_years
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,Action & Adventure,Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,...,0,0,1,Winter,0,0,0,1,137 days,0
1,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,...,0,0,1,Spring,0,1,0,0,172 days,0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,R,Comedy,Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,...,0,0,1,Fall,1,0,0,0,6536 days,17
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",NR,Classics,Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2001-03-06,95.0,...,0,0,0,Spring,0,1,0,0,16033 days,43
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...",G,Action & Adventure,Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2003-05-20,127.0,...,0,0,0,Winter,0,0,0,1,18036 days,49


In [49]:
# Create the training and test datasets
train = data[data['in_theatre_year'] < 2010]
test = data[data['in_theatre_year'] >= 2010]

In [50]:
import pickle
# Pickle the DataFrame
with open('train.pkl', 'wb') as file:
    pickle.dump(train, file)

In [51]:
import pickle
# Pickle the DataFrame
with open('test.pkl', 'wb') as file:
    pickle.dump(test, file)

In [52]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8888 entries, 2 to 14089
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype          
---  ------                           --------------  -----          
 0   movie_title                      8888 non-null   object         
 1   movie_info                       8888 non-null   object         
 2   rating                           8888 non-null   object         
 3   genre                            8888 non-null   object         
 4   directors                        8888 non-null   object         
 5   writers                          8888 non-null   object         
 6   cast                             8888 non-null   object         
 7   in_theaters_date                 8888 non-null   datetime64[ns] 
 8   on_streaming_date                8888 non-null   datetime64[ns] 
 9   runtime_in_minutes               8888 non-null   float64        
 10  studio_name                      8888 non-null   obj

In [53]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5202 entries, 0 to 14087
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype          
---  ------                           --------------  -----          
 0   movie_title                      5202 non-null   object         
 1   movie_info                       5202 non-null   object         
 2   rating                           5202 non-null   object         
 3   genre                            5202 non-null   object         
 4   directors                        5202 non-null   object         
 5   writers                          5202 non-null   object         
 6   cast                             5202 non-null   object         
 7   in_theaters_date                 5202 non-null   datetime64[ns] 
 8   on_streaming_date                5202 non-null   datetime64[ns] 
 9   runtime_in_minutes               5202 non-null   float64        
 10  studio_name                      5202 non-null   obj