In [213]:
import pandas as pd
import numpy as np

In [214]:
data = pd.read_csv('netflix_titles.csv')

In [215]:
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s6079,Movie,Abhinetri,A. L. Vijay,"Tamannaah Bhatia, Prabhu Deva, Sonu Sood, Sapt...",India,"May 1, 2018",2016,TV-14,131 min,"Comedies, International Movies, Sci-Fi & Fantasy","Due to family pressure, a corporate man reluct..."
1,s3538,Movie,Watchman,A. L. Vijay,"G.V. Prakash Kumar, Samyuktha Hegde, Suman, Ra...",India,"September 4, 2019",2019,TV-14,93 min,"Comedies, Dramas, International Movies","Rushing to pay off a loan shark, a young man b..."
2,s2390,Movie,Asura Guru,A. Raajdheep,"Vikram Prabhu, Subbaraju, Mahima Nambiar, Yogi...",India,"June 13, 2020",2020,TV-14,117 min,"Dramas, International Movies","For a tech-savvy thief, elaborate robberies an..."
3,s5550,Movie,Salaakhen,A. Salaam,"Shashi Kapoor, Sulakshana Pandit, Mehmood, Sud...",India,"April 1, 2017",1975,TV-14,134 min,"Action & Adventure, International Movies, Musi...",Two close childhood friends take drastically d...
4,s4050,Movie,Sarkar,A.R. Murugadoss,"Vijay, Varalakshmi Sarathkumar, Keerthi Suresh...",,"March 2, 2019",2018,TV-MA,162 min,"Action & Adventure, Dramas, International Movies",A ruthless businessman’s mission to expose ele...


In [216]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [217]:
data['date_added'] = pd.to_datetime(data['date_added'], errors = 'coerce')
columns_to_fill = ['cast', 'director']
data[columns_to_fill] = data[columns_to_fill].fillna('No Data')

In [218]:
# Drop the 'description' column
data.drop(columns=['description'], inplace=True)

In [219]:
def replace_nat_with_release_year_end(data):
    data['date_added'] = pd.to_datetime(data['date_added'], errors='coerce') # Ensure datetime, coerce errors.
    data['release_year'] = pd.to_numeric(data['release_year'], errors='coerce') # Ensure numeric release year.

    nat_dates = data['date_added'].isnull() # Find NaT dates.

    data.loc[nat_dates, 'date_added'] = pd.to_datetime(data.loc[nat_dates, 'release_year'].astype(str) + '-12-31', errors='coerce')

    return data

In [220]:
data = replace_nat_with_release_year_end(data)

In [221]:
data['rating'].unique()

array(['TV-14', 'TV-MA', 'PG-13', 'PG', 'TV-PG', 'NR', 'R', 'NC-17',
       'TV-Y', 'TV-Y7', 'TV-G', nan, 'G', 'TV-Y7-FV', 'UR', '74 min',
       '84 min', '66 min'], dtype=object)

In [222]:
# Filter rows where rating is '74 min' or '84 min'
incorrect_ratings = data[data['rating'].isin(['74 min', '84 min','66 min'])]

# Display the filtered rows
print(incorrect_ratings)


     show_id   type                                 title    director  \
3237   s5542  Movie                       Louis C.K. 2017  Louis C.K.   
3238   s5795  Movie                 Louis C.K.: Hilarious  Louis C.K.   
3239   s5814  Movie  Louis C.K.: Live at the Comedy Store  Louis C.K.   

            cast        country date_added  release_year  rating duration  \
3237  Louis C.K.  United States 2017-04-04          2017  74 min      NaN   
3238  Louis C.K.  United States 2016-09-16          2010  84 min      NaN   
3239  Louis C.K.  United States 2016-08-15          2015  66 min      NaN   

     listed_in  
3237    Movies  
3238    Movies  
3239    Movies  


In [223]:
# Identify rows where rating contains '74 min' or '84 min'
incorrect_ratings = ['74 min', '84 min', '66 min']

# Move incorrect values to 'duration' column
data.loc[data['rating'].isin(incorrect_ratings), 'duration'] = data['rating']

# Replace '74 min' and '84 min' with 'NR' in the 'rating' column
data.loc[data['rating'].isin(incorrect_ratings), 'rating'] = 'NR'

# Display the updated dataset
print(data[['rating', 'duration']].head())

  rating duration
0  TV-14  131 min
1  TV-14   93 min
2  TV-14  117 min
3  TV-14  134 min
4  TV-MA  162 min


In [224]:
# Define mapping for rating standardization
rating_mapping = {
    'TV-G': 'G',  
    'UR': 'NR',  
    'TV-PG': 'PG',  
    'TV-MA': 'R',  
    'NC-17': 'R',  
    'PG-13': 'TV-13',  
    'TV-Y7-FV': 'TV-Y7'
}

# Apply the mapping
data['rating'] = data['rating'].replace(rating_mapping)

# Display unique ratings after cleaning
print(data['rating'].unique())


['TV-14' 'R' 'TV-13' 'PG' 'NR' 'TV-Y' 'TV-Y7' 'G' nan]


In [225]:
# Replace NaN values in 'rating' with 'NR'
data['rating'] = data['rating'].fillna('NR')

# Display unique values after filling NaNs
print(data['rating'].unique())


['TV-14' 'R' 'TV-13' 'PG' 'NR' 'TV-Y' 'TV-Y7' 'G']


In [226]:
data['type'].nunique()

2

In [227]:
data['country'].nunique()

748

In [228]:
print(data['country'].nunique)

<bound method IndexOpsMixin.nunique of 0           India
1           India
2           India
3           India
4             NaN
          ...    
8802        India
8803        Japan
8804          NaN
8805       Mexico
8806    Australia
Name: country, Length: 8807, dtype: object>


In [229]:
# Replace NaN values in 'country' with 'Not Listed'
data['country'] = data['country'].fillna('Not Listed')

In [230]:
data['country_count'] = data['country'].apply(lambda x: 0 if x == "Not Listed" else len(str(x).split(',')))

In [231]:
data.sample(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,country_count
7361,s5142,TV Show,Kantaro: The Sweet Tooth Salaryman,No Data,"Matsuya Onoe, Ren Ishikawa, Hiroyuki Onoue, Ha...",Japan,2017-12-01,2017,PG,1 Season,"International TV Shows, TV Comedies",1
3760,s8336,Movie,The Green Hornet,Michel Gondry,"Seth Rogen, Jay Chou, Cameron Diaz, Tom Wilkin...",United States,2020-04-18,2011,TV-13,119 min,"Action & Adventure, Comedies",1
6642,s6374,TV Show,Braxton Family Values,No Data,"Toni Braxton, Trina Braxton, Traci Braxton, To...",United States,2020-02-15,2014,PG,2 Seasons,Reality TV,1
85,s6272,Movie,Bedtime Stories,Adam Shankman,"Adam Sandler, Keri Russell, Guy Pearce, Russel...",United States,2019-03-05,2008,PG,100 min,"Children & Family Movies, Comedies",1
1036,s7906,Movie,Running Out Of Time,Chris Stokes,"Tasha Smith, RonReaco Lee, Telma Hopkins, Sydn...",United States,2019-08-01,2018,TV-14,88 min,Thrillers,1


In [232]:
# Split genres into individual values and flatten the list
genres = data['listed_in'].dropna().str.split(',').explode().str.strip()

# Count the frequency of each genre
genre_counts = genres.value_counts()

# Display the most frequent genres
print(genre_counts)


International Movies            2752
Dramas                          2427
Comedies                        1674
International TV Shows          1351
Documentaries                    869
Action & Adventure               859
TV Dramas                        763
Independent Movies               756
Children & Family Movies         641
Romantic Movies                  616
TV Comedies                      581
Thrillers                        577
Crime TV Shows                   470
Kids' TV                         451
Docuseries                       395
Music & Musicals                 375
Romantic TV Shows                370
Horror Movies                    357
Stand-Up Comedy                  343
Reality TV                       255
British TV Shows                 253
Sci-Fi & Fantasy                 243
Sports Movies                    219
Anime Series                     176
Spanish-Language TV Shows        174
TV Action & Adventure            168
Korean TV Shows                  151
C

In [233]:
# Extract minutes for movies
data['movie_duration'] = data['duration'].str.extract(r'(\d+)\s*min')[0].astype(float)

# Extract number of seasons for TV shows
data['num_seasons'] = data['duration'].str.extract(r'(\d+)\s*Season')[0].astype(float)

# Fill NaN values with 0 for easier analysis
data['movie_duration'].fillna(0, inplace=True)
data['num_seasons'].fillna(0, inplace=True)

# Drop the original 'duration' column as it's no longer needed
data.drop(columns=['duration'], inplace=True)

In [234]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   show_id         8807 non-null   object        
 1   type            8807 non-null   object        
 2   title           8807 non-null   object        
 3   director        8807 non-null   object        
 4   cast            8807 non-null   object        
 5   country         8807 non-null   object        
 6   date_added      8807 non-null   datetime64[ns]
 7   release_year    8807 non-null   int64         
 8   rating          8807 non-null   object        
 9   listed_in       8807 non-null   object        
 10  country_count   8807 non-null   int64         
 11  movie_duration  8807 non-null   float64       
 12  num_seasons     8807 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(8)
memory usage: 894.6+ KB


In [235]:
data.sample(10)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,country_count,movie_duration,num_seasons
5195,s5124,Movie,Pottersville,Seth Henrikson,"Michael Shannon, Judy Greer, Thomas Lennon, Ro...",United States,2017-12-15,2017,TV-13,"Comedies, Independent Movies",1,86.0,0.0
2654,s4116,Movie,"Ken Jeong: You Complete Me, Ho",Jon M. Chu,Ken Jeong,United States,2019-02-14,2019,R,Stand-Up Comedy,1,62.0,0.0
7141,s5414,TV Show,Gypsy,No Data,"Naomi Watts, Billy Crudup, Sophie Cookson, Kar...","United States, United Kingdom",2017-06-30,2017,R,TV Dramas,2,0.0,1.0
1669,s4813,Movie,TAU,Federico D'Alessandro,"Maika Monroe, Ed Skrein, Gary Oldman",United States,2018-06-29,2018,R,"Sci-Fi & Fantasy, Thrillers",1,98.0,0.0
1662,s3134,Movie,Don,Farhan Akhtar,"Shah Rukh Khan, Priyanka Chopra, Arjun Rampal,...",India,2019-12-15,2006,TV-14,"Action & Adventure, International Movies",1,169.0,0.0
7642,s7469,TV Show,Miss Rose,No Data,"Roy Chiu, Megan Lai, Paul Hsu, Tia Lee, Chunya...",Taiwan,2016-08-01,2015,R,"International TV Shows, Romantic TV Shows, TV ...",1,0.0,1.0
5837,s1190,Movie,Nate Bargatze: The Greatest Average American,Troy Miller,Nate Bargatze,United States,2021-03-18,2021,G,Stand-Up Comedy,1,60.0,0.0
3353,s2024,Movie,Kaagar,Makarand Mane,"Rinku Rajguru, Shubhankar Tawde, Shashank Shen...",India,2020-09-10,2019,R,"Dramas, International Movies, Romantic Movies",1,130.0,0.0
4404,s955,Movie,The Yeti Adventures,"Pierre Greco, Nancy Florence Savard","Rachelle Lefevre, Noel Fisher, Colm Feore, Jul...",Canada,2021-05-01,2018,PG,"Children & Family Movies, Comedies",1,85.0,0.0
6866,s4264,TV Show,Demon's Path,No Data,"Wai Ai, Jim Chim Sui Man, Kwok-Pong Chan, Feli...",Not Listed,2018-12-22,2018,TV-14,"Crime TV Shows, International TV Shows, TV Dramas",0,0.0,1.0


In [236]:
data['year_added'] = data['date_added'].dt.year.astype(int)
data['month_added'] = data['date_added'].dt.month_name()
data['day_added'] = data['date_added'].dt.day_name()

# Filter data for years between 2000 and 2020
data_2000_2020 = data[(data['year_added'] >= 2000) & (data['year_added'] <= 2020)]

# Filter data for years between 2010 and 2020
data_2010_2020 = data[(data['year_added'] >= 2010) & (data['year_added'] <= 2020)]

In [237]:
len(data_2000_2020)

7309

In [238]:
len(data_2010_2020)

7302

In [239]:
data_2010_2020.sample(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,country_count,movie_duration,num_seasons,year_added,month_added,day_added
1836,s4215,Movie,Lionheart,Genevieve Nnaji,"Genevieve Nnaji, Nkem Owoh, Pete Edochie, Onye...",Nigeria,2019-01-04,2018,PG,"Comedies, Dramas, Independent Movies",1,95.0,0.0,2019,January,Friday
6839,s4760,TV Show,Dark Tourist,No Data,No Data,New Zealand,2018-07-20,2018,R,"Docuseries, International TV Shows",1,0.0,1.0,2018,July,Friday
1501,s7115,Movie,Jackpot,Dustin Nguyen,"Chi Tai, Lan Ngoc, Dustin Nguyen, Thu Trang",Vietnam,2018-10-06,2015,TV-14,"Comedies, Dramas, International Movies",1,92.0,0.0,2018,October,Saturday
6488,s3781,TV Show,Arthdal Chronicles,No Data,"Song Joong-ki, Jang Dong-gun, Kim Ji-won, Kim ...",South Korea,2019-06-02,2019,R,"International TV Shows, Korean TV Shows, TV Ac...",1,0.0,1.0,2019,June,Sunday
1838,s8182,Movie,The Adventure Club,Geoff Anderson,"Sam Ashe Arnold, Jakob Davies, Dalila Bela, Ro...",Canada,2017-05-10,2016,TV-Y7,Children & Family Movies,1,88.0,0.0,2017,May,Wednesday


In [240]:
# Save the cleaned dataset (2010-2020) to a CSV file
data_2010_2020.to_csv('netflix_2010_2020_cleaned.csv', index=False)

print("Cleaned dataset saved as 'netflix_2010_2020_cleaned.csv'")


Cleaned dataset saved as 'netflix_2010_2020_cleaned.csv'
