In [1]:
#DATA CLEANING
#Handling missing data - removing duplicate rows and rows whose title is missing
#filling missing values with the most common value - only if fewer than 20% of the cells in that column are missing
#converting age and duration to int variables

In [53]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
#render matplotlib figures inline in the notebook output
%matplotlib inline
#raise the default figure resolution to 600 dpi
mpl.rcParams['figure.dpi'] = 600
import seaborn as sns
import warnings
#NOTE(review): this silences every warning for the whole notebook, including pandas warnings - confirm this is intended
warnings.filterwarnings("ignore")

In [54]:
#load the merged dataset; the first CSV column is used as the index
df = pd.read_csv('merged-csv-files.csv', index_col=[0])

In [55]:
#number of rows and columns in the dataset as loaded
df.shape

(5202, 14)

In [56]:
#preview the first rows of the dataset
df.head()

Unnamed: 0,Title,Year,Duration,MostLiked,Description,Cast,Genre,About,Director,FullCast,Writer,GenreFull,AboutFull,Age
0,The Adam Project,2022.0,1h 46m,0.0,"After accidentally crash-landing in 2022, time...","['Ryan Reynolds,', 'Mark Ruffalo,', 'Jennifer ...","['Sci-Fi Movies,', 'Family Movies,', 'Action &...","['Witty,', 'Feel-Good,', 'Exciting']",['Shawn Levy'],"['Ryan Reynolds,', 'Mark Ruffalo,', 'Jennifer ...","['Jonathan Tropper,', 'T.S. Nowlin,', 'Jennife...","['Sci-Fi Movies,', 'Family Movies,', 'Action &...","['Witty,', 'Feel-Good,', 'Exciting']",13+
1,Interstellar,2014.0,2h 49m,1.0,With humanity teetering on the brink of extinc...,"['Matthew McConaughey,', 'Anne Hathaway,', 'Je...","['Sci-Fi Movies,', 'Action & Adventure,', 'Dra...","['Mind-Bending,', 'Cerebral']",['Christopher Nolan'],"['Matthew McConaughey,', 'Anne Hathaway,', 'Je...","['Jonathan Nolan,', 'Christopher Nolan']","['Sci-Fi Movies,', 'Action & Adventure,', 'Dra...","['Mind-Bending,', 'Cerebral']",13+
2,Red Notice,2021.0,1h 58m,0.0,An FBI profiler pursuing the world's most want...,"['Dwayne Johnson,', 'Ryan Reynolds,', 'Gal Gad...","['Action & Adventure,', 'Comedies,', 'Crime Mo...","['Irreverent,', 'Exciting']",['Rawson Marshall Thurber'],"['Dwayne Johnson,', 'Ryan Reynolds,', 'Gal Gad...",['Rawson Marshall Thurber'],"['Action & Adventure,', 'Comedies,', 'Crime Mo...","['Irreverent,', 'Exciting']",13+
3,Abduction,2011.0,1h 45m,0.0,When a teen comes across his own childhood pho...,"['Taylor Lautner,', 'Lily Collins,', 'Alfred M...","['Mysteries,', 'Action & Adventure']","['Suspenseful,', 'Exciting']",['John Singleton'],"['Taylor Lautner,', 'Lily Collins,', 'Alfred M...",['Shawn Christensen'],"['Mysteries,', 'Action & Adventure']","['Suspenseful,', 'Exciting']",13+
4,13 Hours: The Secret Soldiers of Benghazi,2016.0,2h 25m,0.0,Members of an elite security team battle to sa...,"['John Krasinski,', 'James Badge Dale,', 'Max ...","['Military Movies,', 'Movies Based on Real Lif...","['Gritty,', 'Exciting']",['Michael Bay'],"['John Krasinski,', 'James Badge Dale,', 'Max ...",['Chuck Hogan'],"['Military Movies,', 'Movies Based on Real Lif...","['Gritty,', 'Exciting']",16+


In [57]:
#count the rows whose Title is missing
df['Title'].isna().sum()

63

In [58]:
#remove every row that has no Title
df = df.dropna(subset=['Title'])

In [59]:
#confirm that no missing Title remains
df['Title'].isna().sum()

0

In [60]:
#delete the columns Writer, Cast, About and Genre in a single step
#(in the head() preview, Cast/About/Genre appear to hold the same lists as FullCast/AboutFull/GenreFull - TODO confirm)
df = df.drop(columns=['Writer', 'Cast', 'About', 'Genre'])

In [64]:
#column dtypes and non-null counts of the remaining columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5139 entries, 0 to 302
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        5139 non-null   object 
 1   Year         5139 non-null   float64
 2   Duration     5139 non-null   object 
 3   MostLiked    5139 non-null   float64
 4   Description  5112 non-null   object 
 5   Director     4901 non-null   object 
 6   FullCast     4808 non-null   object 
 7   GenreFull    5113 non-null   object 
 8   AboutFull    4606 non-null   object 
 9   Age          5139 non-null   object 
dtypes: float64(2), object(8)
memory usage: 441.6+ KB


In [65]:
#check how many rows and columns are left after deleting the rows with a NULL Title and the dropped columns
df.shape

(5139, 10)

In [66]:
#check whether there are duplicated rows (True marks a row that repeats an earlier row in every column)
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
298     True
299     True
300    False
301    False
302    False
Length: 5139, dtype: bool

In [67]:
#show the rows that repeat the (Title, Year) pair of an earlier row
is_repeat = df.duplicated(subset=['Title','Year'])
df[is_repeat]

Unnamed: 0,Title,Year,Duration,MostLiked,Description,Director,FullCast,GenreFull,AboutFull,Age
482,Wild Dog,2020.0,2h 6m,0.0,A brash but brilliant Indian intelligence agen...,['Ahishor Solomon'],"['Nagarjuna Akkineni,', 'Dia Mirza,', 'Saiyami...","['Indian,', 'Kannada Movies & TV,', 'Action & ...",['Exciting'],16+
491,Wild Dog,2020.0,2h 6m,0.0,A brash but brilliant Indian intelligence agen...,['Ahishor Solomon'],"['Nagarjuna Akkineni,', 'Dia Mirza,', 'Saiyami...","['Indian,', 'Tamil-Language Movies,', 'Action ...",['Exciting'],16+
502,Wild Dog,2020.0,2h 6m,0.0,A brash but brilliant Indian intelligence agen...,['Ahishor Solomon'],"['Nagarjuna Akkineni,', 'Dia Mirza,', 'Saiyami...","['Indian,', 'Malayalam-Language Movies,', 'Act...",['Exciting'],16+
6,Princess Mononoke,1997.0,2h 14m,1.0,A prince infected with a lethal curse sets off...,['Hayao Miyazaki'],"['Yoji Matsuda,', 'Yuriko Ishida,', 'Yuko Tana...","['Japanese,', 'Family Movies,', 'Fantasy Anime...",,13+
20,The Witcher: Nightmare of the Wolf,2021.0,1h 23m,0.0,From the creative minds behind the blockbuster...,['Han Kwang Il'],"['Theo James,', 'Mary McDonnell,', 'Lara Pulve...","['TV Shows Based on Books,', 'Movies Based on ...","['Suspenseful,', 'Exciting']",18+
...,...,...,...,...,...,...,...,...,...,...
296,In Paradox,2019.0,1h 33m,0.0,"On the run from assailants, a man desperately ...",['Hamad AlSarraf'],"['Faisal Al Omairi,', 'Jafra Younes,', 'Samr I...","['Kuwaiti,', 'Mysteries,', 'Middle Eastern Mov...","['Mind-Bending,', 'Ominous']",13+
297,Krishna Cottage,2004.0,2h 3m,0.0,True love is put to the test when another woma...,['Santram Varma'],"['Sohail Khan,', 'Isha Koppikar,', 'Natassha,'...","['Indian,', 'Hindi-Language Movies,', 'Bollywo...","['Scary,', 'Romantic']",16+
298,Albert Pinto Ko Gussa Kyun Aata Hai?,2019.0,1h 25m,0.0,"As the police investigate his disappearance, a...",['Soumitra Ranade'],"['Nandita Das,', 'Manav Kaul,', 'Saurabh Shukl...","['Social Issue Dramas,', 'Indian,', 'Hindi-Lan...","['Cerebral,', 'Dark']",16+
299,Muqaddar ka Faisla,1987.0,2h 51m,0.0,God-fearing Pandit Krishna Kant loses everythi...,['Prakash Mehra'],"['Raaj Kumar,', 'Rakhee Gulzar,', 'Raj Babbar,...","['Indian,', 'Hindi-Language Movies,', 'Bollywo...","['Emotional,', 'Exciting']",13+


In [68]:
#keep only the first row for each (Title, Year) pair
df = df.drop_duplicates(subset=['Title','Year'], keep='first')

In [69]:
#summary statistics for every column, numeric and non-numeric
df.describe(include='all')

Unnamed: 0,Title,Year,Duration,MostLiked,Description,Director,FullCast,GenreFull,AboutFull,Age
count,3297,3297.0,3297,3297.0,3270,3083,3057,3271,2862,3297
unique,3278,,218,,3229,2244,2872,1569,486,5
top,Nan,,1h 37m,,This thriller traces a now-infamous fugitive's...,Nan,"['Vatsal Dubey,', 'Julie Tejwani,', 'Rupa Bhim...","['Stand-Up Comedy,', 'Comedies']",['Exciting'],16+
freq,7,,78,,5,41,13,222,211,1480
mean,,2015.904762,,0.018805,,,,,,
std,,7.397563,,0.135856,,,,,,
min,,1945.0,,0.0,,,,,,
25%,,2015.0,,0.0,,,,,,
50%,,2018.0,,0.0,,,,,,
75%,,2020.0,,0.0,,,,,,


In [70]:
#change the Titles to index
#NOTE(review): the describe() output reports fewer unique titles than rows, so this index is presumably not unique - confirm
df = df.set_index('Title')

In [71]:
#remove the '+' suffix and turn 'ALL' into '0' so every Age is a plain number string
#regex=False: '+' is a regex quantifier, so both patterns are matched literally
age_text = df['Age'].str.replace('+', '', regex=False)
df['Age'] = age_text.str.replace('ALL', '0', regex=False)

In [72]:
#check the types
#list the distinct Age values that remain
df['Age'].unique()

array(['13', '16', '18', '7', '0'], dtype=object)

In [73]:
#change the Age from string to int
df['Age'] = df['Age'].astype('int64')

In [74]:
#confirm the Age values are now integers
df['Age'].unique()

array([13, 16, 18,  7,  0], dtype=int64)

In [75]:
#number of missing cells in each column
df.isna().sum()

Year             0
Duration         0
MostLiked        0
Description     27
Director       214
FullCast       240
GenreFull       26
AboutFull      435
Age              0
dtype: int64

In [76]:
#drop every row that is missing FullCast, AboutFull, GenreFull or Director
df = df.dropna(subset=['FullCast', 'AboutFull', 'GenreFull', 'Director'])

In [80]:
#number of missing cells in each column after dropping the incomplete rows
df.isna().sum()

Year           0
Duration       0
MostLiked      0
Description    0
Director       0
FullCast       0
GenreFull      0
AboutFull      0
Age            0
dtype: int64

In [81]:
#summary statistics of the Age column
df['Age'].describe()

count    2651.000000
mean       13.186345
std         3.915231
min         0.000000
25%        13.000000
50%        13.000000
75%        16.000000
max        18.000000
Name: Age, dtype: float64

In [82]:
#how many rows have each Age value
df['Age'].value_counts()

16    1168
13    1005
7      324
0      102
18      52
Name: Age, dtype: int64

In [83]:
#replace missing Age cells with the most common value
#(the null counts shown earlier report no missing Age, so nothing is filled at this point)
most_common_age = df['Age'].mode()[0]
df['Age'] = df['Age'].fillna(most_common_age)

In [84]:
#replace missing Director cells with the most common value
#(rows with a missing Director were already dropped, so nothing is filled at this point)
#NOTE(review): the earlier describe() output lists the literal string 'Nan' as the most frequent Director - confirm whether it should count as missing
most_common_director = df['Director'].mode()[0]
df['Director'] = df['Director'].fillna(most_common_director)

In [85]:
#summary of the Duration strings (count, distinct values, most frequent)
df['Duration'].describe()

count       2651
unique       191
top       1h 37m
freq          68
Name: Duration, dtype: object

In [86]:
#list the distinct Duration strings to see which formats occur
df['Duration'].unique()

array(['1h 46m', '2h 49m', '1h 58m', '1h 45m', '2h 25m', '2h 3m',
       '1h 49m', '1h 29m', '1h 38m', '1h 59m', '2h 2m', '1h 47m',
       '1h 57m', '2h 1m', '1h 56m', '2h 38m', '1h 52m', '2h 19m',
       '2h 16m', '1h 53m', '2h', '2h 7m', '2h 24m', '2h 4m', '2h 12m',
       '1h 44m', '1h 31m', '2h 15m', '2h 9m', '2h 8m', '1h 43m', '1h 37m',
       '1h 28m', '2h 13m', '2h 26m', '2h 5m', '1h 30m', '1h 54m',
       '2h 17m', '1h 48m', '1h 42m', '2h 32m', '2h 21m', '1h 35m',
       '1h 41m', '1h 51m', '1h 50m', '2h 36m', '2h 11m', '1h 40m',
       '2h 18m', '2h 22m', '2h 35m', '2h 28m', '1h 34m', '1h 27m',
       '2h 50m', '3h 7m', '1h 39m', '1h 32m', '2h 23m', '1h 55m',
       '1h 36m', '1h 26m', '2h 20m', '1h 22m', '2h 44m', '3h 5m',
       '1h 21m', '2h 29m', '2h 10m', '2h 46m', '1h 23m', '1h 33m',
       '1h 20m', '2h 39m', '2h 48m', '19m', '2h 14m', '1h 14m', '1h 25m',
       '2h 42m', '2h 51m', '2h 40m', '2h 27m', '58m', '2h 41m', '2h 6m',
       '3h 14m', '1h 9m', '3h 33m', '2h 34m

In [87]:
#strip the unit letters: 'h' becomes a space and 'm' is removed ('1h 46m' -> '1  46')
#every 'h'/'m' in the string is affected, so other labels lose those letters too (e.g. 'Volumes' -> 'Volues')
hours_marked = df['Duration'].str.replace('h', ' ', regex=False)
df['Duration'] = hours_marked.str.replace('m', '', regex=False)

In [88]:
#Removing rows whose Duration is not a movie runtime (e.g. seasons, episodes, collections)

In [89]:
#drop the rows whose Duration names seasons, episodes, collections, volumes, parts or a limited series
#both the intact and the 'm'-stripped spellings are listed, because the unit-stripping step removes every 'm'
NON_RUNTIME_MARKERS = 'Season|Episode|Collections|Parts|Volumes|Volues|Limited Series|Liited Series'

#na=True treats a missing Duration as a match, so such rows are dropped as well
is_non_runtime = df['Duration'].str.contains(NON_RUNTIME_MARKERS, na=True)
df = df[~is_non_runtime]

In [98]:
#list the distinct Duration strings that remain after the filtering
df['Duration'].unique()

array(['1  46', '2  49', '1  58', '1  45', '2  25', '2  3', '1  49',
       '1  29', '1  38', '1  59', '2  2', '1  47', '1  57', '2  1',
       '1  56', '2  38', '1  52', '2  19', '2  16', '1  53', '2 ', '2  7',
       '2  24', '2  4', '2  12', '1  44', '1  31', '2  15', '2  9',
       '2  8', '1  43', '1  37', '1  28', '2  13', '2  26', '2  5',
       '1  30', '1  54', '2  17', '1  48', '1  42', '2  32', '2  21',
       '1  35', '1  41', '1  51', '1  50', '2  36', '2  11', '1  40',
       '2  18', '2  22', '2  35', '2  28', '1  34', '1  27', '2  50',
       '3  7', '1  39', '1  32', '2  23', '1  55', '1  36', '1  26',
       '2  20', '1  22', '2  44', '3  5', '1  21', '2  29', '2  10',
       '2  46', '1  23', '1  33', '1  20', '2  39', '2  48', '19',
       '2  14', '1  14', '1  25', '2  42', '2  51', '2  40', '2  27',
       '58', '2  41', '2  6', '3  14', '1  9', '3  33', '2  34', '1  13',
       '2  33', '2  30', '3 ', '2  43', '2  57', '3  1', '3  23', '3  31',
       '3  4', '3 

In [99]:
#Convert every Duration string into total minutes (int).
#Accepted shapes: hours and minutes ('1  46' / '1h 46m'), hours only ('2 ' / '2h'),
#minutes only ('19' / '19m'). A value that is already a plain number is read as minutes,
#so re-running this cell leaves the column unchanged.
DURATION_PATTERN = r'^\s*(?:(?P<hours>\d+)(?:h|\s))?\s*(?:(?P<minutes>\d+)m?)?\s*$'

duration_parts = df['Duration'].astype(str).str.extract(DURATION_PATTERN)

#a row is unparsable when neither hours nor minutes could be extracted
unparsed = duration_parts.isna().all(axis=1).to_numpy()
if unparsed.any():
    bad_values = df['Duration'][unparsed].unique()[:10]
    raise ValueError(f"Unrecognized Duration values: {list(bad_values)}")

#a missing part counts as 0; plain arrays are used so the Title index never has to be aligned
hours = pd.to_numeric(duration_parts['hours']).fillna(0).to_numpy()
minutes = pd.to_numeric(duration_parts['minutes']).fillna(0).to_numpy()
df = df.assign(Duration=(hours * 60 + minutes).astype('int64'))

In [100]:
#inspect the last 50 rows to check the converted Duration values
df.tail(50)

Unnamed: 0_level_0,Year,Duration,MostLiked,Description,Director,FullCast,GenreFull,AboutFull,Age
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Fever Dream,2021.0,93,0.0,The charged relationship between two young mom...,['Claudia Llosa'],"['María Valverde,', 'Dolores Fonzi,', 'Germán ...","['Movies Based on Books,', 'Dramas,', 'Thrille...",['Suspenseful'],13
Dhamaka,2021.0,104,0.0,When a cynical ex-TV news anchor gets an alarm...,['Ram Madhvani'],"['Kartik Aaryan,', 'Amruta Subhash,', 'Mrunal ...","['Indian,', 'Hindi-Language Movies,', 'Bollywo...",Nan,13
A Sort of Family,2017.0,96,0.0,An Argentine doctor faces legal and ethical ch...,['Diego Lerman'],"['Bárbara Lennie,', 'Daniel Aráoz,', 'Claudio ...","['Argentinian,', 'Social Issue Dramas,', 'Dram...",['Suspenseful'],13
Unsane,2018.0,97,0.0,After moving away from home to escape a stalke...,['Steven Soderbergh'],"['Claire Foy,', 'Joshua Leonard,', 'Jay Pharoa...",['Thriller Movies'],Nan,16
Sin City,2019.0,101,0.0,A busy couple tries to give their love life a ...,['Pascal Amanfo'],"['Yvonne Nelson,', 'Kunle Remi,', 'Adjetey Ana...","['Mysteries,', 'African Movies,', 'Thriller Mo...",['Steamy'],16
Photocopier,2022.0,130,0.0,When photos of her at a party cause her to los...,['Wregas Bhanuteja'],"['Shenina Cinnamon,', 'Chicco Kurniawan,', 'Je...","['Indonesian,', 'Mysteries,', 'Social Issue Dr...",['Suspenseful'],16
Anatomy,2000.0,99,0.0,A young medical student uncovers a horrifying ...,['Stephan Ruzowitzky'],"['Franka Potente,', 'Benno Fürmann,', 'Anna Lo...","['German,', 'Thriller Movies,', 'Horror Movies']",['Suspenseful'],16
Ammar,2020.0,83,0.0,"When a family moves into an old castle, excite...",['Mahmoud Kamel'],"['Sharif Salamah,', 'Eman Al Assy,', 'Hala El-...","['Egyptian,', 'Middle Eastern Movies,', 'Thril...","['Ominous,', 'Scary,', 'Suspenseful']",16
The Intruders,2015.0,92,0.0,"After her mother's death, a troubled girl and ...",['Adam Massey'],"['Miranda Cosgrove,', 'Donal Logue,', 'Austin ...","['Canadian,', 'Mysteries,', 'Thriller Movies']","['Chilling,', 'Scary']",16
Mercy,2016.0,87,0.0,Two brothers clash with their half-siblings wh...,['Chris Sparling'],"['James Wolk,', 'Caitlin Fitzgerald,', 'Tom Li...","['Dramas,', 'Thriller Movies']","['Dark,', 'Suspenseful']",16


In [101]:
#change the Duration values to int
df['Duration'] = df['Duration'].map(int)

In [102]:
#final check of the column dtypes and non-null counts before saving
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2630 entries, The Adam Project to Boomika (Telugu)
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Year         2630 non-null   float64
 1   Duration     2630 non-null   int64  
 2   MostLiked    2630 non-null   float64
 3   Description  2630 non-null   object 
 4   Director     2630 non-null   object 
 5   FullCast     2630 non-null   object 
 6   GenreFull    2630 non-null   object 
 7   AboutFull    2630 non-null   object 
 8   Age          2630 non-null   int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 270.0+ KB


In [103]:
#write the cleaned data to a new CSV file (the Title index is saved as the first column)
cleaned_csv_path = "DF_After_DataCleaning.csv"
df.to_csv(cleaned_csv_path)