# Data Cleansing
    The following clean-up tasks are done:
        - rename columns
        - drop the unneeded column 'director_name'
        - handle null (NaN) data for 'duration'
        - handle duplicated rows
        - modify the data in columns 'title_year' and 'movie_title'

#### Loading libraries & dataset

In [1]:
import pandas as pd
# Load the raw IMDB export and preview its first three rows
df_imdb = pd.read_csv(filepath_or_buffer='../../datasets/imdb_movie_cleanup.csv')
df_imdb.head(n=3)

Unnamed: 0,Movie title,director name,Duration,title Year
0,Avatar,James Cameron,178.0,2009.0
1,Pirates of the Caribbean: At World's End,Gore Verbinski,169.0,2007.0
2,Spectre,Sam Mendes,148.0,2015.0


### Rename columns

In [2]:
# normalise every header: lower-case it and swap spaces for underscores
df_imdb.columns = [label.lower().replace(' ', '_') for label in df_imdb.columns]
df_imdb.head(n=3)

Unnamed: 0,movie_title,director_name,duration,title_year
0,Avatar,James Cameron,178.0,2009.0
1,Pirates of the Caribbean: At World's End,Gore Verbinski,169.0,2007.0
2,Spectre,Sam Mendes,148.0,2015.0


### Drop columns
    - axis 1 means column
    - axis 0 means row

In [3]:
# 'director_name' is not needed for the rest of the clean-up
drop_columns = ['director_name']
df_imdb.drop(columns=drop_columns, inplace=True)
df_imdb.head(n=3)

Unnamed: 0,movie_title,duration,title_year
0,Avatar,178.0,2009.0
1,Pirates of the Caribbean: At World's End,169.0,2007.0
2,Spectre,148.0,2015.0


### Handle missing data
    - Missing data means not having any value at a specific cell inside dataset.
    - To detect missing data, pandas provides two functions, isnull() and isna(), which are identical.

In [4]:
# preview the first five rows (head() defaults to n=5)
df_imdb.head(n=5)

Unnamed: 0,movie_title,duration,title_year
0,Avatar,178.0,2009.0
1,Pirates of the Caribbean: At World's End,169.0,2007.0
2,Spectre,148.0,2015.0
3,The Dark Knight Rises,164.0,2012.0
4,Star Wars: Episode VII - The Force Awakens ...,,


#### detect and count missing data

In [5]:
# True marks a missing cell; only the first five rows are shown
df_imdb.head().isnull()

Unnamed: 0,movie_title,duration,title_year
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,True,True


#### sum() on the True/False table returns the number of nulls per column

In [6]:
# column-wise total of the True flags, i.e. missing cells per column
df_imdb.isnull().sum(axis=0)

movie_title      0
duration        15
title_year     108
dtype: int64

#### recheck the rows with missing data by filtering for null values

In [7]:
# boolean mask selects only the rows whose 'duration' is missing
df_imdb.loc[df_imdb['duration'].isnull()]

Unnamed: 0,movie_title,duration,title_year
4,Star Wars: Episode VII - The Force Awakens ...,,
199,Harry Potter and the Deathly Hallows: Part II,,2011.0
206,Harry Potter and the Deathly Hallows: Part I,,2010.0
1510,Black Water Transit,,2009.0
3604,War & Peace,,
3815,Should've Been Romeo,,2012.0
3834,Barfi,,2013.0
4299,Hum To Mohabbat Karega,,2000.0
4392,N-Secure,,2010.0
4397,Dil Jo Bhi Kahey...,,2005.0


### Method 1 : remove the rows with missing data

use notnull()

In [8]:
# keep only the rows where 'duration' holds a value
df_imdb = df_imdb.loc[df_imdb['duration'].notnull()]

or use isnull() == False

In [9]:
# same filter written as an explicit comparison with False
# (a no-op here if the notnull() filter above has already run)
df_imdb = df_imdb.loc[df_imdb['duration'].isnull() == False]

recheck after removing the rows with missing 'duration'

In [10]:
# per-column count of cells that are still missing
df_imdb.isnull().sum(axis=0)

movie_title      0
duration         0
title_year     105
dtype: int64

### Method 2 : fill missing data
    - calculate the mean
    - df_imdb.mean()
    - fill null value with mean


In [11]:
# Fill the remaining gaps with the mean of each numeric column.
# numeric_only=True keeps mean() away from the text column 'movie_title'
# (pandas >= 2.0 raises TypeError when asked to average strings).
# The result is assigned back instead of using inplace=True, because df_imdb
# was produced by a boolean filter and in-place edits on such a frame can
# trigger SettingWithCopyWarning.
df_imdb = df_imdb.fillna(df_imdb.mean(numeric_only=True))
df_imdb.head()

Unnamed: 0,movie_title,duration,title_year
0,Avatar,178.0,2009.0
1,Pirates of the Caribbean: At World's End,169.0,2007.0
2,Spectre,148.0,2015.0
3,The Dark Knight Rises,164.0,2012.0
5,John Carter,132.0,2012.0


### Handle duplicate data

In [12]:
# flag rows that repeat an earlier row
# subset=None compares all columns; keep='first' (the default) leaves the first occurrence unflagged
df_imdb.duplicated(subset=None, keep='first').head(n=3)

0    False
1    False
2    False
dtype: bool

In [13]:
# each True is one duplicate row, so the sum is the duplicate count
df_imdb.duplicated(subset=None, keep='first').sum()

124

In [14]:
# keep=False flags every member of a duplicate group, so the copies
# line up next to each other once sorted by title
df_duplicated = df_imdb.loc[df_imdb.duplicated(keep=False)].sort_values(by='movie_title')
df_duplicated

Unnamed: 0,movie_title,duration,title_year
3711,"20,000 Leagues Under the Sea",127.0,1954.0
4894,"20,000 Leagues Under the Sea",127.0,1954.0
4950,A Dog's Breakfast,88.0,2007.0
4949,A Dog's Breakfast,88.0,2007.0
1420,A Nightmare on Elm Street,101.0,1984.0
...,...,...,...
1305,Victor Frankenstein,110.0,2015.0
1146,Victor Frankenstein,110.0,2015.0
2099,Victor Frankenstein,110.0,2015.0
2169,Wicker Park,114.0,2004.0


In [15]:
# drop the repeated rows in place, keeping the first occurrence of each
# subset=None means duplicates are checked across all columns
df_imdb.drop_duplicates(subset=None, keep='first', inplace=True)
# confirm that no duplicates remain
df_imdb.duplicated(subset=None, keep='first').sum()

0

### Modify column data

In [16]:
# remove the trailing .0 from the values in column 'title_year' by converting them to int
# note: a NaN cannot be converted with int(), so missing values need separate handling
import math
def cleanup_title_year(title_year):
    """Convert a float year such as 2009.0 to the integer 2009.

    Parameters
    ----------
    title_year : float or None
        A single release-year value; may be missing (NaN, None or pd.NA).

    Returns
    -------
    int or pandas.NA
        The year truncated to an int, or ``pd.NA`` when the value is missing.
    """
    # int(math.nan) raises ValueError, so a missing value is mapped to pd.NA
    # instead of being pushed through int().
    if pd.isna(title_year):
        return pd.NA
    return int(title_year)
# run the converter over every value and store the result back in the column
df_imdb['title_year'] = df_imdb['title_year'].apply(cleanup_title_year)
df_imdb.head()

Unnamed: 0,movie_title,duration,title_year
0,Avatar,178.0,2009
1,Pirates of the Caribbean: At World's End,169.0,2007
2,Spectre,148.0,2015
3,The Dark Knight Rises,164.0,2012
5,John Carter,132.0,2012
