Here I do cleaning of Netflix dataset

In [1]:
import pandas as pd
import numpy as np
import os

os.chdir('C:/Users/nikit/data_analysis/netflix/datasets')

with open('netflix1.csv','r', encoding='utf8') as file:
    netflix = pd.read_csv(file, index_col='show_id')

Some funcs to make my life easier

In [2]:
def get_null(df: pd.DataFrame) -> pd.DataFrame:
    null_mask = df.isin(['Not Given']).any(axis=1)
    null_frame = df[null_mask]
    df_not_null = df[null_mask == False]
    
    return null_frame, df_not_null

def print_missing(df: pd.DataFrame) -> None:
    print('Missing values:')
    print(df.isin(['Not Given']).sum())
    print('Total missing values:', df.isin(['Not Given']).sum().sum())

Let's see how many rows are missing

In [3]:
print('Initial nulls:')
print_missing(netflix)

Initial nulls:
Missing values:
type               0
title              0
director        2588
country          287
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
dtype: int64
Total missing values: 2875


That's around 30% so I can't drop them right now. I'll try to treat them. 

- As first step, I will look for similarities in titles (in first two words)
- Then, I will treat missing countries firtsly by similarities in type, director, and rating and secondly by director only

In [5]:
null_frame, netflix_not_null = get_null(netflix)
split_title = netflix_not_null['title'].str.replace(r'(\S+)\s(\S+).*', r'\1 \2', regex=True)
split_title = pd.DataFrame(split_title, columns=['title'])
split_title.index.name = 'show_id'

We got first two words of each title in dataset

In [6]:
for id in null_frame.index.tolist():
    row = null_frame.loc[id]
    title = row['title'].split(' ')[:2]
    title = ' '.join(title)
    rep_id = split_title[split_title['title'] == title]
    if not rep_id.empty:
        rep_id = rep_id.index[0]
        rep_row = netflix_not_null.loc[rep_id]
        if row['director'] == 'Not Given':
            netflix.loc[id, 'director'] = rep_row['director']
        if row['country'] == 'Not Given':
            netflix.loc[id, 'country'] = rep_row['country']

In [7]:
print('After treatment by similarities in titles:')
print_missing(netflix)

After treatment by similarities in titles:
Missing values:
type               0
title              0
director        2301
country          235
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
dtype: int64
Total missing values: 2536


We were able to treat about 300 missing directors and 50 missing countries

In [10]:
null_frame, netflix_not_null = get_null(netflix)

# Treatment by other similarities
for id in null_frame.index.tolist():
    row = null_frame.loc[id]

    if row['country'] == 'Not Given':
        similar = netflix_not_null[(netflix_not_null['type'] == row['type'])&
                                    (netflix_not_null['director'] == row['director'])]
        print(similar)
        if not similar.empty:
            suggest = similar['country'].value_counts().idxmax()
            netflix.loc[id, 'country'] = suggest

print('After treatment by other similarities:')
print_missing(netflix)

Empty DataFrame
Columns: [type, title, director, country, date_added, release_year, rating, duration, listed_in]
Index: []
Empty DataFrame
Columns: [type, title, director, country, date_added, release_year, rating, duration, listed_in]
Index: []
Empty DataFrame
Columns: [type, title, director, country, date_added, release_year, rating, duration, listed_in]
Index: []
Empty DataFrame
Columns: [type, title, director, country, date_added, release_year, rating, duration, listed_in]
Index: []
Empty DataFrame
Columns: [type, title, director, country, date_added, release_year, rating, duration, listed_in]
Index: []
Empty DataFrame
Columns: [type, title, director, country, date_added, release_year, rating, duration, listed_in]
Index: []
Empty DataFrame
Columns: [type, title, director, country, date_added, release_year, rating, duration, listed_in]
Index: []
Empty DataFrame
Columns: [type, title, director, country, date_added, release_year, rating, duration, listed_in]
Index: []
Empty DataFrame
