In [1]:
# Task-1 Data Loading & Familiarization






In [18]:
import pandas as pd
from collections import Counter

# Location of the Netflix titles CSV.
# NOTE(review): this is an absolute path on one developer's machine; point it
# at your own copy of the file before running the notebook.
DATA_PATH = '/home/nashtech/Downloads/archive/netflix_titles.csv'

# Load the dataset
df = pd.read_csv(DATA_PATH)

In [3]:
# Pure helper that summarises a dataset's shape (row count, column count, column names)
def describe_dataset(df: pd.DataFrame) -> dict:
    """Return a small summary of ``df`` without modifying or printing it.

    Args:
        df: The DataFrame to describe.

    Returns:
        A dict with three keys:
        ``"rows"`` (number of rows), ``"column"`` (number of columns) and
        ``"column_names"`` (the column labels as a list).
    """
    row_count, column_count = df.shape

    summary = {}
    summary["rows"] = row_count
    summary["column"] = column_count
    summary["column_names"] = df.columns.tolist()
    return summary

# Summarise the loaded dataset and print the resulting dict.
info = describe_dataset(df)
print(info)
    

{'rows': 8807, 'column': 12, 'column_names': ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description']}


In [4]:
# Task-2  Functional Cleaning & Preprocessing





In [5]:
def remove_incomplete_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` restricted to rows with all essential fields set.

    A row is kept only when ``title``, ``type`` and ``date_added`` are all
    non-null. The input frame is left untouched and the original index labels
    of the surviving rows are preserved.

    Args:
        df: Frame containing at least the three essential columns.

    Returns:
        A new, independent DataFrame holding the complete rows.
    """
    required_fields = ['title', 'type', 'date_added']
    # True for a row only if every required field is present in it.
    has_all_required = df[required_fields].notna().all(axis=1)
    return df.loc[has_all_required].copy()

In [6]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with its text values tidied up.

    Every string value in an object-dtype column has surrounding whitespace
    removed. In addition, ``type`` is lower-cased and ``title`` is stripped
    and lower-cased. Non-string values (e.g. NaN) pass through unchanged, and
    the input frame itself is not modified.

    Args:
        df: Frame containing at least the ``type`` and ``title`` columns.

    Returns:
        A new DataFrame with the normalized values.
    """
    def strip_text(value):
        # Only strings are trimmed; anything else is returned as-is.
        if isinstance(value, str):
            return value.strip()
        return value

    def lower_text(value):
        if isinstance(value, str):
            return value.lower()
        return value

    def strip_and_lower_text(value):
        if isinstance(value, str):
            return value.strip().lower()
        return value

    cleaned_df = df.copy()

    # Trim whitespace in every object-dtype column
    object_columns = cleaned_df.select_dtypes(include='object').columns
    for col in object_columns:
        cleaned_df[col] = cleaned_df[col].map(strip_text)

    # Column-specific case normalization
    cleaned_df['type'] = cleaned_df['type'].map(lower_text)
    cleaned_df['title'] = cleaned_df['title'].map(strip_and_lower_text)

    return cleaned_df


In [7]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Run the cleaning pipeline: drop incomplete rows, then normalize columns.

    Args:
        df: The raw dataset.

    Returns:
        The result of ``normalize_columns`` applied to the output of
        ``remove_incomplete_rows``.
    """
    return df.pipe(remove_incomplete_rows).pipe(normalize_columns)


In [8]:
# Run the full preprocessing pipeline on the raw frame and print the result.
print(preprocess_dataset(df))

     show_id     type                  title         director  \
0         s1    movie   dick johnson is dead  Kirsten Johnson   
1         s2  tv show          blood & water              NaN   
2         s3  tv show              ganglands  Julien Leclercq   
3         s4  tv show  jailbirds new orleans              NaN   
4         s5  tv show           kota factory              NaN   
...      ...      ...                    ...              ...   
8802   s8803    movie                 zodiac    David Fincher   
8803   s8804  tv show            zombie dumb              NaN   
8804   s8805    movie             zombieland  Ruben Fleischer   
8805   s8806    movie                   zoom     Peter Hewitt   
8806   s8807    movie                 zubaan      Mozez Singh   

                                                   cast        country  \
0                                                   NaN  United States   
1     Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa 

In [9]:
# Task-3 Code Refactoring with FP Principles





In [10]:
# Below is the transformation used to select only the TV shows from the DataFrame.

# The FP version is more readable and easier to understand because the condition lives in its own function,
# which can be reused wherever it is needed; this also makes the code modular.

In [11]:
# Procedural snippet: an explicit loop that collects the TV-show rows that have a title.
# NOTE: the list built here is reassigned by the FP version further down in this
# cell, so this loop only serves as the baseline for the comparison.

# Create an empty list to collect valid rows
filtered_rows = []

# Loop through each row using iterrows()
for _, row in df.iterrows():
    if row['type'] == 'TV Show' and pd.notnull(row['title']):
        filtered_rows.append(row)

# -------------------------------------------------------------------------------------------------------------------------

# FP version: the row condition as a standalone predicate
def is_valid_tv_show(row):
    """Tell whether ``row`` is a TV show that has a title.

    Args:
        row: A mapping-like record with ``'type'`` and ``'title'`` entries.

    Returns:
        A falsy value when ``row['type']`` is not exactly ``'TV Show'``;
        otherwise whether ``row['title']`` is non-null.
    """
    type_matches = row['type'] == 'TV Show'
    if not type_matches:
        return type_matches
    return pd.notnull(row['title'])

# Keep only the records accepted by the predicate
filtered_rows = [
    record
    for record in df.to_dict('records')
    if is_valid_tv_show(record)
]

# Rebuild a DataFrame from the surviving records
filtered_df = pd.DataFrame(filtered_rows)

print(filtered_df)

     show_id     type                  title         director  \
0         s2  TV Show          Blood & Water              NaN   
1         s3  TV Show              Ganglands  Julien Leclercq   
2         s4  TV Show  Jailbirds New Orleans              NaN   
3         s5  TV Show           Kota Factory              NaN   
4         s6  TV Show          Midnight Mass    Mike Flanagan   
...      ...      ...                    ...              ...   
2671   s8796  TV Show        Yu-Gi-Oh! Arc-V              NaN   
2672   s8797  TV Show             Yunus Emre              NaN   
2673   s8798  TV Show              Zak Storm              NaN   
2674   s8801  TV Show     Zindagi Gulzar Hai              NaN   
2675   s8804  TV Show            Zombie Dumb              NaN   

                                                   cast  \
0     Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   
1     Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...   
2                                         

In [12]:
# Task - 4 (optional)  Mini Analysis
# - Total TV Shows
# - Avg. cast members
# - Top 5 genres



In [None]:
# Total TV shows: the number of rows in filtered_df, the frame built from the filtered TV-show records.
total_tv_shows = len(filtered_df)


In [16]:
# Use map to count the cast members of every TV show. The 'cast' field is a
# comma-separated string; a show with no cast listed contributes 0, so it still
# counts towards the denominator of the average.
cast_counts = list(
    map(lambda row: len(row['cast'].split(','))
        if pd.notnull(row['cast']) else 0,
        filtered_df.to_dict('records'))
)

# Guard the division: with no TV shows selected, report 0.0 instead of raising
# ZeroDivisionError.
average_cast = sum(cast_counts) / len(cast_counts) if cast_counts else 0.0


In [19]:


# Split the comma-separated 'listed_in' value of every TV show that has one
split_listings = (
    row['listed_in'].split(',')
    for row in filtered_df.to_dict('records')
    if pd.notnull(row['listed_in'])
)

# Flatten the per-show lists into one list of trimmed genre labels
all_genres = [genre.strip() for genres in split_listings for genre in genres]

# Tally the labels and keep the five most frequent ones
genre_counts = Counter(all_genres)
most_common_genres = genre_counts.most_common(5)


In [17]:
# Show the average number of cast members per TV show.
print(average_cast)

7.343423019431988
