In [2]:
import pandas as pd
from typing import List, Dict, Any, Tuple
import os

In [3]:
# --- Data Loading & Familiarization ---
def load_dataset(filepath: str) -> pd.DataFrame:
    """Read the CSV file located at ``filepath`` into a DataFrame.

    Args:
        filepath: Path to the CSV file to load.

    Returns:
        The parsed contents of the file, as produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``.
    """
    if os.path.exists(filepath):
        return pd.read_csv(filepath)
    raise FileNotFoundError(f"Dataset not found at path: {filepath}")

def get_dataset_info(df: pd.DataFrame) -> Dict[str, Any]:
    """Summarize the structure of ``df`` without modifying it.

    Args:
        df: The frame to describe.

    Returns:
        A dict with the keys ``num_rows``, ``num_columns``, ``columns``
        (list of column labels), ``dtypes`` (column label -> dtype) and
        ``nulls`` (column label -> count of null cells).
    """
    row_count, column_count = df.shape
    null_counts = df.isnull().sum()

    summary: Dict[str, Any] = {}
    summary['num_rows'] = row_count
    summary['num_columns'] = column_count
    summary['columns'] = list(df.columns)
    summary['dtypes'] = df.dtypes.to_dict()
    summary['nulls'] = null_counts.to_dict()
    return summary

In [11]:
# --- Functional Cleaning & Preprocessing ---

def remove_incomplete_rows(df: pd.DataFrame, required_cols: List[str]) -> pd.DataFrame:
    """Return a new frame containing only rows with no nulls in ``required_cols``.

    The input frame is not modified. The result always keeps the input's
    columns, even when every row is dropped, so later column lookups such as
    ``result['type']`` still work on an empty result.

    Args:
        df: The frame to filter.
        required_cols: Column labels that must be non-null for a row to be
            kept. An empty list keeps every row.

    Returns:
        The surviving rows with a fresh 0..n-1 index.

    Raises:
        KeyError: If any label in ``required_cols`` is not a column of ``df``.
    """
    # dropna keeps the schema and dtypes; a records round trip would return a
    # column-less frame whenever no row survives.
    complete_rows = df.dropna(subset=list(required_cols))
    # Renumber so the result is indexed 0..n-1 regardless of which rows were dropped.
    return complete_rows.reset_index(drop=True)

def normalize_text_columns(df: pd.DataFrame, text_cols: List[str]) -> pd.DataFrame:
    """Return a copy of ``df`` with the given text columns stripped and lower-cased.

    Only ``str`` cells are transformed; nulls and other non-string values are
    passed through unchanged. Labels in ``text_cols`` that are not columns of
    ``df`` are ignored. The input frame is not modified, and the result keeps
    the input's columns even when ``df`` has no rows.

    Args:
        df: The frame to normalize.
        text_cols: Labels of the columns whose string values are normalized.

    Returns:
        A new frame with a fresh 0..n-1 index.
    """
    def normalize(text: str) -> str:
        return text.strip().lower() if isinstance(text, str) else text

    # reset_index returns a new frame, so the column assignments below never
    # touch the caller's data.
    normalized = df.reset_index(drop=True)
    for col in text_cols:
        if col in normalized.columns:
            # Work column-wise instead of rebuilding every row as a dict; this
            # also keeps the schema intact for an empty frame.
            normalized[col] = normalized[col].map(normalize)
    return normalized

In [12]:
# --- FP Refactoring Example ---

def remove_null_titles(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without the rows whose ``'title'`` is null.

    The input frame is not modified. The result keeps the input's columns even
    when every row is dropped.

    Args:
        df: The frame to filter; must have a ``'title'`` column.

    Returns:
        The rows with a non-null title, with a fresh 0..n-1 index.

    Raises:
        KeyError: If ``df`` has no ``'title'`` column.
    """
    # dropna preserves the schema; building a frame from an empty sequence of
    # records would return a frame with no columns at all.
    return df.dropna(subset=['title']).reset_index(drop=True)

In [13]:
# --- Mini Analysis ---

def get_top_categories(df: pd.DataFrame, col: str, top_n: int = 5) -> List[Tuple[str, int]]:
    """Return the most frequent values of ``df[col]`` with their counts.

    Args:
        df: The frame to analyze.
        col: Label of the column whose values are counted.
        top_n: How many of the leading ``value_counts`` entries to keep.

    Returns:
        A list of ``(value, count)`` pairs in ``value_counts`` order
        (most frequent first). Null values are not counted.
    """
    frequencies = df[col].value_counts()
    leading = frequencies.head(top_n)
    return [(category, count) for category, count in leading.items()]

In [17]:
# --- Example Usage ---
# Demo pipeline: load the CSV, print a structural summary, then run each
# cleaning step in turn and print the intermediate frame after every step.
if __name__ == "__main__":
    file_path = "netflix_titles.csv"
    try:
        raw_df = load_dataset(file_path)

        info = get_dataset_info(raw_df)
        print("Dataset Info:", info)
        # NOTE(review): this prints the entire raw frame under the same
        # "Dataset Info:" label as the summary dict on the previous line.
        print("Dataset Info:", raw_df)
        # Stage 1: keep only rows where both 'title' and 'type' are non-null.
        df_clean = remove_incomplete_rows(raw_df, required_cols=['title', 'type'])
        print("Remove incomplete Dataset Info:", df_clean)
        # Stage 2: strip and lower-case the string values of 'title' and 'type'.
        df_clean = normalize_text_columns(df_clean, text_cols=['title', 'type'])
        print("Normalized Dataset Info:", df_clean)
        # Stage 3: drop rows with a null 'title'.
        # NOTE(review): appears redundant here, since stage 1 already requires
        # a non-null 'title' and stage 2 does not introduce nulls.
        df_clean = remove_null_titles(df_clean)
        print("Remove null Dataset Info:", df_clean)

        # get_top_categories defaults to top_n=5, matching the label below.
        print("Top 5 Types:", get_top_categories(df_clean, 'type'))

    except FileNotFoundError as e:
        # load_dataset raises this when nothing exists at file_path.
        print(e)
        print("Please make sure the dataset file exists in the correct directory.")


Dataset Info: {'num_rows': 5, 'num_columns': 5, 'columns': ['show_id', 'type', 'title', 'director', 'country'], 'dtypes': {'show_id': dtype('O'), 'type': dtype('O'), 'title': dtype('O'), 'director': dtype('O'), 'country': dtype('O')}, 'nulls': {'show_id': 0, 'type': 1, 'title': 0, 'director': 1, 'country': 0}}
Dataset Info:   show_id     type            title             director        country
0      s1    Movie        Inception    Christopher Nolan  United States
1      s2  TV Show     Breaking Bad       Vince Gilligan  United States
2      s3    Movie     The Irishman      Martin Scorsese  United States
3      s4      NaN                                   NaN  United States
4      s5  TV Show  Stranger Things  The Duffer Brothers  United States
Remove incomplete Dataset Info:   show_id     type            title             director        country
0      s1    Movie        Inception    Christopher Nolan  United States
1      s2  TV Show     Breaking Bad       Vince Gilligan  United S

Dataset Info:   show_id     type            title             director        country
0      s1    Movie        Inception    Christopher Nolan  United States
1      s2  TV Show     Breaking Bad       Vince Gilligan  United States
2      s3    Movie     The Irishman      Martin Scorsese  United States
3      s4      NaN                                   NaN  United States
4      s5  TV Show  Stranger Things  The Duffer Brothers  United States
