# DS637 — Homework 5 (msleep)
**Student:** Umair Ali

Tasks:
1. Load `msleep.csv` into a DataFrame.
2. Split into **good** (0 NaN), **bad** (exactly 1 NaN), **ugly** (2+ NaN) per row.
3. Fill NaN in **bad** using column mean (numeric) or mode (categorical).
4. On **good**, convert `order` to dummies with prefix `order_`.
5. On **good**, cut `bodywt` into 10 bins and show counts.
6. On **good**, cap `bodywt` at max 100.
7. On filled **bad**, cut `bodywt` into 10 bins and show counts.


## 1) Load data

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Relative path: msleep.csv is expected in the same folder as this notebook.
data_path = Path('msleep.csv')
df = pd.read_csv(data_path)

# Sanity check: report the dimensions, then preview the first five rows.
print('Shape:', df.shape)
display(df.iloc[:5])

Shape: (83, 11)


Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
0,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,,50.0
1,Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155,0.48
2,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,,1.35
3,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.133333,9.1,0.00029,0.019
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0


## 2) Split into good / bad / ugly based on NaN count per row

In [2]:
# Count the missing values in each row (summing across the columns).
nan_per_row = df.isna().sum(axis=1)

# Partition the rows by their missing-value count. Each subset is copied so
# that later edits to it cannot write back into df.
good = df[nan_per_row.eq(0)].copy()
bad = df[nan_per_row.eq(1)].copy()
ugly = df[nan_per_row.ge(2)].copy()

print('good rows (0 NaN):', good.shape[0])
print('bad rows  (1 NaN):', bad.shape[0])
print('ugly rows (2+ NaN):', ugly.shape[0])

# Preview the first rows of each subset, in good / bad / ugly order.
display(good.head(), bad.head(), ugly.head())

good rows (0 NaN): 20
bad rows  (1 NaN): 19
ugly rows (2+ NaN): 44


Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
3,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.133333,9.1,0.00029,0.019
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0
8,Dog,Canis,carni,Carnivora,domesticated,10.1,2.9,0.333333,13.9,0.07,14.0
11,Guinea pig,Cavis,herbi,Rodentia,domesticated,9.4,0.8,0.216667,14.6,0.0055,0.728
13,Chinchilla,Chinchilla,herbi,Rodentia,domesticated,12.5,1.5,0.116667,11.5,0.0064,0.42


Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
6,Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7,1.4,0.383333,15.3,,20.49
10,Goat,Capri,herbi,Artiodactyla,lc,5.3,0.6,,18.7,0.115,33.5
14,Star-nosed mole,Condylura,omni,Soricomorpha,lc,10.3,2.2,,13.7,0.001,0.06
18,Tree hyrax,Dendrohyrax,herbi,Hyracoidea,lc,5.3,0.5,,18.7,0.0123,2.95
23,Donkey,Equus,herbi,Perissodactyla,domesticated,3.1,0.4,,20.9,0.419,187.0


Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
0,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,,50.0
1,Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155,0.48
2,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,,1.35
5,Three-toed sloth,Bradypus,herbi,Pilosa,,14.4,2.2,0.766667,9.6,,3.85
7,Vesper mouse,Calomys,,Rodentia,,7.0,,,17.0,,0.045


## 3) Fill NaN in the *bad* dataframe (mean for numeric, mode for categorical; both computed over the full dataset)

In [3]:
# Build one replacement value per column from the full dataset `df`:
# the mean for numeric columns, the most frequent value for all others.
fill_values = {}
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_values[col] = df[col].mean(skipna=True)
    else:
        modes = df[col].mode(dropna=True)
        # A column with no non-missing values has no mode. Leave it out of the
        # mapping: None is not a valid fill value and would make fillna fail.
        if not modes.empty:
            # mode() can return several tied values; take the first one.
            fill_values[col] = modes.iloc[0]

# fillna returns a new frame, so `bad` itself is left untouched.
filled_bad = bad.fillna(value=fill_values)

print('NaNs remaining in filled_bad:', int(filled_bad.isna().sum().sum()))
display(filled_bad.head())

NaNs remaining in filled_bad: 0


Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
6,Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7,1.4,0.383333,15.3,0.281581,20.49
10,Goat,Capri,herbi,Artiodactyla,lc,5.3,0.6,0.439583,18.7,0.115,33.5
14,Star-nosed mole,Condylura,omni,Soricomorpha,lc,10.3,2.2,0.439583,13.7,0.001,0.06
18,Tree hyrax,Dendrohyrax,herbi,Hyracoidea,lc,5.3,0.5,0.439583,18.7,0.0123,2.95
23,Donkey,Equus,herbi,Perissodactyla,domesticated,3.1,0.4,0.439583,20.9,0.419,187.0


## 4) On *good*, convert column `order` into dummies with prefix `order_`

In [4]:
# One indicator column per distinct `order` value, named order_<value>.
order_dummies = pd.get_dummies(good['order'], prefix='order')

# Replace the original `order` column with its indicator columns, appended
# after the remaining columns. `good` itself is not modified.
good_dummies = good.drop(columns=['order']).join(order_dummies)

print('Original good shape:', good.shape)
print('After dummies shape:', good_dummies.shape)
display(good_dummies.head())

Original good shape: (20, 11)
After dummies shape: (20, 20)


Unnamed: 0,name,genus,vore,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,order_Artiodactyla,order_Carnivora,order_Chiroptera,order_Cingulata,order_Didelphimorphia,order_Erinaceomorpha,order_Lagomorpha,order_Perissodactyla,order_Rodentia,order_Soricomorpha
3,Greater short-tailed shrew,Blarina,omni,lc,14.9,2.3,0.133333,9.1,0.00029,0.019,False,False,False,False,False,False,False,False,False,True
4,Cow,Bos,herbi,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0,True,False,False,False,False,False,False,False,False,False
8,Dog,Canis,carni,domesticated,10.1,2.9,0.333333,13.9,0.07,14.0,False,True,False,False,False,False,False,False,False,False
11,Guinea pig,Cavis,herbi,domesticated,9.4,0.8,0.216667,14.6,0.0055,0.728,False,False,False,False,False,False,False,False,True,False
13,Chinchilla,Chinchilla,herbi,domesticated,12.5,1.5,0.116667,11.5,0.0064,0.42,False,False,False,False,False,False,False,False,True,False


## 5) On *good*, cut `bodywt` into 10 bins and return counts

In [5]:
# Coerce bodywt to numeric; entries that cannot be parsed become NaN.
good_bodywt = good['bodywt'].pipe(pd.to_numeric, errors='coerce')

# Split the observed bodywt range into ten equal-width intervals.
bins_good = pd.cut(x=good_bodywt, bins=10)

# Rows per interval, listed in interval order rather than by frequency.
counts_good = bins_good.value_counts(sort=False).sort_index()

display(counts_good)

bodywt
(-0.595, 60.004]      16
(60.004, 120.004]      1
(120.004, 180.003]     0
(180.003, 240.003]     1
(240.003, 300.002]     0
(300.002, 360.002]     0
(360.002, 420.001]     0
(420.001, 480.001]     0
(480.001, 540.0]       1
(540.0, 600.0]         1
Name: count, dtype: int64

## 6) On *good*, cap `bodywt` to 100 max

In [6]:
# Numeric view of bodywt, reused for both the cap and the "before" maximum.
bodywt_numeric = pd.to_numeric(good['bodywt'], errors='coerce')

# New frame whose bodywt is limited to at most 100; `good` is left unchanged.
good_capped = good.assign(bodywt=bodywt_numeric.clip(upper=100))

print('Max bodywt before cap:', float(bodywt_numeric.max()))
print('Max bodywt after  cap:', float(good_capped['bodywt'].max()))
display(good_capped[['bodywt']].describe())

Max bodywt before cap: 600.0
Max bodywt after  cap: 100.0


Unnamed: 0,bodywt
count,20.0
mean,20.69265
std,39.138795
min,0.005
25%,0.0945
50%,0.749
75%,6.125
max,100.0


## 7) On filled *bad*, cut `bodywt` into 10 bins and return counts

In [7]:
# Coerce bodywt of the filled "bad" rows to numeric; unparseable entries become NaN.
filled_bad_bodywt = filled_bad['bodywt'].pipe(pd.to_numeric, errors='coerce')

# Ten equal-width intervals spanning the observed bodywt range of filled_bad.
bins_bad = pd.cut(x=filled_bad_bodywt, bins=10)

# Rows per interval, ordered by interval rather than by count.
counts_bad = bins_bad.value_counts(sort=False).sort_index()

display(counts_bad)

bodywt
(-0.177, 18.709]      10
(18.709, 37.408]       3
(37.408, 56.107]       2
(56.107, 74.806]       2
(74.806, 93.505]       1
(93.505, 112.204]      0
(112.204, 130.903]     0
(130.903, 149.602]     0
(149.602, 168.301]     0
(168.301, 187.0]       1
Name: count, dtype: int64

## Summary

In [8]:
# One summary row per dataset: its row count and its total number of missing cells.
labelled_frames = [
    ('good', good),
    ('bad', bad),
    ('ugly', ugly),
    ('filled_bad', filled_bad),
]
summary = pd.DataFrame(
    [
        {
            'dataset': label,
            'rows': len(frame),
            'total_NaNs': int(frame.isna().sum().sum()),
        }
        for label, frame in labelled_frames
    ]
)
display(summary)

Unnamed: 0,dataset,rows,total_NaNs
0,good,20,0
1,bad,19,19
2,ugly,44,117
3,filled_bad,19,0
