In [7]:
import pandas as pd

In [8]:
data = {
    'Name': ['Ali', 'Sara', 'John', None, 'Mary'],
    'Age': [25, None, 30, 22, None],
    'City': ['KL', 'Penang', None, 'Johor', 'KL']
}

In [11]:
df=pd.DataFrame(data)
print('Original DataFrame:')
print(df)

Original DataFrame:
   Name   Age    City
0   Ali  25.0      KL
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


## check the missing values Row by Row

In [12]:
for index, row in df.iterrows(): 
    print(f"Row {index}: Missing = {row.isnull().any()}, Details = {row.isnull().to_dict()}")

Row 0: Missing = False, Details = {'Name': False, 'Age': False, 'City': False}
Row 1: Missing = True, Details = {'Name': False, 'Age': True, 'City': False}
Row 2: Missing = True, Details = {'Name': False, 'Age': False, 'City': True}
Row 3: Missing = True, Details = {'Name': True, 'Age': False, 'City': False}
Row 4: Missing = True, Details = {'Name': False, 'Age': True, 'City': False}


In [13]:
# Show only rows with missing data
missing_rows = df[df.isnull().any(axis=1)]
print('Rows with missing data')
print(missing_rows)

Rows with missing data
   Name   Age    City
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [15]:
#simulate rows that would be dropped using dropna()
to_drop = df[df.isnull().any(axis=1)]
print('These rows would be dropped using dropna():')
print(to_drop)

These rows would be dropped using dropna():
   Name   Age    City
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [16]:
df_cleaned = df.dropna()
print('\nAfter dropna():')
print(df_cleaned)


After dropna():
  Name   Age City
0  Ali  25.0   KL


In [17]:
print('Before fillna():')
print(df)

Before fillna():
   Name   Age    City
0   Ali  25.0      KL
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [20]:
df_filled = df.fillna({
    'Name': 'Unknown',
    'Age': df['Age'].mean(),
    'City': 'Not available'
})


In [21]:
print('\nAfter fillna():')
print(df_filled)


After fillna():
      Name        Age           City
0      Ali  25.000000             KL
1     Sara  25.666667         Penang
2     John  30.000000  Not available
3  Unknown  22.000000          Johor
4     Mary  25.666667             KL


In [23]:
df_sfill = df.fillna({
    'Name': 'Unknown',
    'Age': round(df['Age'].mean(), 2),
    'City': 'Not available'
})


In [24]:
print('\nAfter fillna(): age upto to decimal digit')
print(df_sfill)


After fillna(): age upto to decimal digit
      Name    Age           City
0      Ali  25.00             KL
1     Sara  25.67         Penang
2     John  30.00  Not available
3  Unknown  22.00          Johor
4     Mary  25.67             KL


In [25]:
import math

In [29]:
df_new = df.fillna({
    'Name': 'Unknown',
    'Age': math.floor(df['Age'].mean()),   # floor() rounds down the average
    'City': 'Not available'
})


In [30]:
print('\nAfter fillna(): floor without decimal')
print(df_new)


After fillna(): floor without decimal
      Name   Age           City
0      Ali  25.0             KL
1     Sara  25.0         Penang
2     John  30.0  Not available
3  Unknown  22.0          Johor
4     Mary  25.0             KL


## Data Cleaning Exercise
## Step 1:Use day 15 students_performance_dirty.csv, downloaf from github (if don't have)
## Step 2: Get basic information about dataset
## Step 3: print('Missing values per column:')
## Step 4: Check missing data line by line
## Step 5: Drop missing rows (if any)
## Step 6: Compare before and after
## Step 7: Fill missing values
## Step 8: Compare before and after


In [32]:
import pandas as pd

df = pd.read_csv("C:/Users/Asus/Desktop/aimltraining/Day 15/students_performance_dirty.csv")


In [33]:
# Step 2: Get basic information about dataset
print("Basic Information:")
print(df.info())
print("\n")

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          52 non-null     object 
 1   study_hours     52 non-null     float64
 2   attendance_pct  60 non-null     float64
 3   math_score      60 non-null     float64
 4   reading_score   60 non-null     float64
 5   final_score     60 non-null     float64
dtypes: float64(5), object(1)
memory usage: 2.9+ KB
None




In [34]:
# Step 3: Check missing values per column
print("Missing values per column:")
print(df.isnull().sum())
print("\n")

Missing values per column:
gender            8
study_hours       8
attendance_pct    0
math_score        0
reading_score     0
final_score       0
dtype: int64




In [35]:
# Step 4: Check missing data line by line
print("Rows with missing data:")
print(df[df.isnull().any(axis=1)])
print("\n")

Rows with missing data:
    gender  study_hours  attendance_pct  math_score  reading_score  \
1      NaN          2.6            91.2        82.0           60.0   
6     male          NaN            87.1        95.0           78.0   
7    femle          NaN            70.6        59.0           70.0   
11     NaN          4.6            86.2        63.0           78.0   
12   femle          NaN            95.0        59.0           92.0   
13    male          NaN            86.5        61.0           50.0   
20  female          NaN            93.5        65.0           54.0   
28  female          NaN            71.7        65.0           53.0   
33     NaN          NaN            96.0        39.0           96.0   
34     NaN          3.6            91.3        64.0           62.0   
43     NaN          3.9            76.5        78.0           73.0   
50     NaN          3.0            84.9        78.0           85.0   
52     NaN          3.9            91.1        60.0           68.0

In [36]:
# Step 5: Drop missing rows (if any)
df_cleaned = df.dropna()


In [37]:
# Step 6: Compare before and after
print("Before cleaning: ", df.shape)
print("After cleaning: ", df_cleaned.shape)

Before cleaning:  (60, 6)
After cleaning:  (45, 6)


In [39]:
#Step 7: Fill missing values (example approach)
df_fill = df.fillna({
    'gender': 'Unknown',
    'study_hours': df['study_hours'].mean()
})

In [40]:
# Step 8: Compare before and after filling
print("\n=== Missing values after fillna() ===")
print(df_fill.isnull().sum())

print("\n=== Before filling ===")
print(df.isnull().sum())

print("\n=== After filling ===")
print(df_fill.isnull().sum())


=== Missing values after fillna() ===
gender            0
study_hours       0
attendance_pct    0
math_score        0
reading_score     0
final_score       0
dtype: int64

=== Before filling ===
gender            8
study_hours       8
attendance_pct    0
math_score        0
reading_score     0
final_score       0
dtype: int64

=== After filling ===
gender            0
study_hours       0
attendance_pct    0
math_score        0
reading_score     0
final_score       0
dtype: int64


In [43]:
#step 8 compare before and after
# compare each row before and after fillna()
for i in range(len(df)):
    print(f"\nRow {i} Before: {df.iloc[i].to_dict()}")
    print(f"Row {i} After: {df_filled.iloc[i].to_dict()}")


Row 0 Before: {'gender': 'male', 'study_hours': 1.7, 'attendance_pct': 83.6, 'math_score': 62.0, 'reading_score': 91.0, 'final_score': 38.9}
Row 0 After: {'Name': 'Ali', 'Age': 25.0, 'City': 'KL'}

Row 1 Before: {'gender': nan, 'study_hours': 2.6, 'attendance_pct': 91.2, 'math_score': 82.0, 'reading_score': 60.0, 'final_score': 37.5}
Row 1 After: {'Name': 'Sara', 'Age': 25.666666666666668, 'City': 'Penang'}

Row 2 Before: {'gender': 'female', 'study_hours': 2.9, 'attendance_pct': 97.5, 'math_score': 69.0, 'reading_score': 57.0, 'final_score': 35.0}
Row 2 After: {'Name': 'John', 'Age': 30.0, 'City': 'Not available'}

Row 3 Before: {'gender': 'female', 'study_hours': 4.8, 'attendance_pct': 85.7, 'math_score': 78.0, 'reading_score': 62.0, 'final_score': 36.5}
Row 3 After: {'Name': 'Unknown', 'Age': 22.0, 'City': 'Johor'}

Row 4 Before: {'gender': 'male', 'study_hours': 3.9, 'attendance_pct': -10.0, 'math_score': 64.0, 'reading_score': 95.0, 'final_score': 30.9}
Row 4 After: {'Name': 'Mar

IndexError: single positional indexer is out-of-bounds