In [1]:
import numpy as np
import pandas as pd

In [15]:
data = {
    'FamilyID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'ParentName': ['Smith', 'Johnson', 'Williams', None, 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor'],
    'NumChildren': [2, np.nan, 1, 4, 2, 0, np.nan, 2, 1, 5],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', None, 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose']
}
df=pd.DataFrame(data)
df

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2.0,New York
1,2,Johnson,,Los Angeles
2,3,Williams,1.0,Chicago
3,4,,4.0,Houston
4,5,Brown,2.0,
5,6,Davis,0.0,Philadelphia
6,7,Miller,,San Antonio
7,8,Wilson,2.0,San Diego
8,9,Moore,1.0,Dallas
9,10,Taylor,5.0,San Jose


#### Identifying missing values in dataframe


In [16]:
missing_values=df.isnull()
missing_values


Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,False,False,False,False
1,False,False,True,False
2,False,False,False,False
3,False,True,False,False
4,False,False,False,True
5,False,False,False,False
6,False,False,True,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [17]:
##calculating how many
missing_values.sum()

FamilyID       0
ParentName     1
NumChildren    2
City           1
dtype: int64

In [19]:
#check if there is any

df.isnull().values.any()

True

#### Drop rows with any missing values.

In [20]:
df_dropped=df.dropna()
df_dropped

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2.0,New York
2,3,Williams,1.0,Chicago
5,6,Davis,0.0,Philadelphia
7,8,Wilson,2.0,San Diego
8,9,Moore,1.0,Dallas
9,10,Taylor,5.0,San Jose


#### Drop columns with any missing values.

In [21]:
df_cleaned= df.dropna(axis=1)
df_cleaned

Unnamed: 0,FamilyID
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


#### Fill missing values with a specific value..


In [22]:
filled_value='Unknown'
df_filled=df.fillna(filled_value)
df_filled

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2.0,New York
1,2,Johnson,Unknown,Los Angeles
2,3,Williams,1.0,Chicago
3,4,Unknown,4.0,Houston
4,5,Brown,2.0,Unknown
5,6,Davis,0.0,Philadelphia
6,7,Miller,Unknown,San Antonio
7,8,Wilson,2.0,San Diego
8,9,Moore,1.0,Dallas
9,10,Taylor,5.0,San Jose


#### Fill missing values using forward fill and backward fill methods.

In [23]:
#Forward fill
df.ffill()

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2.0,New York
1,2,Johnson,2.0,Los Angeles
2,3,Williams,1.0,Chicago
3,4,Williams,4.0,Houston
4,5,Brown,2.0,Houston
5,6,Davis,0.0,Philadelphia
6,7,Miller,0.0,San Antonio
7,8,Wilson,2.0,San Diego
8,9,Moore,1.0,Dallas
9,10,Taylor,5.0,San Jose


In [25]:
#Backward fill
df.bfill()

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2.0,New York
1,2,Johnson,1.0,Los Angeles
2,3,Williams,1.0,Chicago
3,4,Brown,4.0,Houston
4,5,Brown,2.0,Philadelphia
5,6,Davis,0.0,Philadelphia
6,7,Miller,2.0,San Antonio
7,8,Wilson,2.0,San Diego
8,9,Moore,1.0,Dallas
9,10,Taylor,5.0,San Jose


#### Interpolate missing values.

In [26]:
df_interpolated = df.copy()
df_interpolated['NumChildren'] = df_interpolated['NumChildren'].interpolate()
df_interpolated

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2.0,New York
1,2,Johnson,1.5,Los Angeles
2,3,Williams,1.0,Chicago
3,4,,4.0,Houston
4,5,Brown,2.0,
5,6,Davis,0.0,Philadelphia
6,7,Miller,1.0,San Antonio
7,8,Wilson,2.0,San Diego
8,9,Moore,1.0,Dallas
9,10,Taylor,5.0,San Jose


####  Convert a column to a different data type.

In [28]:
df['NumChildren'] = df['NumChildren'].fillna(0).astype(int)
df


Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2,New York
1,2,Johnson,0,Los Angeles
2,3,Williams,1,Chicago
3,4,,4,Houston
4,5,Brown,2,
5,6,Davis,0,Philadelphia
6,7,Miller,0,San Antonio
7,8,Wilson,2,San Diego
8,9,Moore,1,Dallas
9,10,Taylor,5,San Jose


#### Apply a function to transform the values of a column.

In [30]:
# Define a function to double the values
def double_value(x):
    return 2 * x
df['NumChildren'] = df['NumChildren'].apply(double_value)
df

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,8,New York
1,2,Johnson,0,Los Angeles
2,3,Williams,4,Chicago
3,4,,16,Houston
4,5,Brown,8,
5,6,Davis,0,Philadelphia
6,7,Miller,0,San Antonio
7,8,Wilson,8,San Diego
8,9,Moore,4,Dallas
9,10,Taylor,20,San Jose


#### Normalize a column using Min-Max scaling.

In [31]:
def min_max_scaling(x):
    return (x - np.min(x)) / (np.max(x) - np.min(x))
df['NumChildren_scaled'] = min_max_scaling(df['NumChildren'])
df

Unnamed: 0,FamilyID,ParentName,NumChildren,City,NumChildren_scaled
0,1,Smith,8,New York,0.4
1,2,Johnson,0,Los Angeles,0.0
2,3,Williams,4,Chicago,0.2
3,4,,16,Houston,0.8
4,5,Brown,8,,0.4
5,6,Davis,0,Philadelphia,0.0
6,7,Miller,0,San Antonio,0.0
7,8,Wilson,8,San Diego,0.4
8,9,Moore,4,Dallas,0.2
9,10,Taylor,20,San Jose,1.0


#### 
Standardize a column (z-score normalization).
).


In [32]:
def z_score_normalization(x):
    return (x - np.mean(x)) / np.std(x)
df['NumChildren_standardized'] = z_score_normalization(df['NumChildren'])
df


Unnamed: 0,FamilyID,ParentName,NumChildren,City,NumChildren_scaled,NumChildren_standardized
0,1,Smith,8,New York,0.4,0.185695
1,2,Johnson,0,Los Angeles,0.0,-1.052274
2,3,Williams,4,Chicago,0.2,-0.433289
3,4,,16,Houston,0.8,1.423664
4,5,Brown,8,,0.4,0.185695
5,6,Davis,0,Philadelphia,0.0,-1.052274
6,7,Miller,0,San Antonio,0.0,-1.052274
7,8,Wilson,8,San Diego,0.4,0.185695
8,9,Moore,4,Dallas,0.2,-0.433289
9,10,Taylor,20,San Jose,1.0,2.042649


#### Identify duplicate rows in the DataFrame.

In [34]:
data = {
    'FamilyID': [1, 2, 3, 4, 5, 2],
    'ParentName': ['Smith', 'Johnson', 'Williams', 'Brown', 'Davis', 'Johnson'],
    'NumChildren': [2, 3, 1, 4, 2, 3],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Los Angeles']
}

df2 = pd.DataFrame(data)

duplicate_rows = df2[df2.duplicated()]
duplicate_rows

Unnamed: 0,FamilyID,ParentName,NumChildren,City
5,2,Johnson,3,Los Angeles


#### Drop duplicate rows

In [38]:
duplicates = df2.drop_duplicates()
duplicates


Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2,New York
1,2,Johnson,3,Los Angeles
2,3,Williams,1,Chicago
3,4,Brown,4,Houston
4,5,Davis,2,Phoenix
5,2,Johnson,3,Los Angeles


In [41]:
df2

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2,New York
1,2,Johnson,3,Los Angeles
2,3,Williams,1,Chicago
3,4,Brown,4,Houston
4,5,Davis,2,Phoenix
5,2,Johnson,3,Los Angeles


#### Drop duplicate rows based on specific columns.

In [42]:
duplicates = df2.drop_duplicates(subset=['FamilyID', 'ParentName'])
duplicates


Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,Smith,2,New York
1,2,Johnson,3,Los Angeles
2,3,Williams,1,Chicago
3,4,Brown,4,Houston
4,5,Davis,2,Phoenix


#### Convert all string values in a column to lowercase.

In [43]:
df2['ParentName'] = df2['ParentName'].str.lower()
df2

Unnamed: 0,FamilyID,ParentName,NumChildren,City
0,1,smith,2,New York
1,2,johnson,3,Los Angeles
2,3,williams,1,Chicago
3,4,brown,4,Houston
4,5,davis,2,Phoenix
5,2,johnson,3,Los Angeles


### Remove leading and trailing spaces from string values in a column.

In [81]:
#dataset with trailing values
data = {
    'FamilyID': [1, 2, 3, 4, 5],
    'ParentName': [' Smith ', 'Johnson  ', ' Williams', ' Brown', 'Davis   '],
    'NumChildren': [2, 3, 1, 4, 2],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
      'Birthdate': ['2023-05-15', '2022-06-10', '2021-11-16', '2024-01-01', '2025-08-20']
}
df3=pd.DataFrame(data)
print("Data before removing trailing")
print(df3)


Data before removing trailing
   FamilyID ParentName  NumChildren         City   Birthdate
0         1     Smith             2     New York  2023-05-15
1         2  Johnson              3  Los Angeles  2022-06-10
2         3   Williams            1      Chicago  2021-11-16
3         4      Brown            4      Houston  2024-01-01
4         5   Davis               2      Phoenix  2025-08-20


In [82]:
print("Data After removing:")
df3['ParentName'] = df3['ParentName'].str.strip()
print(df3)

Data After removing:
   FamilyID ParentName  NumChildren         City   Birthdate
0         1      Smith            2     New York  2023-05-15
1         2    Johnson            3  Los Angeles  2022-06-10
2         3   Williams            1      Chicago  2021-11-16
3         4      Brown            4      Houston  2024-01-01
4         5      Davis            2      Phoenix  2025-08-20


#### Replace a specific substring in a column with another substring.

In [83]:
df3['ParentName'] = df3['ParentName'].fillna('')
df3['ParentName'] = df3['ParentName'].str.replace('son', 'child')
df3

Unnamed: 0,FamilyID,ParentName,NumChildren,City,Birthdate
0,1,Smith,2,New York,2023-05-15
1,2,Johnchild,3,Los Angeles,2022-06-10
2,3,Williams,1,Chicago,2021-11-16
3,4,Brown,4,Houston,2024-01-01
4,5,Davis,2,Phoenix,2025-08-20


#### Extract a substring from each value in a column.

In [62]:

df3['FirstThreeChars'] = df3['ParentName'].str[:3]
df3

Unnamed: 0,FamilyID,ParentName,NumChildren,City,FirstThreeChars
0,1,Smith,2,New York,Smi
1,2,Johnson,3,Los Angeles,Joh
2,3,Williams,1,Chicago,Wil
3,4,Brown,4,Houston,Bro
4,5,Davis,2,Phoenix,Dav


#### Convert a column to datetime format.

In [84]:
df3['Birthdate'] = pd.to_datetime(df3['Birthdate'], errors='coerce')

df3

Unnamed: 0,FamilyID,ParentName,NumChildren,City,Birthdate
0,1,Smith,2,New York,2023-05-15
1,2,Johnchild,3,Los Angeles,2022-06-10
2,3,Williams,1,Chicago,2021-11-16
3,4,Brown,4,Houston,2024-01-01
4,5,Davis,2,Phoenix,2025-08-20


In [85]:
df3['Year'] = df3['Birthdate'].dt.year
df3['Month'] = df3['Birthdate'].dt.month
df3['Day'] = df3['Birthdate'].dt.day
df3

Unnamed: 0,FamilyID,ParentName,NumChildren,City,Birthdate,Year,Month,Day
0,1,Smith,2,New York,2023-05-15,2023,5,15
1,2,Johnchild,3,Los Angeles,2022-06-10,2022,6,10
2,3,Williams,1,Chicago,2021-11-16,2021,11,16
3,4,Brown,4,Houston,2024-01-01,2024,1,1
4,5,Davis,2,Phoenix,2025-08-20,2025,8,20


#### Filter rows based on a date range.

In [None]:
start_date = pd.Timestamp('2023-01-01')
end_date = pd.Timestamp('2023-12-31')
filtered_df = df3[(df3['birthdate'] >= start_date) & (df3['birthdate'] <= end_date)]