In [24]:
import pandas as pd

# --- Sample data ---
df = pd.DataFrame({
    'Name': [' Alice ', 'Bob', 'Charlie', None],
    'Age': [25, None, 30, 22],
    'Date': ['2024-01-05', '01/06/2024', None, '2024-02-07'],
    'Score': [90, 90, 85, 85]
})
print("Original DataFrame:")
print(df, "\n")
print("Shape:", df.shape)

Original DataFrame:
      Name   Age        Date  Score
0   Alice   25.0  2024-01-05     90
1      Bob   NaN  01/06/2024     90
2  Charlie  30.0        None     85
3     None  22.0  2024-02-07     85 

Shape: (4, 4)


In [None]:

# --- Renaming a column ---

df1 = df
df1.rename(columns={"Score":"Marks"}, inplace = True)
df1

Unnamed: 0,Nama,Age,Date,Marks
0,Alice,25.0,2024-01-05,90
1,Bob,,01/06/2024,90
2,Charlie,30.0,,85
3,,22.0,2024-02-07,85


In [11]:
# --- 1. Handle missing data ---
print("Missing values per column:")
print(df.isna().sum(), "\n")

Missing values per column:
Name     1
Age      1
Date     1
Score    0
dtype: int64 



In [12]:
# Drop rows with any missing values
df = df.dropna(subset=['Name'])
# Fill missing Age with mean
df['Age'] = df['Age'].fillna(df['Age'].mean())
print("Shape:", df.shape)

Shape: (3, 4)


In [13]:
# --- 2. Remove duplicates ---
df = df.drop_duplicates()
print("Shape:", df.shape)

Shape: (3, 4)


In [14]:
# --- 3. Filter rows / columns ---
# Keep only rows with Age >= 25
df = df[df['Age'] >= 25]
# Keep only selected columns
df = df[['Name', 'Age', 'Date', 'Score']]
df

Unnamed: 0,Name,Age,Date,Score
0,Alice,25.0,2024-01-05,90
1,Bob,27.5,01/06/2024,90
2,Charlie,30.0,,85


In [15]:
# --- 4. Rename columns ---
df = df.rename(columns={'Name': 'FullName', 'Date': 'JoinDate'})
df

Unnamed: 0,FullName,Age,JoinDate,Score
0,Alice,25.0,2024-01-05,90
1,Bob,27.5,01/06/2024,90
2,Charlie,30.0,,85


In [16]:
# --- 5. Change data types ---
df['Age'] = df['Age'].astype(int)
df

Unnamed: 0,FullName,Age,JoinDate,Score
0,Alice,25,2024-01-05,90
1,Bob,27,01/06/2024,90
2,Charlie,30,,85


In [17]:
# --- 6. String cleaning ---
df['FullName'] = df['FullName'].str.strip()       # remove extra spaces
df['FullName'] = df['FullName'].str.lower()       # lowercase names
df['FullName'] = df['FullName'].str.replace('bob', 'bobby')  # replace text
df

Unnamed: 0,FullName,Age,JoinDate,Score
0,alice,25,2024-01-05,90
1,bobby,27,01/06/2024,90
2,charlie,30,,85


In [18]:
# --- 7. Date/time handling ---
df['JoinDate'] = pd.to_datetime(df['JoinDate'], errors='coerce')
df['Year'] = df['JoinDate'].dt.year
df

Unnamed: 0,FullName,Age,JoinDate,Score,Year
0,alice,25,2024-01-05,90,2024.0
1,bobby,27,NaT,90,
2,charlie,30,NaT,85,


In [19]:
# --- 8. Merge / join example ---
df2 = pd.DataFrame({
    'FullName': ['alice', 'bobby'],
    'Department': ['HR', 'IT']
})
merged = pd.merge(df, df2, on='FullName', how='left')

print("Cleaned and merged DataFrame:")
print(merged)

Cleaned and merged DataFrame:
  FullName  Age   JoinDate  Score    Year Department
0    alice   25 2024-01-05     90  2024.0         HR
1    bobby   27        NaT     90     NaN         IT
2  charlie   30        NaT     85     NaN        NaN
