# Topic 3: Data Cleaning (Handling Missing & Duplicate Data)

In [1]:
import numpy as np
import pandas as pd

## Task 1

In [2]:
# Two independent 3x3 integer grids holding 0..8 in row-major order;
# the cell output is their element-wise sum.
arr1 = np.arange(9).reshape((3, 3))
arr2 = arr1.copy()
np.add(arr1, arr2)

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])

## Task 2

In [4]:
# Load the organizations dataset, then report how many values are missing
# in each column.
df = pd.read_csv(filepath_or_buffer="./data/organizations-10000.csv")
df.isnull().sum()

Index                  1
Organization Id        0
Name                   0
Website                0
Country                0
Description            0
Founded                0
Industry               0
Number of employees    0
dtype: int64

## Task 3

In [6]:
# Small roster whose numeric columns contain gaps.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, np.nan, 30, np.nan],
    "Score": [85, 90, np.nan, np.nan],
}

# Build the frame and substitute 0 for every missing value in one chain.
df = pd.DataFrame(data).fillna(0)
df

Unnamed: 0,Name,Age,Score
0,Alice,25.0,85.0
1,Bob,0.0,90.0
2,Charlie,30.0,0.0
3,David,0.0,0.0


## Task 4

In [9]:
# Roster in which only the first record is fully populated.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, np.nan, 30, np.nan, 22],
    Score=[85, 90, np.nan, 88, np.nan],
)

df = pd.DataFrame(data)
print(df)
# Keep only the rows in which every field is present.
row_is_complete = df.notna().all(axis=1)
df = df[row_is_complete]
df

      Name   Age  Score
0    Alice  25.0   85.0
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
3    David   NaN   88.0
4      Eva  22.0    NaN


Unnamed: 0,Name,Age,Score
0,Alice,25.0,85.0


## Task 5

In [13]:
# Roster with gaps in both numeric columns.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "Age": [25, np.nan, 30, np.nan, 22],
    "Score": [85, 90, np.nan, 88, np.nan],
}

df = pd.DataFrame(data)
print(df)

# Per-column averages (NaN entries are skipped by mean()).
column_means = df[["Age", "Score"]].mean()
# fillna with a Series fills each column from the entry carrying its label and
# returns a new frame, so the raw `df` stays available for comparison.
df_mean = df.fillna(column_means)
df_mean

      Name   Age  Score
0    Alice  25.0   85.0
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
3    David   NaN   88.0
4      Eva  22.0    NaN


Unnamed: 0,Name,Age,Score
0,Alice,25.0,85.0
1,Bob,25.666667,90.0
2,Charlie,30.0,87.666667
3,David,25.666667,88.0
4,Eva,22.0,87.666667


## Task 6

In [14]:
# Roster in which rows 3 and 5 repeat rows 0 and 1 exactly.
data = {
    "Name": ["Alice", "Bob", "Charlie", "Alice", "Eva", "Bob"],
    "Age": [25, 30, 35, 25, 22, 30],
    "Score": [85, 90, 88, 85, 95, 90],
}

df = pd.DataFrame(data)
print(df)
# Flag every repeat of an earlier, fully identical row and keep the rest.
is_repeat = df.duplicated(keep="first")
df = df.loc[~is_repeat]
df

      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     88
3    Alice   25     85
4      Eva   22     95
5      Bob   30     90


Unnamed: 0,Name,Age,Score
0,Alice,25,85
1,Bob,30,90
2,Charlie,35,88
4,Eva,22,95


## Task 7

In [17]:
# Same people as before, but the two Alice rows now differ in Score,
# so they only count as duplicates when Score is ignored.
data = {
    "Name": ["Alice", "Bob", "Charlie", "Alice", "Eva", "Bob"],
    "Age": [25, 30, 35, 25, 22, 30],
    "Score": [88, 90, 88, 85, 95, 90],
}

df = pd.DataFrame(data)
print(df)
# Rows are compared on these columns only; the first occurrence is retained.
key_columns = ["Name", "Age"]
df = df.drop_duplicates(subset=key_columns, keep="first")
df

      Name  Age  Score
0    Alice   25     88
1      Bob   30     90
2  Charlie   35     88
3    Alice   25     85
4      Eva   22     95
5      Bob   30     90


Unnamed: 0,Name,Age,Score
0,Alice,25,88
1,Bob,30,90
2,Charlie,35,88
4,Eva,22,95


## Task 8

In [21]:
# Numeric fields arrive as text and are cast to numeric dtypes below.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': ['25', '30', '28'],         # Strings instead of integers
    'Score': ['85', '90', '88']        # Strings instead of integers
}

df = pd.DataFrame(data)
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the pre-conversion dtypes are printed to keep the before/after visible.
print(df.dtypes)
# A single astype call with a column -> dtype mapping converts both columns.
df = df.astype({"Age": np.int32, "Score": np.float32})
df.dtypes

Name      object
Age        int32
Score    float32
dtype: object

## Task 9

In [31]:
# Staff list with one unknown department (Eva).
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'Department': ['HR', 'IT', "IT", 'IT', np.nan, 'HR']
}

df = pd.DataFrame(data)
# mode() returns a Series because several values can tie; the first entry is
# used. It is computed once and reused instead of being evaluated repeatedly.
most_common_department = df["Department"].mode()[0]
# fillna only touches the missing entries and leaves every other value as is.
df["Department"] = df["Department"].fillna(most_common_department)
df

Unnamed: 0,Name,Department
0,Alice,HR
1,Bob,IT
2,Charlie,IT
3,David,IT
4,Eva,IT
5,Frank,HR


## Task 10

In [35]:
# Department is missing at the start (row 0) and at the end (rows 4-5).
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'Department': [np.nan, 'IT', "IT", 'IT', np.nan, np.nan]
}

df = pd.DataFrame(data)
# Only the last expression of a cell is displayed, so the intermediate frames
# are printed explicitly to show what each fill step contributes.
print(df)
# Forward fill copies the previous valid value downwards; row 0 has no
# predecessor and therefore stays missing after this step.
df = df.ffill()
print(df)
# Backward fill copies the next valid value upwards and closes the gap in row 0.
df = df.bfill()
df

Unnamed: 0,Name,Department
0,Alice,IT
1,Bob,IT
2,Charlie,IT
3,David,IT
4,Eva,IT
5,Frank,IT


## Task 11

In [39]:
# Scores with one value far outside the rest.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'Score': [85, 87, 88, 90, 95, 300]  # 300 is an outlier
}

df = pd.DataFrame(data)

# Quartiles, inter-quartile range and the 1.5 * IQR fences.
scores = df["Score"]
q1, q3 = scores.quantile([0.25, 0.75])
iqr = q3 - q1
whisker = iqr * 1.5
lower_bound = q1 - whisker
upper_bound = q3 + whisker
median = scores.median()

print(q1)
print(q3)
print(lower_bound, upper_bound)
print(median)

# Anything strictly outside the fences is replaced by the median.
is_outlier = scores.lt(lower_bound) | scores.gt(upper_bound)
df["Score"] = np.where(is_outlier, median, scores)
df['Score']

87.25
93.75
77.5 103.5
89.0


0    85.0
1    87.0
2    88.0
3    90.0
4    95.0
5    89.0
Name: Score, dtype: float64

## Task 12

In [43]:
# Text columns with stray whitespace and inconsistent letter case.
data = {
    'Name': [' Alice ', 'BOB', 'charLie ', ' DaVid', 'eva'],
    'Department': [' HR', 'it ', ' IT', 'hr ', '  It']
}

df = pd.DataFrame(data)
print(df)
# Trim surrounding whitespace; names become lower case, departments upper case.
df = df.assign(
    Name=df["Name"].str.strip().str.lower(),
    Department=df["Department"].str.strip().str.upper(),
)
df

       Name Department
0    Alice          HR
1       BOB        it 
2  charLie          IT
3     DaVid        hr 
4       eva         It


Unnamed: 0,Name,Department
0,alice,HR
1,bob,IT
2,charlie,IT
3,david,HR
4,eva,IT


## Task 13

In [64]:
# Frame whose columns carry increasing amounts of missing data.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, np.nan, 30, np.nan, 28],
    'Score': [85, 90, np.nan, np.nan, np.nan],
    'Remarks': [np.nan, np.nan, np.nan, np.nan, np.nan]  # All missing
}

df = pd.DataFrame(data)
print(df)


def clean_null(df, acceptance=0.5):
    """Return a copy of ``df`` without its mostly-empty columns.

    A column is removed when its number of missing values is strictly greater
    than ``acceptance * number_of_rows``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    acceptance : float, default 0.5
        Largest tolerated fraction of missing values per column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns within the limit.
    """
    max_missing = df.shape[0] * acceptance
    missing_per_column = df.isna().sum()
    col_to_drop = missing_per_column[missing_per_column > max_missing].index
    # drop() without inplace returns a new frame, so the caller's object is
    # left untouched and the cell stays safe to re-run.
    return df.drop(columns=col_to_drop)


# clean_null returns a new frame, so the result is rebound explicitly.
df = clean_null(df)
df

      Name   Age  Score  Remarks
0    Alice  25.0   85.0      NaN
1      Bob   NaN   90.0      NaN
2  Charlie  30.0    NaN      NaN
3    David   NaN    NaN      NaN
4      Eva  28.0    NaN      NaN


Unnamed: 0,Name,Age
0,Alice,25.0
1,Bob,
2,Charlie,30.0
3,David,
4,Eva,28.0


## Task 14

In [67]:
# Text columns padded with leading and/or trailing spaces.
data = {
    'Name': ['           Alice           ', '   Bob', ' Charlie', 'David ', ' Eva '],
    'Department': [' HR ', 'IT', ' IT', ' HR', 'IT ']
}

df = pd.DataFrame(data)
print(df)
# Trim surrounding whitespace from each text column in turn.
for column in ("Name", "Department"):
    df[column] = df[column].str.strip()
df

                          Name Department
0             Alice                   HR 
1                          Bob         IT
2                      Charlie         IT
3                       David          HR
4                         Eva         IT 


Unnamed: 0,Name,Department
0,Alice,HR
1,Bob,IT
2,Charlie,IT
3,David,HR
4,Eva,IT
