In [26]:
import pandas as pd

df=pd.read_csv("uncleaned_sample_data.csv")
df=pd.DataFrame(df)
print(df)

              Name   Age   Salary        Department
0      Alice Smith  28.0  60000.0       Engineering
1      Bob Johnson  75.0  65000.0         Marketing
2    Charlie Brown  45.0  90000.0   Human Resources
3     Diana Prince   NaN  65000.0           Finance
4     Edward Elric  27.0      NaN          Research
5              NaN  40.0  85000.0                IT
6  George O'Malley  38.0  70000.0                IT
7     hannah baker  30.0  62000.0  Customer Support
8      ian malcolm  50.0  95000.0             Legal
9    Jessica Jones  33.0  67000.0    administration


In [12]:
# checking the missing values in the dataset
print(df.isnull().sum())

Name          1
Age           1
Salary        1
Department    0
dtype: int64


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        9 non-null      object 
 1   Age         9 non-null      float64
 2   Salary      9 non-null      float64
 3   Department  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [14]:
# removing the rows which have missing values
df_cleaned=df.dropna()
print(df_cleaned)

              Name   Age   Salary        Department
0      Alice Smith  28.0  60000.0       Engineering
1      Bob Johnson  75.0  65000.0         Marketing
2    Charlie Brown  45.0  90000.0   Human Resources
6  George O'Malley  38.0  70000.0                IT
7     hannah baker  30.0  62000.0  Customer Support
8      ian malcolm  50.0  95000.0             Legal
9    Jessica Jones  33.0  67000.0    administration


In [15]:
# filling the missing values
df_filled=df.fillna({
    "Age":df["Age"].mean(),
    "Salary":df["Salary"].mean()
})
print(df_filled)

              Name        Age        Salary        Department
0      Alice Smith  28.000000  60000.000000       Engineering
1      Bob Johnson  75.000000  65000.000000         Marketing
2    Charlie Brown  45.000000  90000.000000   Human Resources
3     Diana Prince  40.666667  65000.000000           Finance
4     Edward Elric  27.000000  73222.222222          Research
5              NaN  40.000000  85000.000000                IT
6  George O'Malley  38.000000  70000.000000                IT
7     hannah baker  30.000000  62000.000000  Customer Support
8      ian malcolm  50.000000  95000.000000             Legal
9    Jessica Jones  33.000000  67000.000000    administration


In [19]:
# filling the missing values with forward fill
df_ffill=df.fillna(method="ffill")
print(df_ffill)

              Name   Age   Salary        Department
0      Alice Smith  28.0  60000.0       Engineering
1      Bob Johnson  75.0  65000.0         Marketing
2    Charlie Brown  45.0  90000.0   Human Resources
3     Diana Prince  45.0  65000.0           Finance
4     Edward Elric  27.0  65000.0          Research
5     Edward Elric  40.0  85000.0                IT
6  George O'Malley  38.0  70000.0                IT
7     hannah baker  30.0  62000.0  Customer Support
8      ian malcolm  50.0  95000.0             Legal
9    Jessica Jones  33.0  67000.0    administration


In [32]:
# filling the missing values with backward fill
df_bfill=df
df_bfill["Age"]=df_bfill["Age"].fillna(method="bfill")
print(df_bfill)

               Name   Age   Salary        Department
0       Alice Smith  28.0  60000.0       Engineering
1       Bob Johnson  75.0  65000.0         Marketing
2     Charlie Brown  45.0  90000.0   Human Resources
3      Diana Prince  27.0  65000.0           Finance
4      Edward Elric  27.0      NaN          Research
5               NaN  40.0  85000.0                IT
6   George O'Malley  38.0  70000.0                IT
7      hannah baker  30.0  62000.0  Customer Support
8       ian malcolm  50.0  95000.0             Legal
9     Jessica Jones  33.0  67000.0    administration
10      Alice Smith  28.0  60000.0       Engineering
11      Bob Johnson  75.0  65000.0         Marketing


In [27]:
# adding duplicate values to the dataset
df=pd.concat([df,df.iloc[[0]],df.iloc[[1]]],ignore_index=True)
print(df)
print("\n")
df_no_duplicate=df.drop_duplicates()
print(df_no_duplicate)

               Name   Age   Salary        Department
0       Alice Smith  28.0  60000.0       Engineering
1       Bob Johnson  75.0  65000.0         Marketing
2     Charlie Brown  45.0  90000.0   Human Resources
3      Diana Prince   NaN  65000.0           Finance
4      Edward Elric  27.0      NaN          Research
5               NaN  40.0  85000.0                IT
6   George O'Malley  38.0  70000.0                IT
7      hannah baker  30.0  62000.0  Customer Support
8       ian malcolm  50.0  95000.0             Legal
9     Jessica Jones  33.0  67000.0    administration
10      Alice Smith  28.0  60000.0       Engineering
11      Bob Johnson  75.0  65000.0         Marketing


              Name   Age   Salary        Department
0      Alice Smith  28.0  60000.0       Engineering
1      Bob Johnson  75.0  65000.0         Marketing
2    Charlie Brown  45.0  90000.0   Human Resources
3     Diana Prince   NaN  65000.0           Finance
4     Edward Elric  27.0      NaN          Resear

In [31]:
correct_df=df.replace({"Department":{"Human Resources":"HR","Research":"R&D"}})
print(correct_df)

               Name   Age   Salary        Department
0       Alice Smith  28.0  60000.0       Engineering
1       Bob Johnson  75.0  65000.0         Marketing
2     Charlie Brown  45.0  90000.0                HR
3      Diana Prince   NaN  65000.0           Finance
4      Edward Elric  27.0      NaN               R&D
5               NaN  40.0  85000.0                IT
6   George O'Malley  38.0  70000.0                IT
7      hannah baker  30.0  62000.0  Customer Support
8       ian malcolm  50.0  95000.0             Legal
9     Jessica Jones  33.0  67000.0    administration
10      Alice Smith  28.0  60000.0       Engineering
11      Bob Johnson  75.0  65000.0         Marketing


In [34]:
# Ensuring the consistency in the dataset
df["Department"]=df["Department"].str.lower()#title for capatalize initials
print(df)

               Name   Age   Salary        Department
0       Alice Smith  28.0  60000.0       engineering
1       Bob Johnson  75.0  65000.0         marketing
2     Charlie Brown  45.0  90000.0   human resources
3      Diana Prince  27.0  65000.0           finance
4      Edward Elric  27.0      NaN          research
5               NaN  40.0  85000.0                it
6   George O'Malley  38.0  70000.0                it
7      hannah baker  30.0  62000.0  customer support
8       ian malcolm  50.0  95000.0             legal
9     Jessica Jones  33.0  67000.0    administration
10      Alice Smith  28.0  60000.0       engineering
11      Bob Johnson  75.0  65000.0         marketing


In [35]:
df["Name"]=df["Name"].fillna("Unknown")
print(df)

               Name   Age   Salary        Department
0       Alice Smith  28.0  60000.0       engineering
1       Bob Johnson  75.0  65000.0         marketing
2     Charlie Brown  45.0  90000.0   human resources
3      Diana Prince  27.0  65000.0           finance
4      Edward Elric  27.0      NaN          research
5           Unknown  40.0  85000.0                it
6   George O'Malley  38.0  70000.0                it
7      hannah baker  30.0  62000.0  customer support
8       ian malcolm  50.0  95000.0             legal
9     Jessica Jones  33.0  67000.0    administration
10      Alice Smith  28.0  60000.0       engineering
11      Bob Johnson  75.0  65000.0         marketing


In [36]:
# applying in min-max normalization
df_normalized=df.copy()
for col in ["Age","Salary"]:
  df_normalized[col]=(df_normalized[col]-df_normalized[col].min())/(df_normalized[col].max()-df_normalized[col].min())
print(df_normalized)

               Name       Age    Salary        Department
0       Alice Smith  0.020833  0.000000       engineering
1       Bob Johnson  1.000000  0.142857         marketing
2     Charlie Brown  0.375000  0.857143   human resources
3      Diana Prince  0.000000  0.142857           finance
4      Edward Elric  0.000000       NaN          research
5           Unknown  0.270833  0.714286                it
6   George O'Malley  0.229167  0.285714                it
7      hannah baker  0.062500  0.057143  customer support
8       ian malcolm  0.479167  1.000000             legal
9     Jessica Jones  0.125000  0.200000    administration
10      Alice Smith  0.020833  0.000000       engineering
11      Bob Johnson  1.000000  0.142857         marketing
