In [None]:
import pandas as pd
# Toy employee records; None marks a missing Age or Salary entry.
data = {
    "ID": [101, 102, 103, 104, 105, 106, 107],
    "Name": ["Mike", "eleven", "Will", "Lucas", "Dustin", "Max", "Steve"],
    "Age": [23, 22, 23, None, 24, None, 28],
    "Salary": [5000, None, 4500, None, 4500, 4600, 7000],
}
df = pd.DataFrame(data)

# Listwise deletion: keep only the rows that have no missing value in any column.
# `df` itself is left untouched; the result is a new frame.
df_cleaned = df.dropna(axis=0, how="any")
print(df_cleaned)

    ID    Name   Age  Salary
0  101    Mike  23.0  5000.0
2  103    Will  23.0  4500.0
4  105  Dustin  24.0  4500.0
6  107   Steve  28.0  7000.0


In [None]:
# Impute missing values in every numeric column with the mean of that column.
# select_dtypes(include="number") picks up all numeric dtypes, not only
# float64/int64, so narrower numeric columns are imputed as well.
numeric_columns = df.select_dtypes(include="number").columns
for column in numeric_columns:
    # Column.mean() skips NaN by default, so the mean is taken over the
    # observed values only.
    mean_value = df[column].mean()
    # Assign the filled column back to the frame. The previous form,
    # df[column].fillna(..., inplace=True), operates on an intermediate object
    # and is reported by pandas as never working from pandas 3.0 onwards.
    df[column] = df[column].fillna(mean_value)
print(df)

    ID    Name   Age  Salary
0  101    Mike  23.0  5000.0
1  102  eleven  22.0  5120.0
2  103    Will  23.0  4500.0
3  104   Lucas  24.0  5120.0
4  105  Dustin  24.0  4500.0
5  106     Max  24.0  4600.0
6  107   Steve  28.0  7000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mean_value, inplace=True)


In [None]:
import pandas as pd
# Flag which values were missing in the raw data.
# `df` has already had its numeric gaps filled with column means, so testing
# `df` for nulls would label every value 'Not Missing'. The mask is therefore
# computed from `data`, the dict that `df` was originally built from.
# NOTE(review): assumes `data` is still the dict used to build `df` and that the
# two share the same default index - true when cells are run top to bottom.
raw_missing = pd.DataFrame(data).isna()
flag_labels = {True: 'Missing', False: 'Not Missing'}

# Column-based flags: one '<column>_Flag' per column that had any missing value.
for column in raw_missing.columns:
    if raw_missing[column].any():
        df[f'{column}_Flag'] = raw_missing[column].map(flag_labels)

# Row-based flag: 'Missing' when any value in the row was missing.
df['Missing_Flag'] = raw_missing.any(axis=1).map(flag_labels)
print(df)

    ID    Name   Age  Salary Missing_Flag
0  101    Mike  23.0  5000.0  Not Missing
1  102  eleven  22.0  5120.0  Not Missing
2  103    Will  23.0  4500.0  Not Missing
3  104   Lucas  24.0  5120.0  Not Missing
4  105  Dustin  24.0  4500.0  Not Missing
5  106     Max  24.0  4600.0  Not Missing
6  107   Steve  28.0  7000.0  Not Missing


In [None]:
import pandas as pd

# Sample records containing one exact duplicate row (index 4 repeats index 1)
# and one repeated ID with a different name (index 6 vs index 0).
data = {
    'ID': [101, 102, 103, 104, 102, 100, 101],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Bob', 'Alice', 'Alicia'],
    'Age': [25, 29, 29, 33, 29, 27, 25],
    'Salary': [55000, 48000, 55000, 62000, 48000, 59000, 55000],
    'City': ['NY', 'LA', 'NY', 'SF', 'LA', 'LA', 'NY'],
}
df = pd.DataFrame(data)

# Exact match: a row is dropped only if every column equals an earlier row.
df_cleaned = df.drop_duplicates(keep='first')

# Partial match: rows are compared on 'ID' alone; the first row per ID is kept.
df_cleaned_id = df.drop_duplicates(subset=['ID'], keep='first')

# Print each frame under its heading, followed by a blank separator line.
for heading, frame in [
    ("Original Dataset", df),
    ("Duplicates Removed - Exact Match", df_cleaned),
    ("Duplicates Removed - Partial Match based on ID", df_cleaned_id),
]:
    print(heading)
    print(frame, '\n')


Original Dataset
    ID     Name  Age  Salary City
0  101    Alice   25   55000   NY
1  102      Bob   29   48000   LA
2  103  Charlie   29   55000   NY
3  104    David   33   62000   SF
4  102      Bob   29   48000   LA
5  100    Alice   27   59000   LA
6  101   Alicia   25   55000   NY 

Duplicates Removed - Exact Match
    ID     Name  Age  Salary City
0  101    Alice   25   55000   NY
1  102      Bob   29   48000   LA
2  103  Charlie   29   55000   NY
3  104    David   33   62000   SF
5  100    Alice   27   59000   LA
6  101   Alicia   25   55000   NY 

Duplicates Removed - Partial Match based on ID
    ID     Name  Age  Salary City
0  101    Alice   25   55000   NY
1  102      Bob   29   48000   LA
2  103  Charlie   29   55000   NY
3  104    David   33   62000   SF
5  100    Alice   27   59000   LA 



In [None]:
# Partial-match deduplication: rows are compared only on the listed columns,
# and the first row seen for each key is the one that is kept.
df_cleaned_id = df.drop_duplicates(subset=['ID'], keep='first')                  # key: ID
df_cleaned_name = df.drop_duplicates(subset=['Name'], keep='first')              # key: Name
df_cleaned_specific = df.drop_duplicates(subset=['ID', 'Name'], keep='first')    # key: (ID, Name)

# Print each result under its heading, followed by a blank separator line.
for heading, frame in [
    ("Duplicates Removed-Partial match based on ID", df_cleaned_id),
    ("Duplicates Removed-Partial match based on Name", df_cleaned_name),
    ("Duplicates Removed-Partial match based on ID and Name", df_cleaned_specific),
]:
    print(heading)
    print(frame, "\n")

Duplicates Removed-Partial match based on ID
    ID     Name  Age  Salary City
0  101    Alice   25   55000   NY
1  102      Bob   29   48000   LA
2  103  Charlie   29   55000   NY
3  104    David   33   62000   SF
5  100    Alice   27   59000   LA 

Duplicates Removed-Partial match based on Name
    ID     Name  Age  Salary City
0  101    Alice   25   55000   NY
1  102      Bob   29   48000   LA
2  103  Charlie   29   55000   NY
3  104    David   33   62000   SF
6  101   Alicia   25   55000   NY 

Duplicates Removed-Partial match based on ID and Name
    ID     Name  Age  Salary City
0  101    Alice   25   55000   NY
1  102      Bob   29   48000   LA
2  103  Charlie   29   55000   NY
3  104    David   33   62000   SF
5  100    Alice   27   59000   LA
6  101   Alicia   25   55000   NY 

