These examples cover some of the most common data cleaning tasks in pandas. Remember to adapt the code to your specific dataset and requirements.

# Handling Missing Values

In [None]:
import pandas as pd

# Create a sample dataframe with one missing value in each column
df = pd.DataFrame({'A': [1, 2, None, 4], 'B': [5, None, 7, 8]})

# Fill missing values with a specific value.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['A']: that chained in-place form emits a
# FutureWarning on recent pandas 2.x releases and, with Copy-on-Write enabled
# (the default in pandas 3.0), modifies only a temporary object, leaving df
# unchanged.
df['A'] = df['A'].fillna(0)

# Fill missing values with the mean of the column
# (Series.mean() skips NaN by default, so the mean is taken over 5, 7 and 8)
df['B'] = df['B'].fillna(df['B'].mean())

# Drop rows with any missing values
# (nothing is left to drop here after the fills above; shown as an alternative)
df = df.dropna()


# Removing Duplicates

In [None]:
# Build a small frame in which the row (2, 5) appears twice
df = pd.DataFrame({'A': [1, 2, 2, 3], 'B': [4, 5, 5, 6]})

# Drop rows that repeat an earlier row across every column
df = df.drop_duplicates()

# Drop rows whose 'A' value has already been seen, keeping the first occurrence
df = df.drop_duplicates(subset=['A'], keep='first')


# Handling Outliers

In [None]:
import numpy as np

# Build a single-column frame where 100 is the obvious outlier
df = pd.DataFrame({'A': [1, 2, 3, 100, 4, 5]})

# Z-score method: keep rows whose absolute z-score is below 3.
# NOTE: with only six rows the largest |z| here is about 2.04 (the value 100),
# so this filter removes nothing for this sample; the threshold of 3 only
# becomes reachable with more data.
centered = df['A'] - df['A'].mean()
z_scores = np.abs(centered / df['A'].std())
df = df[z_scores < 3]

# IQR method (applied to whatever the step above left in df):
# discard values lying more than 1.5 * IQR outside the quartiles
Q1 = df['A'].quantile(0.25)
Q3 = df['A'].quantile(0.75)
IQR = Q3 - Q1
too_low = df['A'] < (Q1 - 1.5 * IQR)
too_high = df['A'] > (Q3 + 1.5 * IQR)
df = df[~(too_low | too_high)]


# Type Conversion

In [None]:
# Build a frame whose numbers and dates are both stored as strings
df = pd.DataFrame({'A': ['1', '2', '3'], 'B': ['2023-01-01', '2023-01-02', '2023-01-03']})

# Parse the numeric strings in 'A' as integers
df = df.astype({'A': int})

# Parse the date strings in 'B' into datetime values
parsed_dates = pd.to_datetime(df['B'])
df['B'] = parsed_dates


# String Cleaning

In [None]:
# Build a frame of names with stray whitespace and inconsistent casing
df = pd.DataFrame({'A': [' John ', 'JANE', 'bob']})

# First trim surrounding whitespace, then normalise to title case
trimmed = df['A'].str.strip()
df['A'] = trimmed.str.title()


# Renaming Columns

In [None]:
# Build a frame whose column names ('A', 'B') say nothing about their content
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Map each old column name to a descriptive one and apply the mapping
new_names = {'A': 'ID', 'B': 'Value'}
df = df.rename(columns=new_names)
