In [1]:
#handling missing values

import pandas as pd
import numpy as np


data = {
    'Numerical_Column1': [1.2, 2.3, np.nan, 4.5, 5.6],
    'Numerical_Column2': [np.nan, 3.4, 2.2, np.nan, 5.1],
    'Categorical_Column1': ['A', np.nan, 'B', 'A', 'B'],
    'Categorical_Column2': [np.nan, 'X', 'X', 'Y', 'Y']
}

df = pd.DataFrame(data)

# Filling missing values in numerical columns with the mean
numerical_columns = df.select_dtypes(include=['number']).columns
for col in numerical_columns:
    df[col].fillna(df[col].mean(), inplace=True)

# Filling missing values in categorical columns with the most frequent value
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

print(df)


   Numerical_Column1  Numerical_Column2 Categorical_Column1  \
0                1.2           3.566667                   A   
1                2.3           3.400000                   A   
2                3.4           2.200000                   B   
3                4.5           3.566667                   A   
4                5.6           5.100000                   B   

  Categorical_Column2  
0                   X  
1                   X  
2                   X  
3                   Y  
4                   Y  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [2]:
#removing duplicates

import pandas as pd

# Example dataset with duplicate rows
data = {
    'Column1': [1, 2, 3, 4, 2],
    'Column2': ['A', 'B', 'C', 'D', 'B'],
    'Column3': [10, 20, 30, 40, 20]
}

df = pd.DataFrame(data)

print("Original Dataset:")
print(df)


duplicates = df[df.duplicated()] #identify duplicate rows
print("\nDuplicate Rows:")
print(duplicates)


df_no_duplicates = df.drop_duplicates() #removing duplicate rows

print("\nDataset After Removing Duplicates:")
print(df_no_duplicates)


Original Dataset:
   Column1 Column2  Column3
0        1       A       10
1        2       B       20
2        3       C       30
3        4       D       40
4        2       B       20

Duplicate Rows:
   Column1 Column2  Column3
4        2       B       20

Dataset After Removing Duplicates:
   Column1 Column2  Column3
0        1       A       10
1        2       B       20
2        3       C       30
3        4       D       40


In [3]:
#handling outliersimport pandas as pd

# Example dataset with incorrect data types
data = {
    'Age': ['25', '30', '22', '28', '35'],  # Stored as strings
    'Salary': ['50000', '55000.50', '48000', '52000', '60000'],  # Stored as strings (with some floats)
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],  # Correct type (string)
    'Join_Date': ['2023-01-15', '2022-12-10', '2020-05-20', '2019-03-11', '2021-07-25'],  # Stored as strings
}

df = pd.DataFrame(data)

print("Original Dataset:")
print(df.info())

# Function to convert columns to appropriate data types
def convert_column_dtypes(df):
    for column in df.columns:
        # Try converting to numeric (int or float)
        try:
            df[column] = pd.to_numeric(df[column], errors='coerce')  # Convert to numeric, invalid entries become NaN
        except ValueError:
            pass

        # Try converting to datetime for columns with date-like strings
        if df[column].dtype == 'object':  # Check if column is still of type object
            try:
                df[column] = pd.to_datetime(df[column], errors='coerce')  # Convert to datetime
            except ValueError:
                pass

    return df

# Apply the function to the dataset
df = convert_column_dtypes(df)

print("\nDataset After Converting Data Types:")
print(df.info())
print(df)




Original Dataset:
      Name  Age   Salary
0    Alice   25    50000
1      Bob   30    55000
2  Charlie   22    48000
3    David   28    52000
4      Eva  120  2000000

Dataset After Removing Outliers:
      Name  Age   Salary
0    Alice   25    50000
1      Bob   30    55000
2  Charlie   22    48000
3    David   28    52000
4      Eva  120  2000000


In [4]:
import pandas as pd

# Example dataset with incorrect data types
data = {
    'Age': ['25', '30', '22', '28', '35'],  # Stored as strings
    'Salary': ['50000', '55000.50', '48000', '52000', '60000'],  # Stored as strings (with some floats)
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],  # Correct type (string)
    'Join_Date': ['2023-01-15', '2022-12-10', '2020-05-20', '2019-03-11', '2021-07-25'],  # Stored as strings
}

df = pd.DataFrame(data)

print("Original Dataset:")
print(df.info())

# Function to convert columns to appropriate data types
def convert_column_dtypes(df):
    for column in df.columns:
        # Try converting to numeric (int or float)
        try:
            df[column] = pd.to_numeric(df[column], errors='coerce')  # Convert to numeric, invalid entries become NaN
        except ValueError:
            pass

        # Try converting to datetime for columns with date-like strings
        if df[column].dtype == 'object':  # Check if column is still of type object
            try:
                df[column] = pd.to_datetime(df[column], errors='coerce')  # Convert to datetime
            except ValueError:
                pass

    return df

# Apply the function to the dataset
df = convert_column_dtypes(df)

print("\nDataset After Converting Data Types:")
print(df.info())
print(df)


Original Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Age        5 non-null      object
 1   Salary     5 non-null      object
 2   Name       5 non-null      object
 3   Join_Date  5 non-null      object
dtypes: object(4)
memory usage: 288.0+ bytes
None

Dataset After Converting Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        5 non-null      int64  
 1   Salary     5 non-null      float64
 2   Name       0 non-null      float64
 3   Join_Date  0 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 288.0 bytes
None
   Age   Salary  Name  Join_Date
0   25  50000.0   NaN        NaN
1   30  55000.5   NaN        NaN
2   22  48000.0   NaN        NaN
3   28  52000.0   NaN       

In [5]:
import pandas as pd

# Example dataset with a categorical column
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Department': ['HR', 'Finance', 'IT', 'HR', 'Finance'],  # Categorical column
    'Salary': [50000, 55000, 48000, 52000, 60000]
}

df = pd.DataFrame(data)

print("Original Dataset:")
print(df)

# Perform one-hot encoding on the 'Department' column
df_encoded = pd.get_dummies(df, columns=['Department'], prefix='Dept')

print("\nDataset After One-Hot Encoding:")
print(df_encoded)


Original Dataset:
      Name Department  Salary
0    Alice         HR   50000
1      Bob    Finance   55000
2  Charlie         IT   48000
3    David         HR   52000
4      Eva    Finance   60000

Dataset After One-Hot Encoding:
      Name  Salary  Dept_Finance  Dept_HR  Dept_IT
0    Alice   50000         False     True    False
1      Bob   55000          True    False    False
2  Charlie   48000         False    False     True
3    David   52000         False     True    False
4      Eva   60000          True    False    False
