<a href="https://colab.research.google.com/github/uvindu827/Data_preprocessing/blob/main/fdm_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Handling missinng values

In [4]:
import pandas as pd
import numpy as np

data = {
    'Age': [25, 30, np.nan, 35, 40, 45, np.nan, 50],
    'Salary': [50000, 54000, 52000, np.nan, 62000, np.nan, 58000, 60000],
    'Department': ['IT', 'Finance', np.nan, 'Finance', 'IT', 'HR', 'HR', 'IT'],
    'Experience': [2, 4, np.nan, 6, 8, 10, 12, np.nan]
}

df = pd.DataFrame(data)
print("\nOriginal dataframe")
print(df)


Original dataframe


**Stratergy 1**

*remove rows with missing values*

In [3]:
df_dropna = df.dropna()
print("\nAfter removing rows with null values")
print(df_dropna)


After removing rows with null values
    Age   Salary Department  Experience
0  25.0  50000.0         IT         2.0
1  30.0  54000.0    Finance         4.0
4  40.0  62000.0         IT         8.0


**stratergy 2**

*Fill with global constant*

In [5]:
df_fill_constant = df.fillna("Unknown")
print("\nAfter fillinng missing value with global constants")
print(df_fill_constant)


After fillinng missing value with global constants
       Age   Salary Department Experience
0     25.0  50000.0         IT        2.0
1     30.0  54000.0    Finance        4.0
2  Unknown  52000.0    Unknown    Unknown
3     35.0  Unknown    Finance        6.0
4     40.0  62000.0         IT        8.0
5     45.0  Unknown         HR       10.0
6  Unknown  58000.0         HR       12.0
7     50.0  60000.0         IT    Unknown


**stratergy 3**

*Numerical values - Fill with mean*

In [9]:
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

df_numeric = df.select_dtypes(include=[np.number])
df_numeric_mean = pd.DataFrame(mean_imputer.fit_transform(df_numeric))

df_cat = df.select_dtypes(exclude=[np.number])
df_cat_mode = pd.DataFrame(mode_imputer.fit_transform(df_cat))

df_fill_avg = pd.concat([df_numeric_mean, df_cat_mode], axis=1)
print("\nAfter filling with mean (numeric) and mode (categorical):")
print(df_fill_avg)




After filling with mean (numeric) and mode (categorical):
      0        1     2        0
0  25.0  50000.0   2.0       IT
1  30.0  54000.0   4.0  Finance
2  37.5  52000.0   7.0       IT
3  35.0  56000.0   6.0  Finance
4  40.0  62000.0   8.0       IT
5  45.0  56000.0  10.0       HR
6  37.5  58000.0  12.0       HR
7  50.0  60000.0   7.0       IT


In [11]:
df_numeric = df.select_dtypes(include=[np.number])
df_numeric_median = pd.DataFrame(median_imputer.fit_transform(df_numeric))

df_cat = df.select_dtypes(exclude=[np.number])
df_cat_mode = pd.DataFrame(mode_imputer.fit_transform(df_cat))

df_fill_avg_1 = pd.concat([df_numeric_median, df_cat_mode], axis=1)
print("\nAfter filling with mean (numeric) and mode (categorical):")
print(df_fill_avg_1)


After filling with mean (numeric) and mode (categorical):
      0        1     2        0
0  25.0  50000.0   2.0       IT
1  30.0  54000.0   4.0  Finance
2  37.5  52000.0   7.0       IT
3  35.0  56000.0   6.0  Finance
4  40.0  62000.0   8.0       IT
5  45.0  56000.0  10.0       HR
6  37.5  58000.0  12.0       HR
7  50.0  60000.0   7.0       IT


#Data Bining

In [12]:
np.random.seed(42)
data = np.random.randint(18,70, size=100)


array([56, 69, 46, 32, 60, 25, 38, 56, 36, 40, 28, 28, 41, 53, 57, 41, 20,
       39, 19, 41, 61, 47, 55, 19, 38, 50, 29, 39, 61, 42, 66, 44, 59, 45,
       33, 32, 64, 68, 61, 69, 20, 54, 68, 24, 38, 26, 56, 35, 21, 42, 31,
       67, 26, 43, 19, 37, 45, 64, 24, 61, 25, 64, 52, 31, 34, 53, 67, 57,
       21, 19, 23, 59, 21, 46, 35, 43, 61, 51, 27, 53, 31, 48, 65, 32, 25,
       31, 40, 57, 38, 33, 62, 35, 64, 41, 43, 42, 62, 58, 46, 32])

**Equal width biinning**

In [14]:
num_bins = 5
min = data.min()
max = data.max()
bin_width = (max-min)/num_bins

bins = [min + i * bin_width for i in range(num_bins + 1)]
labels = [f'Bin {i+1}' for i in range(num_bins)]

df_binned = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
print("\nEqual-width binning results:")
print(pd.value_counts(df_binned))


Equal-width binning results:
Bin 2    22
Bin 1    21
Bin 3    20
Bin 5    20
Bin 4    17
Name: count, dtype: int64


  print(pd.value_counts(df_binned))


**Equal depth binning**

In [15]:
df_binned_eq_freq = pd.qcut(data, q=num_bins, labels=labels)
print("\nEqual-frequency binning results:")
print(pd.value_counts(df_binned_eq_freq))


Equal-frequency binning results:
Bin 2    21
Bin 1    20
Bin 3    20
Bin 5    20
Bin 4    19
Name: count, dtype: int64


  print(pd.value_counts(df_binned_eq_freq))


**Bin smoothing by mean**

In [17]:
df_smoothed = df_binned.copy()
bin_means = {}

for label in labels:
    indices = np.where(df_binned == label)[0]
    bin_values = data[indices]
    bin_mean = np.mean(bin_values)
    bin_means[label] = bin_mean
    data[indices] = bin_mean

print("\nData after bin smoothing by mean:")
print(data)


Data after bin smoothing by mean:
[55 64 43 34 64 23 34 55 34 43 23 23 43 55 55 43 23 34 23 43 64 43 55 23
 34 55 23 34 64 43 64 43 55 43 34 34 64 64 64 64 23 55 64 23 34 23 55 34
 23 43 34 64 23 43 23 34 43 64 23 64 23 64 55 34 34 55 64 55 23 23 23 55
 23 43 34 43 64 55 23 55 34 43 64 34 23 34 43 55 34 34 64 34 64 43 43 43
 64 55 43 34]
