In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.impute import KNNImputer

In [11]:
# Read the raw dataset from its CSV file into the DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists; consider a path relative to the project.
csv_path = r"C:\Users\Vaishnavi Kadam\Desktop\Python\ML_KNN\Data.csv"
df = pd.read_csv(csv_path)

In [12]:
# Preview the top of the dataset (DataFrame.head() returns the first 5 rows by default)
first_rows = df.head()
print(first_rows)

   ID       Name   Age  Gender   Salary  Target
0   1      Sarah  25.0  Female  50000.0       0
1   2     Ophrah  30.0    Male  60000.0       1
2   3     Torben  22.0    Male  70000.0       0
3   4  Masharahu  35.0    Male  80000.0       1
4   5       Kaya   NaN  Female  55000.0       0


In [13]:
# Count the missing (NaN) entries in each column; isnull() is an alias of isna()
missing_values = df.isnull().sum()
print(missing_values)

ID        0
Name      0
Age       1
Gender    0
Salary    1
Target    0
dtype: int64


In [14]:
# Keep only the rows that have no missing value in any column.
# dropna() returns a new DataFrame, so `df` itself is left untouched.
df_cleaned = df.dropna(axis=0, how='any')
print(df_cleaned)

   ID       Name   Age  Gender   Salary  Target
0   1      Sarah  25.0  Female  50000.0       0
1   2     Ophrah  30.0    Male  60000.0       1
2   3     Torben  22.0    Male  70000.0       0
3   4  Masharahu  35.0    Male  80000.0       1


In [15]:
# Replace the missing values in the 'Age' column with the mean of the column.
# Series.mean() skips NaN by default, so the mean is taken over the observed ages only.
age_mean = df['Age'].mean()

# Assign the filled Series back to the column rather than calling
# df['Age'].fillna(..., inplace=True): that chained in-place call works on an
# intermediate Series, is deprecated in pandas 2.x and does not update `df`
# when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(age_mean)
print(df)

   ID       Name   Age  Gender   Salary  Target
0   1      Sarah  25.0  Female  50000.0       0
1   2     Ophrah  30.0    Male  60000.0       1
2   3     Torben  22.0    Male  70000.0       0
3   4  Masharahu  35.0    Male  80000.0       1
4   5       Kaya  28.2  Female  55000.0       0
5   6    Abandon  29.0    Male      NaN       1


In [16]:
# Label encoding: replace each distinct 'Gender' string with an integer code.
# The fitted encoder is kept in `le` so the mapping remains available afterwards.
le = LabelEncoder()
gender_codes = le.fit_transform(df['Gender'])

# Overwrite the text column with its integer codes (in this data: Female -> 0, Male -> 1)
df['Gender'] = gender_codes

# Show the dataset after encoding
print(df)

   ID       Name   Age  Gender   Salary  Target
0   1      Sarah  25.0       0  50000.0       0
1   2     Ophrah  30.0       1  60000.0       1
2   3     Torben  22.0       1  70000.0       0
3   4  Masharahu  35.0       1  80000.0       1
4   5       Kaya  28.2       0  55000.0       0
5   6    Abandon  29.0       1      NaN       1


In [17]:
# One-hot encoding converts a categorical column into one binary indicator column
# per category, a numerical format that models can consume.
# NOTE(review): 'Gender' has already been label-encoded in `df` by an earlier cell,
# so the indicator columns are named after the integer codes (Gender_0 / Gender_1)
# rather than the original category labels.
df_encoded = pd.get_dummies(data=df, columns=['Gender'])

# Show the one-hot encoded dataset (`df` itself is not modified by get_dummies)
print(df_encoded)

   ID       Name   Age   Salary  Target  Gender_0  Gender_1
0   1      Sarah  25.0  50000.0       0         1         0
1   2     Ophrah  30.0  60000.0       1         0         1
2   3     Torben  22.0  70000.0       0         0         1
3   4  Masharahu  35.0  80000.0       1         0         1
4   5       Kaya  28.2  55000.0       0         1         0
5   6    Abandon  29.0      NaN       1         0         1


In [18]:
# Min-Max scaling linearly rescales each feature so that its minimum maps to 0
# and its maximum maps to 1 (the scaler's default feature range).
scaler = MinMaxScaler()

# Scale 'Age' and 'Salary' and write the result back into `df`.
# The missing 'Salary' value is passed through as NaN.
minmax_columns = ['Age', 'Salary']
df[minmax_columns] = scaler.fit_transform(df[minmax_columns])

# Show the scaled dataset
print(df)

   ID       Name       Age  Gender    Salary  Target
0   1      Sarah  0.230769       0  0.000000       0
1   2     Ophrah  0.615385       1  0.333333       1
2   3     Torben  0.000000       1  0.666667       0
3   4  Masharahu  1.000000       1  1.000000       1
4   5       Kaya  0.476923       0  0.166667       0
5   6    Abandon  0.538462       1       NaN       1


In [19]:
# Z-score scaling (standardization) transforms each feature to have mean 0 and
# standard deviation 1, which makes features comparable and stops those with
# larger scales from dominating the learning process.
# `scaler` is rebound here, replacing the MinMaxScaler from the previous cell.
scaler = StandardScaler()

# Standardize 'Age' and 'Salary' and write the result back into `df`.
# NOTE: these columns were already min-max scaled in place by the previous cell.
standardized_columns = ['Age', 'Salary']
df[standardized_columns] = scaler.fit_transform(df[standardized_columns])

# Show the standardized dataset
print(df)

   ID       Name           Age  Gender    Salary  Target
0   1      Sarah -7.885825e-01       0 -1.207020       0
1   2     Ophrah  4.435777e-01       1 -0.278543       1
2   3     Torben -1.527879e+00       1  0.649934       0
3   4  Masharahu  1.675738e+00       1  1.578410       1
4   5       Kaya  5.335093e-16       0 -0.742781       0
5   6    Abandon  1.971456e-01       1       NaN       1


In [20]:
# Separate the label column from the remaining columns.
# NOTE(review): X still contains the 'ID' and 'Name' columns and the missing
# 'Salary' value; confirm whether they should be used as model features.
y = df['Target']
X = df.drop(columns='Target')

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each set
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 4
Testing set size: 2


In [22]:
# Remove outliers from the dataset using the z-score method.
# Outliers can significantly skew results, leading to inaccurate predictions and
# poor model performance, especially for algorithms sensitive to extreme values.

# A row is treated as an outlier when its 'Age' lies this many standard
# deviations (or more) away from the column mean.
Z_SCORE_THRESHOLD = 3

# nan_policy='omit' computes the mean and standard deviation from the non-missing
# values only. With the default ('propagate') a single NaN turns every z-score
# into NaN, and the comparison below would then silently drop all rows.
z_scores = np.abs(stats.zscore(df['Age'], nan_policy='omit'))

# Keep rows within the threshold. Rows whose 'Age' is missing have a NaN z-score;
# they are kept as well, since a missing value is not an outlier.
# NOTE: this rebinds `df_cleaned`, replacing the result of the earlier dropna() cell.
df_cleaned = df[np.isnan(z_scores) | (z_scores < Z_SCORE_THRESHOLD)]

print(df_cleaned)

   ID       Name           Age  Gender    Salary  Target
0   1      Sarah -7.885825e-01       0 -1.207020       0
1   2     Ophrah  4.435777e-01       1 -0.278543       1
2   3     Torben -1.527879e+00       1  0.649934       0
3   4  Masharahu  1.675738e+00       1  1.578410       1
4   5       Kaya  5.335093e-16       0 -0.742781       0
5   6    Abandon  1.971456e-01       1       NaN       1


In [25]:
# Impute missing values with KNN imputation.
# KNN imputation fills a missing value from the values of the k nearest
# neighbours, using the similarity between rows to estimate what is missing.

# Columns handed to the imputer ...
numeric_cols = ['Age', 'Salary']
# ... and columns carried over unchanged (identifier, name, encoded gender, label)
non_numeric_cols = ['ID', 'Name', 'Gender', 'Target']

# Impute only 'Age' and 'Salary', using the 3 nearest rows
imputer = KNNImputer(n_neighbors=3)
imputed_values = imputer.fit_transform(df[numeric_cols])
df_numeric_imputed = pd.DataFrame(imputed_values, columns=numeric_cols)

# Both parts are given a fresh 0..n-1 index so that concat lines the rows up
# by position; the carried-over columns come first in the result.
passthrough_part = df[non_numeric_cols].reset_index(drop=True)
df_imputed = pd.concat([passthrough_part, df_numeric_imputed], axis=1)

# Show the dataset with imputed values
print(df_imputed)

   ID       Name  Gender  Target           Age    Salary
0   1      Sarah       0       0 -7.885825e-01 -1.207020
1   2     Ophrah       1       1  4.435777e-01 -0.278543
2   3     Torben       1       0 -1.527879e+00  0.649934
3   4  Masharahu       1       1  1.675738e+00  1.578410
4   5       Kaya       0       0  5.335093e-16 -0.742781
5   6    Abandon       1       1  1.971456e-01 -0.742781
