In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer

try:
    diabetes_df = pd.read_csv('diabetes.csv')
    adult_df = pd.read_csv('adult.csv')
except FileNotFoundError:
    print("Error: Please upload 'diabetes.csv' and 'adult.csv' to your Google Colab environment.")
    exit()

In [None]:
diabetes_df.head(10)

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N
5,634,34224,F,45,2.3,24,4.0,2.9,1.0,1.0,1.5,0.4,21.0,N
6,721,34225,F,50,2.0,50,4.0,3.6,1.3,0.9,2.1,0.6,24.0,N
7,421,34227,M,48,4.7,47,4.0,2.9,0.8,0.9,1.6,0.4,24.0,N
8,670,34229,M,43,2.6,67,4.0,3.8,0.9,2.4,3.7,1.0,21.0,N
9,759,34230,F,32,3.6,28,4.0,3.8,2.0,2.4,3.8,1.0,24.0,N


In [None]:
adult_df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [None]:
diabetes_df.shape

(1000, 14)

In [None]:
adult_df.shape

(48842, 15)

In [None]:
# Step 1: Identify numeric and categorical columns
diabetes_numeric_cols = diabetes_df.select_dtypes(include=[np.number]).columns
diabetes_categorical_cols = diabetes_df.select_dtypes(exclude=[np.number]).columns

adult_numeric_cols = adult_df.select_dtypes(include=[np.number]).columns
adult_categorical_cols = adult_df.select_dtypes(exclude=[np.number]).columns

# Step 2: Handle missing values in numeric columns using mean imputation
diabetes_numeric_imputer = SimpleImputer(strategy='mean')
adult_numeric_imputer = SimpleImputer(strategy='mean')

# Apply imputer for numeric columns
diabetes_df[diabetes_numeric_cols] = diabetes_numeric_imputer.fit_transform(diabetes_df[diabetes_numeric_cols])
adult_df[adult_numeric_cols] = adult_numeric_imputer.fit_transform(adult_df[adult_numeric_cols])

# Step 3: Handle missing values in categorical columns using most frequent imputation
diabetes_categorical_imputer = SimpleImputer(strategy='most_frequent')
adult_categorical_imputer = SimpleImputer(strategy='most_frequent')

# Apply imputer for categorical columns
diabetes_df[diabetes_categorical_cols] = diabetes_categorical_imputer.fit_transform(diabetes_df[diabetes_categorical_cols])
adult_df[adult_categorical_cols] = adult_categorical_imputer.fit_transform(adult_df[adult_categorical_cols])

# Step 4: Verify if any missing values remain
print("Missing values in Diabetes dataset after imputation:")
print(diabetes_df.isnull().sum())

print("Missing values in Adult Income dataset after imputation:")
print(adult_df.isnull().sum())


Missing values in Diabetes dataset after imputation:
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64
Missing values in Adult Income dataset after imputation:
age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [None]:
from sklearn.preprocessing import LabelEncoder

# Label Encoder initialization
label_encoder = LabelEncoder()

# Encode categorical columns in Diabetes dataset
for col in diabetes_categorical_cols:
    diabetes_df[col] = label_encoder.fit_transform(diabetes_df[col])

# Encode categorical columns in Adult Income dataset
for col in adult_categorical_cols:
    adult_df[col] = label_encoder.fit_transform(adult_df[col])

# Check the encoded columns
print("Encoded columns in Diabetes dataset:")
print(diabetes_df.head())

print("Encoded columns in Adult Income dataset:")
print(adult_df.head())


Encoded columns in Diabetes dataset:
      ID  No_Pation  Gender   AGE  Urea    Cr  HbA1c  Chol   TG  HDL  LDL  \
0  502.0    17975.0       0  50.0   4.7  46.0    4.9   4.2  0.9  2.4  1.4   
1  735.0    34221.0       1  26.0   4.5  62.0    4.9   3.7  1.4  1.1  2.1   
2  420.0    47975.0       0  50.0   4.7  46.0    4.9   4.2  0.9  2.4  1.4   
3  680.0    87656.0       0  50.0   4.7  46.0    4.9   4.2  0.9  2.4  1.4   
4  504.0    34223.0       1  33.0   7.1  46.0    4.9   4.9  1.0  0.8  2.0   

   VLDL   BMI  CLASS  
0   0.5  24.0      0  
1   0.6  23.0      0  
2   0.5  24.0      0  
3   0.5  24.0      0  
4   0.4  21.0      0  
Encoded columns in Adult Income dataset:
    age  workclass    fnlwgt  education  educational-num  marital-status  \
0  25.0          4  226802.0          1              7.0               4   
1  38.0          4   89814.0         11              9.0               2   
2  28.0          2  336951.0          7             12.0               2   
3  44.0          

In [None]:
def remove_outliers(df):
    # Calculate Q1, Q3, and IQR
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    # Identify outliers
    df_no_outliers = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df_no_outliers

# Apply outlier removal to both datasets
diabetes_df_no_outliers = remove_outliers(diabetes_df)
adult_df_no_outliers = remove_outliers(adult_df)

# Check data after removing outliers
print("Diabetes dataset shape after removing outliers:", diabetes_df_no_outliers.shape)
print("Adult Income dataset shape after removing outliers:", adult_df_no_outliers.shape)

Diabetes dataset shape after removing outliers: (556, 14)
Adult Income dataset shape after removing outliers: (11844, 15)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize Min-Max Scaler
min_max_scaler = MinMaxScaler()

# Apply Min-Max Scaling to both datasets
diabetes_scaled_minmax = pd.DataFrame(min_max_scaler.fit_transform(diabetes_df_no_outliers), columns=diabetes_df_no_outliers.columns)
adult_scaled_minmax = pd.DataFrame(min_max_scaler.fit_transform(adult_df_no_outliers), columns=adult_df_no_outliers.columns)

# Check data after Min-Max scaling
print("Diabetes dataset after Min-Max scaling:")
print(diabetes_scaled_minmax.head())

print("Adult Income dataset after Min-Max scaling:")
print(adult_scaled_minmax.head())

Diabetes dataset after Min-Max scaling:
         ID  No_Pation  Gender       AGE      Urea        Cr     HbA1c  \
0  0.892231   0.452186     0.5  0.193548  0.328947  0.690476  0.230769   
1  0.106516   0.316206     0.0  0.354839  0.381579  0.404762  0.900000   
2  0.286967   0.188926     0.0  0.354839  0.381579  0.404762  0.900000   
3  0.630326   0.303545     0.0  0.290323  0.381579  0.190476  0.346154   
4  0.006266   0.452318     0.0  0.225806  0.250000  0.440476  0.238462   

       Chol        TG  HDL       LDL      VLDL   BMI  CLASS  
0  0.694915  0.723404  0.2  0.673469  0.761905  0.15    0.0  
1  0.406780  0.361702  0.4  0.387755  0.380952  0.50    0.0  
2  0.406780  0.361702  0.4  0.387755  0.380952  0.50    0.0  
3  0.406780  0.425532  0.6  0.326531  0.428571  0.20    0.0  
4  0.627119  0.744681  0.6  0.448980  0.761905  0.25    0.0  
Adult Income dataset after Min-Max scaling:
        age  workclass    fnlwgt  education  educational-num  marital-status  \
0  0.344262        

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize Standard Scaler
standard_scaler = StandardScaler()

# Apply Standard Scaling to both datasets
diabetes_scaled_standard = pd.DataFrame(standard_scaler.fit_transform(diabetes_df_no_outliers), columns=diabetes_df_no_outliers.columns)
adult_scaled_standard = pd.DataFrame(standard_scaler.fit_transform(adult_df_no_outliers), columns=adult_df_no_outliers.columns)

# Check data after Standard scaling
print("Diabetes dataset after Standard scaling:")
print(diabetes_scaled_standard.head())

print("Adult Income dataset after Standard scaling:")
print(adult_scaled_standard.head())


Diabetes dataset after Standard scaling:
         ID  No_Pation    Gender       AGE      Urea        Cr     HbA1c  \
0  1.634136   0.322377  0.947475 -2.344564 -0.661056  1.293338 -1.688590   
1 -1.060798  -0.421699 -1.040434 -1.280766 -0.395535 -0.104775  2.040328   
2 -0.441866  -1.118165 -1.040434 -1.280766 -0.395535 -0.104775  2.040328   
3  0.735825  -0.490976 -1.040434 -1.706285 -0.395535 -1.153359 -1.045673   
4 -1.404650   0.323102 -1.040434 -2.131805 -1.059337  0.069989 -1.645729   

       Chol        TG       HDL       LDL      VLDL       BMI  CLASS  
0  1.126537  1.485468 -1.284433  1.348145  1.482732 -2.141359    0.0  
1 -0.410418 -0.195241 -0.352859 -0.059949 -0.268049 -0.394447    0.0  
2 -0.410418 -0.195241 -0.352859 -0.059949 -0.268049 -0.394447    0.0  
3 -0.410418  0.101355  0.578715 -0.361683 -0.049201 -1.891800    0.0  
4  0.764900  1.584333  0.578715  0.241786  1.482732 -1.642241    0.0  
Adult Income dataset after Standard scaling:
        age  workclass    fnlwg