In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


# --- Load the raw data; all cleaning happens on a copy so `df` stays untouched ---
df = pd.read_csv('Diabetes.csv')
df_copy = df.copy()

# --- Clean the CLASS label ---
# Record which entries are missing BEFORE casting to str: astype(str) turns NaN
# into the literal text "nan", after which fillna() has nothing left to fill.
class_is_missing = df_copy["CLASS"].isna()
class_text = df_copy["CLASS"].astype(str).str.strip()
# Missing and blank labels both become "Unknown". The result is assigned back to
# the frame; calling fillna(..., inplace=True) on a column selection operates on
# a temporary object and does not reliably update df_copy.
df_copy["CLASS"] = class_text.mask(class_is_missing | class_text.eq(""), "Unknown")

print("Unique values in CLASS column:", df_copy["CLASS"].unique())

# --- One-hot encode CLASS (one indicator column per distinct label) ---
onehot_encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded_data = onehot_encoder.fit_transform(df_copy[["CLASS"]])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=onehot_encoder.get_feature_names_out(["CLASS"]),
    index=df_copy.index,  # keep rows aligned with df_copy for the concat below
)

# Append the indicator columns and drop the original text label.
df_encoded = pd.concat([df_copy, encoded_df], axis=1).drop(columns="CLASS")

# --- Ordinal-encode Gender ---
# Normalise whitespace and case first so variants such as "f" or "F " map to the
# same category instead of being rejected by the encoder as unknown.
df_encoded["Gender"] = df_encoded["Gender"].str.strip().str.upper()
# The explicit category order fixes the codes: "M" -> 0.0, "F" -> 1.0.
ordinal_encoder = OrdinalEncoder(categories=[["M", "F"]])
df_encoded["Gender_Encoded"] = ordinal_encoder.fit_transform(df_encoded[["Gender"]])
df_encoded = df_encoded.drop(columns="Gender")

# --- Persist the transformed data ---
output_file = "transformed_dataset.csv"
df_encoded.to_csv(output_file, index=False)

print(df_encoded.head())
print(f"Fixed dataset saved: {output_file}")

# Reload the saved file as a round-trip check.
data = pd.read_csv(output_file)
# A negative n makes head() return every row except the last 50.
data.head(-50)


Unique values in CLASS column: ['N' 'P' 'Y']
    ID  No_Pation  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL   BMI  \
0  502      17975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
1  735      34221   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6  23.0   
2  420      47975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
3  680      87656   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
4  504      34223   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4  21.0   

   CLASS_N  CLASS_P  CLASS_Y  Gender_Encoded  
0      1.0      0.0      0.0             1.0  
1      1.0      0.0      0.0             0.0  
2      1.0      0.0      0.0             1.0  
3      1.0      0.0      0.0             1.0  
4      1.0      0.0      0.0             0.0  
Fixed dataset saved: transformed_dataset.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy["CLASS"].fillna("Unknown", inplace=True)  # Fill missing values with "Unknown"


Unnamed: 0,ID,No_Pation,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS_N,CLASS_P,CLASS_Y,Gender_Encoded
0,502,17975,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,0.0,0.0,1.0
1,735,34221,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,1.0,0.0,0.0,0.0
2,420,47975,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,0.0,0.0,1.0
3,680,87656,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,0.0,0.0,1.0
4,504,34223,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,120,565474,51,4.4,67,6.8,6.7,3.7,0.9,2.9,1.7,32.0,0.0,0.0,1.0,0.0
946,121,65756,62,4.8,52,11.8,3.7,0.8,0.8,2.6,0.3,33.0,0.0,0.0,1.0,1.0
947,122,345,60,3.3,59,7.6,3.5,1.0,1.3,1.5,0.7,30.0,0.0,0.0,1.0,0.0
948,123,5676,60,3.4,27,14.7,3.5,1.9,1.3,1.6,1.4,35.0,0.0,0.0,1.0,1.0


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


def find_column_case_insensitive(df, target):
    """Return the first column label of ``df`` matching ``target``, ignoring case.

    Args:
        df: Object exposing an iterable ``columns`` attribute (e.g. a DataFrame).
        target: Column name to look for; compared case-insensitively.

    Returns:
        The matching label exactly as it appears in ``df.columns``, or ``None``
        when no column matches.
    """
    wanted = target.lower()
    for col in df.columns:
        # Labels are not guaranteed to be strings (e.g. integer labels from a
        # header-less CSV); those cannot match a name and have no .lower().
        if isinstance(col, str) and col.lower() == wanted:
            return col
    return None


# --- Load the raw data; all cleaning happens on a copy so `df` stays untouched ---
df = pd.read_csv('adult.csv')
df_copy = df.copy()

# --- Clean the income label ---
# Record which entries are missing BEFORE casting to str: astype(str) turns NaN
# into the literal text "nan", after which fillna() has nothing left to fill.
income_is_missing = df_copy["income"].isna()
income_text = df_copy["income"].astype(str).str.strip()
# Missing and blank labels both become "Unknown".
df_copy["income"] = income_text.mask(income_is_missing | income_text.eq(""), "Unknown")

print("Unique values in income column:", df_copy["income"].unique())

# --- One-hot encode income (one indicator column per distinct label) ---
onehot_encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded_data = onehot_encoder.fit_transform(df_copy[["income"]])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=onehot_encoder.get_feature_names_out(["income"]),
    index=df_copy.index,  # keep rows aligned with df_copy for the concat below
)

# Append the indicator columns and drop the original text label.
df_encoded = pd.concat([df_copy, encoded_df], axis=1).drop(columns="income")

# --- Locate the sex/gender column, whichever name and casing the file uses ---
gender_col = find_column_case_insensitive(df_encoded, "sex")
if gender_col is None:
    gender_col = find_column_case_insensitive(df_encoded, "gender")
if gender_col is None:
    raise KeyError("No column for 'sex' or 'gender' found in the dataset.")

# Normalise whitespace and case so the values line up with the encoder's
# upper-case category list below.
df_encoded[gender_col] = df_encoded[gender_col].astype(str).str.strip().str.upper()

# The explicit category order fixes the codes: "MALE" -> 0.0, "FEMALE" -> 1.0.
ordinal_encoder = OrdinalEncoder(categories=[["MALE", "FEMALE"]])
df_encoded["Sex_Encoded"] = ordinal_encoder.fit_transform(df_encoded[[gender_col]])
df_encoded = df_encoded.drop(columns=gender_col)

# --- Persist the transformed data ---
output_file = "transformed_adult_dataset.csv"
df_encoded.to_csv(output_file, index=False)

print("\n📊 Preview of Transformed Dataset:")
print(df_encoded.head())
print(f"\n✅ Fixed dataset saved: {output_file}")

# Reload the saved file as a round-trip check.
data = pd.read_csv(output_file)
print("\n📋 Sample of the final dataset:")
# A negative n makes head() return every row except the last 50.
print(data.head(-50))


Unique values in income column: ['<=50K' '>50K']

📊 Preview of Transformed Dataset:
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black             0             0   
1    Farming-fishing      Husband  White             0             0   
2    Protective-serv      Husband  White             0             0   
3  Machine-op-inspct      Husband  Black          7688             0   
4                  ?    Own-child  White             0             0   

  