In [50]:
import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

* The function below takes a dataframe, a column name and a list of replacement values, and replaces each NaN in that column with a value picked at random from the list.

In [52]:
def replace_nan_with_random_list(df, column, replacement_list, rng=None):
    """
    Fill the NaN entries of ``df[column]`` in place with random picks from a list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place; nothing is returned.
    column : label
        Name of the column whose NaN values are replaced.
    replacement_list : sequence
        Candidate values; each NaN receives one value drawn with replacement.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``numpy.random`` state
        is used, so ``np.random.seed`` still controls the outcome.

    Raises
    ------
    ValueError
        Propagated from NumPy when there are NaNs to fill but
        ``replacement_list`` is empty.
    """
    # Use a positional boolean mask instead of index labels: selecting by
    # label would also match non-NaN rows when the index has duplicates.
    nan_mask = df[column].isna().to_numpy()
    num_nans = int(nan_mask.sum())
    if num_nans == 0:
        return  # nothing to fill

    chooser = np.random if rng is None else rng
    df.loc[nan_mask, column] = chooser.choice(replacement_list, size=num_nans)

* The function below takes a dataframe and a column name and replaces each NaN in that column
  with a value chosen at random from the column's existing non-NaN values.

In [53]:
def replace_nan_with_existing_val(df, column, rng=None):
    """
    Fill the NaN entries of ``df[column]`` in place with values already in it.

    Each NaN is replaced by a value drawn at random (with replacement) from
    the non-NaN entries of the same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place; nothing is returned.
    column : label
        Name of the column whose NaN values are replaced.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``numpy.random`` state
        is used, so ``np.random.seed`` still controls the outcome.

    Raises
    ------
    ValueError
        If the column has NaNs to fill but no non-NaN value to draw from.
    """
    # Look the column up by the `column` argument; attribute access
    # (`df.column`) would search for a column literally named "column".
    nan_mask = df[column].isna().to_numpy()
    num_nans = int(nan_mask.sum())
    if num_nans == 0:
        return  # nothing to fill

    # The non-NaN entries form the pool of replacement candidates.
    replacement_values = df.loc[~nan_mask, column].to_numpy()
    if replacement_values.size == 0:
        raise ValueError(
            f"column {column!r} contains only NaN values; nothing to sample from"
        )

    chooser = np.random if rng is None else rng
    # Positional mask keeps the assignment correct even with duplicate index labels.
    df.loc[nan_mask, column] = chooser.choice(replacement_values, size=num_nans)

In [68]:
def preprocess_dataframe(numerical_features, categorical_features, df):
    """
    Impute, scale and one-hot encode the selected columns of a dataframe.

    Numerical columns have NaN replaced by the column mean and are then
    passed through ``StandardScaler``. Categorical columns have NaN replaced
    by the most frequent value and are then one-hot encoded, with unknown
    categories ignored.

    Parameters
    ----------
    numerical_features : list
        Column names handled by the numerical pipeline.
    categorical_features : list
        Column names handled by the categorical pipeline.
    df : pandas.DataFrame
        Input data; the preprocessor is fitted on it and applied to it.

    Returns
    -------
    tuple
        ``(X_preprocessed, feature_names)``: the transformed data and the
        output feature names reported by the fitted ``ColumnTransformer``.
        NOTE(review): the transformed data may come back as a SciPy sparse
        matrix for mostly-zero outputs; confirm against the ColumnTransformer
        documentation before wrapping it in a DataFrame.
    """
    # Steps applied to every numerical column, in order.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]

    # Steps applied to every categorical column, in order.
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Route each group of columns to its own pipeline.
    column_pipelines = [
        ('num', Pipeline(steps=numeric_steps), numerical_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ]
    preprocessor = ColumnTransformer(transformers=column_pipelines)

    # Fit first, then read the feature names from the fitted transformer.
    transformed = preprocessor.fit_transform(df)
    output_names = preprocessor.get_feature_names_out()

    return (transformed, output_names)



In [4]:
# Sample data: a single column A containing two NaN entries.
data = {'A': [1, 2, np.nan, 4, 5, np.nan, 7, 8, 9]}

In [5]:
# Build the demo frame from the sample dict.
df = pd.DataFrame(data)

In [6]:
df

Unnamed: 0,A
0,1.0
1,2.0
2,
3,4.0
4,5.0
5,
6,7.0
7,8.0
8,9.0


In [11]:
# Collect the index labels of the rows where column A is NaN, then count them.
nan_index = df.index[df.A.isna()].tolist()
len(nan_index)

2

In [12]:
# Collect the index labels of the rows where column A is not NaN, then count them.
value_index = df.index[df.A.notna()].tolist()
len(value_index)

7

In [18]:
# Values of column A at the non-NaN rows; these are the fill candidates.
replacement_list = df.loc[value_index,'A'].values.tolist()

In [19]:
replacement_list

[1.0, 2.0, 4.0, 5.0, 7.0, 8.0, 9.0]

In [20]:
# Fill the NaNs of column A in place with random picks from replacement_list.
replace_nan_with_random_list(df,'A',replacement_list=replacement_list)

In [21]:
df

Unnamed: 0,A
0,1.0
1,2.0
2,8.0
3,4.0
4,5.0
5,7.0
6,7.0
7,8.0
8,9.0


In [24]:
# Second sample column with two NaN entries, used below to demo
# replace_nan_with_existing_val.
data2 = { 'A' : [ 77 , 88 ,99 , np.nan, 110 , 122, 133, np.nan] }
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,A
0,77.0
1,88.0
2,99.0
3,
4,110.0
5,122.0
6,133.0
7,


In [48]:
%whos

Variable                        Type         Data/Info
------------------------------------------------------
data                            dict         n=1
data2                           dict         n=1
data4                           dict         n=3
data5                           dict         n=3
df                              DataFrame         A\n0  1.0\n1  2.0\n2<...>0\n6  7.0\n7  8.0\n8  9.0
df2                             DataFrame           A\n0   77.0\n1   8<...>122.0\n6  133.0\n7  133.0
df3                             DataFrame       C\n0  7\n1  8\n2  9
df4                             DataFrame          A       B      C\n0<...>N\n3  None     NaN  False
nan_index                       list         n=2
np                              module       <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
pd                              module       <module 'pandas' from 'C:<...>es\\pandas\\__init__.py'>
preprocess_dataframe            function     <function preprocess_data<.

In [29]:
# Replace each NaN in column A of df2 (in place) with a randomly chosen
# non-NaN value from that same column.
replace_nan_with_existing_val (df2,'A')

In [30]:
df2

Unnamed: 0,A
0,77.0
1,88.0
2,99.0
3,88.0
4,110.0
5,122.0
6,133.0
7,133.0


In [32]:
# Three-column sample with no missing values, used below to demo dropping columns.
data5 = { 'A' : [ 1, 2,3] , 'B' : [4,5,6] , 'C' : [ 7,8,9] }
data5

{'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}

In [33]:
# Build the frame whose columns are dropped in the next step.
df3 = pd.DataFrame(data5)

In [34]:
df3

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [36]:
# DataFrame.drop accepts a list, so several columns can be removed in one call.
# NOTE: inplace=True mutates df3 itself, so this cell is not safe to re-run
# once columns A and B are gone.
df3.drop(columns = [ 'A','B'] ,inplace=True)

In [37]:
df3

Unnamed: 0,C
0,7
1,8
2,9


In [61]:
# Mixed-type sample: string column A, numeric column B with one NaN, boolean column C.
data4 = { 'A' : [ 'A1','A2','A3','A4' ] , 'B' : [ 1 ,10 , 1000, np.nan] , 'C' : [ True, False , False , False] }

In [62]:
data4

{'A': ['A1', 'A2', 'A3', 'A4'],
 'B': [1, 10, 1000, nan],
 'C': [True, False, False, False]}

In [63]:
# Frame passed to preprocess_dataframe below.
df4 = pd.DataFrame(data4)

In [64]:
df4

Unnamed: 0,A,B,C
0,A1,1.0,True
1,A2,10.0,False
2,A3,1000.0,False
3,A4,,False


In [69]:
# Treat B as numeric and A, C as categorical, then run the preprocessing.
# preprocess_dataframe returns the transformed data and the output feature names.
numeric_columns =  ['B']
catergorical_columns = [ 'A','C']
X , feature_names= preprocess_dataframe(numeric_columns,catergorical_columns,df4)

In [70]:
X

array([[-0.82755483,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [-0.80538818,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 1.63294301,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         1.        ,  0.        ]])

In [67]:
X.shape

(4, 7)

In [71]:
feature_names

array(['num__B', 'cat__A_A1', 'cat__A_A2', 'cat__A_A3', 'cat__A_A4',
       'cat__C_False', 'cat__C_True'], dtype=object)

In [72]:
# Wrap the transformed data in a DataFrame labelled with the generated feature names.
df5 = pd.DataFrame(data = X , columns = feature_names)

In [73]:
df5

Unnamed: 0,num__B,cat__A_A1,cat__A_A2,cat__A_A3,cat__A_A4,cat__C_False,cat__C_True
0,-0.827555,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.805388,0.0,1.0,0.0,0.0,1.0,0.0
2,1.632943,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0
