pip install scikit-learn

In [1]:
import pandas as pd #pandas for data preprocessing
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import  SimpleImputer
from sklearn.pipeline import Pipeline

In [5]:
penguins = sns.load_dataset("penguins")
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [6]:
penguins.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

Separate Numerical and Categorical Columns

In [7]:
num_cols = penguins.select_dtypes('number').columns
print(num_cols.tolist())
# finding number columns

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']


In [8]:
cat_cols = penguins.select_dtypes(exclude='number').columns
print(cat_cols.tolist())
# finding categorical columns

['species', 'island', 'sex']


Handling missing values using SimpleImputer

In [9]:
num_imp = SimpleImputer() # default strategy is 'mean'
cat_imp = SimpleImputer(strategy='most_frequent')

In [10]:
penguins[num_cols] = num_imp.fit_transform(penguins[num_cols])
penguins.isnull().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         0
flipper_length_mm     0
body_mass_g           0
sex                  11
dtype: int64

In [11]:
penguins[cat_cols] = cat_imp.fit_transform(penguins[cat_cols])
penguins.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

Encoding Categorical Values

drop = first, to avoid dummy variable trap.....When we do OneHotEncoding, it creates a new column for each category. if you encode n columns, it creates n new columns. But among all those columns one column can be predicted with the combination of all the columns, this is called dummy variable trap, and it avoids the situation of multicollinearity (which effects linear models)

Multicollinearity means when more than one independent variable os strongly correlated with another independent variable.

In [12]:
sex_enc = OneHotEncoder(drop='first')
sex_dummy = sex_enc.fit_transform(penguins[['sex']]).toarray()
sex_dummy_df = pd.DataFrame(sex_dummy)

In [13]:
sex_dummy_df

Unnamed: 0,0
0,1.0
1,0.0
2,0.0
3,1.0
4,0.0
...,...
339,1.0
340,0.0
341,1.0
342,0.0


In [14]:
cat_cols = ['species', 'island']
cat_enc = OneHotEncoder(drop='first')
dummy_cols = cat_enc.fit_transform(penguins[cat_cols]).toarray()
dummy_df = pd.DataFrame(dummy_cols)

In [15]:
penguins['species'].value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

In [16]:
penguins['island'].value_counts()

island
Biscoe       168
Dream        124
Torgersen     52
Name: count, dtype: int64

In [17]:
dummy_df

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
339,0.0,1.0,0.0,0.0
340,0.0,1.0,0.0,0.0
341,0.0,1.0,0.0,0.0
342,0.0,1.0,0.0,0.0


In [18]:
clean_df = pd.concat([penguins, dummy_df, sex_dummy_df],
                         axis=1).drop(columns=cat_cols+['sex'])
clean_df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,0,1,2,3,0.1
0,39.10000,18.70000,181.000000,3750.000000,0.0,0.0,0.0,1.0,1.0
1,39.50000,17.40000,186.000000,3800.000000,0.0,0.0,0.0,1.0,0.0
2,40.30000,18.00000,195.000000,3250.000000,0.0,0.0,0.0,1.0,0.0
3,43.92193,17.15117,200.915205,4201.754386,0.0,0.0,0.0,1.0,1.0
4,36.70000,19.30000,193.000000,3450.000000,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
339,43.92193,17.15117,200.915205,4201.754386,0.0,1.0,0.0,0.0,1.0
340,46.80000,14.30000,215.000000,4850.000000,0.0,1.0,0.0,0.0,0.0
341,50.40000,15.70000,222.000000,5750.000000,0.0,1.0,0.0,0.0,1.0
342,45.20000,14.80000,212.000000,5200.000000,0.0,1.0,0.0,0.0,0.0


Scale Numeric Values

In [19]:
scaler = StandardScaler()
clean_df[num_cols] = scaler.fit_transform(clean_df[num_cols])
clean_df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,0,1,2,3,0.1
0,-8.870812e-01,7.877425e-01,-1.422488,-0.565789,0.0,0.0,0.0,1.0,1.0
1,-8.134940e-01,1.265563e-01,-1.065352,-0.503168,0.0,0.0,0.0,1.0,0.0
2,-6.663195e-01,4.317192e-01,-0.422507,-1.192003,0.0,0.0,0.0,1.0,0.0
3,-1.307172e-15,1.806927e-15,0.000000,0.000000,0.0,0.0,0.0,1.0,1.0
4,-1.328605e+00,1.092905e+00,-0.565361,-0.941517,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
339,-1.307172e-15,1.806927e-15,0.000000,0.000000,0.0,1.0,0.0,0.0,1.0
340,5.294731e-01,-1.450118e+00,1.006038,0.811880,0.0,1.0,0.0,0.0,0.0
341,1.191758e+00,-7.380718e-01,1.506028,1.939064,0.0,1.0,0.0,0.0,1.0
342,2.351241e-01,-1.195816e+00,0.791756,1.250229,0.0,1.0,0.0,0.0,0.0
