<a href="https://colab.research.google.com/github/tbarnette70/tbarnette70/blob/main/Krakowiak_scikitlearn_simpleimputer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd


# Toy product table; np.nan marks the entries that are missing.
data = dict(
    size=['XL', 'L', 'M', np.nan, 'M', 'M'],
    color=['red', 'green', 'blue', 'green', 'red', 'green'],
    gender=['female', 'male', np.nan, 'female', 'female', 'male'],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=['yes', 'no', 'yes', 'no', 'yes', 'no'],
)

df = pd.DataFrame(data)

# Fraction of missing entries per column, rounded to two decimals.
missing_share = df.isnull().sum().div(len(df)).round(2)
print(missing_share)

size      0.17
color     0.00
gender    0.17
price     0.17
weight    0.33
bought    0.00
dtype: float64



[scikit-learn documentation: `SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)


In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
 
 
# Rebuild the sample table inside this cell before imputing.
data = dict(
    size=['XL', 'L', 'M', np.nan, 'M', 'M'],
    color=['red', 'green', 'blue', 'green', 'red', 'green'],
    gender=['female', 'male', np.nan, 'female', 'female', 'male'],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=['yes', 'no', 'yes', 'no', 'yes', 'no'],
)

df = pd.DataFrame(data)
print(df)

# Fit a mean-strategy imputer on the 'weight' column, then write the
# transformed (gap-free) values back into the frame.
weight_column = df[['weight']]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(weight_column)
df[['weight']] = imputer.transform(weight_column)
print(df)

  size  color  gender  price  weight bought
0   XL    red  female  199.0   500.0    yes
1    L  green    male   89.0   450.0     no
2    M   blue     NaN    NaN   300.0    yes
3  NaN  green  female  129.0     NaN     no
4    M    red  female   79.0   410.0    yes
5    M  green    male   89.0     NaN     no
  size  color  gender  price  weight bought
0   XL    red  female  199.0   500.0    yes
1    L  green    male   89.0   450.0     no
2    M   blue     NaN    NaN   300.0    yes
3  NaN  green  female  129.0   415.0     no
4    M    red  female   79.0   410.0    yes
5    M  green    male   89.0   415.0     no


In [4]:
# statistics_ stores the fill value the fitted imputer uses for each column;
# index 0 is the first (here: only) fitted column.
learned_fill = imputer.statistics_[0]
print(learned_fill)

415.0


In [5]:
# Fill missing prices with the fixed value 99.0 instead of a learned statistic.
price_column = df[['price']]
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=99.0).fit(price_column)
df[['price']] = imputer.transform(price_column)
print(df)

  size  color  gender  price  weight bought
0   XL    red  female  199.0   500.0    yes
1    L  green    male   89.0   450.0     no
2    M   blue     NaN   99.0   300.0    yes
3  NaN  green  female  129.0   415.0     no
4    M    red  female   79.0   410.0    yes
5    M  green    male   89.0   415.0     no


In [6]:
# Fill missing sizes with the value that occurs most often in the column.
size_column = df[['size']]
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(size_column)
df[['size']] = imputer.transform(size_column)
print(df)

  size  color  gender  price  weight bought
0   XL    red  female  199.0   500.0    yes
1    L  green    male   89.0   450.0     no
2    M   blue     NaN   99.0   300.0    yes
3    M  green  female  129.0   415.0     no
4    M    red  female   79.0   410.0    yes
5    M  green    male   89.0   415.0     no


In [7]:
# Column means over the rows whose 'weight' is present.
# numeric_only=True restricts the reduction to the numeric columns: the frame
# also holds string columns, which pandas >= 2.0 refuses to average (TypeError)
# and which older versions dropped only after emitting a FutureWarning.
print(df.loc[df['weight'].notnull()].mean(numeric_only=True))


price     114.0
weight    415.0
dtype: float64


  """Entry point for launching an IPython kernel.


In [8]:
# Take only the object-dtype columns and replace their missing entries
# with the placeholder string 'empty'.
object_columns = df.select_dtypes(include=['object'])
df_object = object_columns.fillna('empty')
print(df_object)

  size  color  gender bought
0   XL    red  female    yes
1    L  green    male     no
2    M   blue   empty    yes
3    M  green  female     no
4    M    red  female    yes
5    M  green    male     no


In [9]:
# New single-column frame of weights, binned into three equal-width intervals
# spanning the observed range.
weights = [75., 78.5, 85., 91., 84.5, 83., 68.]
df = pd.DataFrame({'weight': weights}).assign(
    weight_cut=lambda frame: pd.cut(frame['weight'], bins=3)
)
print(df)

   weight        weight_cut
0    75.0  (67.977, 75.667]
1    78.5  (75.667, 83.333]
2    85.0    (83.333, 91.0]
3    91.0    (83.333, 91.0]
4    84.5    (83.333, 91.0]
5    83.0  (75.667, 83.333]
6    68.0  (67.977, 75.667]


In [10]:
# Bin by explicit edges; the resulting intervals are (60, 75], (75, 80], (80, 95].
weight_edges = (60, 75, 80, 95)
df['weight_cut'] = pd.cut(df['weight'], bins=weight_edges)
print(df)

   weight weight_cut
0    75.0   (60, 75]
1    78.5   (75, 80]
2    85.0   (80, 95]
3    91.0   (80, 95]
4    84.5   (80, 95]
5    83.0   (80, 95]
6    68.0   (60, 75]


In [11]:
# Same explicit edges, but each interval gets a readable label
# (one label per interval, in ascending order).
weight_edges = (60, 75, 80, 95)
weight_names = ['light', 'normal', 'heavy']
df['weight_cut'] = pd.cut(df['weight'], bins=weight_edges, labels=weight_names)
print(df)

   weight weight_cut
0    75.0      light
1    78.5     normal
2    85.0      heavy
3    91.0      heavy
4    84.5      heavy
5    83.0      heavy
6    68.0      light
