# Preprocessing mit scikit-learn

In [1]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# HVAC readings with missing values, loaded straight from the public GitHub repository.
HVAC_CSV_URL = 'https://raw.githubusercontent.com/kjam/data-cleaning-101/master/data/HVAC_with_nulls.csv'
hvac = pd.read_csv(HVAC_CSV_URL)

## Überprüfen der Datenqualität

In [3]:
# Inspect the inferred column types: Date and Time are read as plain strings (object).
hvac.dtypes

Date           object
Time           object
TargetTemp    float64
ActualTemp      int64
System          int64
SystemAge     float64
BuildingID      int64
10            float64
dtype: object

In [4]:
# Number of rows and columns in the raw frame.
hvac.shape

(8000, 8)

In [5]:
# Preview the first rows; TargetTemp and SystemAge already show missing entries here.
hvac.head()

Unnamed: 0,Date,Time,TargetTemp,ActualTemp,System,SystemAge,BuildingID,10
0,6/1/13,0:00:01,66.0,58,13,20.0,4,
1,6/2/13,1:00:01,,68,3,20.0,17,
2,6/3/13,2:00:01,70.0,73,17,20.0,18,
3,6/4/13,3:00:01,67.0,63,2,,15,
4,6/5/13,4:00:01,68.0,74,16,9.0,3,


## Fehlenden Werten den Mittelwert zuschreiben

In [6]:
# Imputer that replaces every NaN with the mean of its column.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

In [7]:
# The two numeric columns whose gaps are filled below.
hvac_numeric = hvac.loc[:, ['TargetTemp', 'SystemAge']]

In [8]:
# Estimate the column means from every row rather than from a small leading
# slice, so the values used for imputation reflect the whole dataset.
# `fit` returns the imputer itself, so rebinding `imp` keeps the same object.
imp = imp.fit(hvac_numeric)

In [9]:
# Re-estimate the column means on the full frame and fill the gaps with them;
# the result therefore does not depend on any earlier `fit` call.
transformed = imp.fit(hvac_numeric).transform(hvac_numeric)

In [10]:
# The imputer returns a plain NumPy array; its columns follow the order of hvac_numeric.
transformed

array([[66.        , 20.        ],
       [67.50773481, 20.        ],
       [70.        , 20.        ],
       ...,
       [67.50773481,  4.        ],
       [65.        , 23.        ],
       [66.        , 21.        ]])

In [11]:
# Write both imputed columns back in one step; the array columns are in the
# order TargetTemp, SystemAge (the order of hvac_numeric).
hvac[['TargetTemp', 'SystemAge']] = transformed

In [12]:
# Re-inspect the frame: the former gaps in TargetTemp and SystemAge now hold the column means.
hvac.head()

Unnamed: 0,Date,Time,TargetTemp,ActualTemp,System,SystemAge,BuildingID,10
0,6/1/13,0:00:01,66.0,58,13,20.0,4,
1,6/2/13,1:00:01,67.507735,68,3,20.0,17,
2,6/3/13,2:00:01,70.0,73,17,20.0,18,
3,6/4/13,3:00:01,67.0,63,2,15.386643,15,
4,6/5/13,4:00:01,68.0,74,16,9.0,3,


## Temperaturwerte standardisieren mit `scale`

In [13]:
# Standardize the measured temperature to zero mean and unit variance.
actual_temp = hvac['ActualTemp']
hvac['ScaledTemp'] = preprocessing.scale(actual_temp)

In [14]:
# Standardized values are centred on zero, so below-average temperatures come out negative.
hvac['ScaledTemp'].head()

0   -1.293272
1    0.048732
2    0.719733
3   -0.622270
4    0.853934
Name: ScaledTemp, dtype: float64

## Skalieren mit dem `MinMaxScaler`

In [15]:
# Scaler that linearly maps each feature onto the interval [0, 1] (the default range, stated explicitly).
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [16]:
# Double brackets keep ActualTemp as a one-column DataFrame, i.e. the 2-D
# (n_samples, n_features) input the scaler works on.
actual_temp_2d = hvac[['ActualTemp']]
temp_minmax = min_max_scaler.fit_transform(actual_temp_2d)

In [17]:
# Min-max scaled temperatures as a 2-D array with a single column.
temp_minmax

array([[0.12],
       [0.52],
       [0.72],
       ...,
       [0.56],
       [0.32],
       [0.44]])