In [2]:
import pandas as pd
import numpy as np
import sklearn 
import warnings
warnings.filterwarnings('ignore')

Jupyter notebook inspirowany: https://towardsdatascience.com/feature-engineering-for-machine-learning-3a5e293a5114

# List of Techniques
- Imputation
- Handling Outliers
- Binning
- Log Transform
- One-Hot Encoding
- Grouping Operations
- Feature Split
- Scaling
- Extracting Date

In [3]:
np.random.seed(123)
fake_data=pd.DataFrame({'num':np.random.choice([None, 3,4], 100), 
                        'cat': np.random.choice([None, 'Puma','Nike','Adidas'], 100, p=[0.92, 0.03, 0.03, 0.02])})

# Braki danych

## Może usuwanie kolumn, rekordów?

In [4]:
na_ratio_columns=fake_data.isnull().mean() 
na_ratio_columns

cat    0.93
num    0.30
dtype: float64

In [5]:
data_dropped_columns=fake_data[fake_data.columns[na_ratio_columns < 0.9]]  #usuwamy kolumny gdzie braków więcej niż 90%
data_dropped_columns.shape

(100, 1)

In [6]:
data_drop_records=fake_data.loc[fake_data.isnull().mean(axis=1) < 0.9]

In [7]:
data_drop_records.shape

(73, 2)

# Imputation

## NA in numerical variable

In [8]:
fake_data.num.value_counts()  #uwaga nie pokazuje braków!

4    38
3    32
Name: num, dtype: int64

In [9]:
pd.DataFrame(fake_data.num).info() #tutaj widać ile braków

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 1 columns):
num    70 non-null object
dtypes: object(1)
memory usage: 872.0+ bytes


In [10]:
num_with_0=fake_data.num.fillna(0) #kiedy ma to sens?
num_with_0.value_counts()

4    38
3    32
0    30
Name: num, dtype: int64

In [11]:
fake_data.num.fillna(fake_data.num.median(), inplace=True) #fake_data.num.mean()
fake_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
cat    7 non-null object
num    100 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.6+ KB


### - kiedy uzupełniamy średnią a kiedy medianą?

## NA in categorical variable

In [12]:
# uzupełnianie w miejscu (wartością najbardziej popularną) - rzadko stosowane podejście
fake_data['cat'].fillna(fake_data['cat'].value_counts().idxmax(), inplace=True)
fake_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
cat    100 non-null object
num    100 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.6+ KB


In [13]:
# lepsze podejście
fake_data['cat'].fillna('Other', inplace=True)
fake_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
cat    100 non-null object
num    100 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.6+ KB


## Czy zaprezentowane sposoby uzupełniania Nan są dobre?

- Wnoszą nową informację (niekoniecznie zgodną z prawdą)

Pomysł - zróbmy dodatkową kolumnę gdzie 1 gdy w danym rekordzie był Nan, 0 gdy go nie było

In [14]:
fake_data["num"+"_missing"]=fake_data.num.isnull()*1 #najpierw tworzymy nową kolumnę a potem imputation

# Outliers

### najłatwiej wizualnie dokonać detekcji outliers poprzez boxplot

In [15]:
#dane jedynie z przedziału (średnia +- 3 sigma) - ma to sens gdy rozkład normalny
data=pd.DataFrame({'num':np.random.normal(2,0.4,1000)})
factor = 3
upper_lim = data['num'].mean () + data['num'].std () * factor
lower_lim = data['num'].mean () - data['num'].std () * factor

data = data[(data['num'] < upper_lim) & (data['num'] > lower_lim)]
data.shape

(996, 1)

In [16]:
#usuwanie na podstawie skrajnych percentyli
fake_data_numeric=pd.DataFrame({'num':np.random.exponential(size=100)})
print(fake_data_numeric)
upper_lim = fake_data_numeric['num'].quantile(.95)
lower_lim = fake_data_numeric['num'].quantile(.05)

data_percentile = fake_data_numeric[(fake_data_numeric['num'] < upper_lim) & (fake_data_numeric['num'] > lower_lim)]
data_percentile.shape

         num
0   0.215517
1   0.209738
2   0.430976
3   0.955282
4   0.781793
5   0.748734
6   2.032030
7   1.975092
8   1.865612
9   1.212179
10  0.498321
11  0.313813
12  3.116604
13  0.046342
14  0.963338
15  1.871609
16  0.317335
17  3.851214
18  2.170890
19  0.222304
20  0.315135
21  2.275971
22  0.115693
23  0.208843
24  0.937896
25  0.159013
26  0.849907
27  0.856950
28  0.907218
29  0.288618
..       ...
70  0.243816
71  0.434732
72  0.147217
73  1.335234
74  2.971192
75  1.327528
76  0.133164
77  1.182098
78  0.567793
79  0.096637
80  7.550539
81  1.287808
82  0.974020
83  2.064339
84  0.393548
85  1.099148
86  0.631639
87  0.132332
88  0.778857
89  0.999426
90  0.641029
91  0.133842
92  0.078268
93  1.093143
94  1.819592
95  2.578260
96  0.491327
97  2.785728
98  1.713707
99  0.551720

[100 rows x 1 columns]


(90, 1)

In [17]:
# przycinanie wartości skrajnych
fake_data_numeric.loc[(fake_data_numeric['num']> upper_lim),'num'] = upper_lim
fake_data_numeric.loc[(fake_data_numeric['num'] < lower_lim),'num'] = lower_lim

In [18]:
fake_data_numeric['num'].value_counts()

0.097560    5
2.978463    5
0.216693    1
0.748734    1
0.159013    1
2.006141    1
0.551720    1
0.393548    1
1.819592    1
0.313813    1
2.025552    1
0.133164    1
1.327528    1
0.907218    1
0.888501    1
0.567793    1
0.243816    1
0.325572    1
2.578260    1
1.010703    1
0.346798    1
0.132332    1
0.417280    1
0.203310    1
0.974020    1
2.785728    1
1.539182    1
2.275971    1
1.305963    1
0.434732    1
           ..
1.209614    1
0.429134    1
0.340306    1
0.209738    1
1.327787    1
0.115693    1
0.147217    1
1.099148    1
0.138898    1
0.097608    1
0.498321    1
1.048824    1
0.955282    1
0.231010    1
0.461911    1
1.569494    1
0.317335    1
0.215517    1
1.961683    1
0.778857    1
1.165415    1
1.713707    1
0.357963    1
2.170890    1
0.648028    1
0.133842    1
0.641029    1
0.200916    1
1.093143    1
1.871609    1
Name: num, Length: 92, dtype: int64

# Binning

In [19]:
data=np.random.choice(np.linspace(0,100), 100)

In [20]:
data = pd.cut(data, bins=[0,30,70,100], labels=["Young", "Mid", "Old"])

In [21]:
data

[Mid, Mid, Mid, Old, Mid, ..., Mid, NaN, Young, Young, Mid]
Length: 100
Categories (3, object): [Young < Mid < Old]

In [22]:
geo=np.random.choice(("Poland",'Chile', 'France', 'Spain'), 100)

In [23]:
geo=pd.Series(geo)
geo

0      Chile
1     France
2     Poland
3     France
4      Spain
5     France
6      Chile
7      Chile
8     France
9     Poland
10    Poland
11    Poland
12     Chile
13    France
14     Spain
15     Spain
16     Spain
17     Chile
18     Spain
19     Spain
20    Poland
21    Poland
22     Spain
23    Poland
24    France
25    France
26    Poland
27    Poland
28    France
29     Chile
       ...  
70     Spain
71    France
72     Spain
73    France
74    France
75    France
76    Poland
77    France
78     Spain
79    Poland
80     Chile
81    Poland
82    France
83     Chile
84    France
85     Chile
86    Poland
87     Spain
88    Poland
89     Spain
90    France
91    France
92    Poland
93    France
94     Spain
95    France
96     Spain
97     Chile
98    France
99     Spain
Length: 100, dtype: object

In [24]:
#metoda z użyciem dict/defaultdict
dict_geo={'Poland': "Europe", "Chile":"South America", "France":"Europe"}
from collections import defaultdict
countries_list = [('Poland','Europe'), ('France','Europe'), ('Chile','South America')]

countries_dict = defaultdict(lambda:'Other')
for continent, country in countries_list:
     countries_dict[continent]=country

In [25]:
geo.map(countries_dict)

0     South America
1            Europe
2            Europe
3            Europe
4             Other
5            Europe
6     South America
7     South America
8            Europe
9            Europe
10           Europe
11           Europe
12    South America
13           Europe
14            Other
15            Other
16            Other
17    South America
18            Other
19            Other
20           Europe
21           Europe
22            Other
23           Europe
24           Europe
25           Europe
26           Europe
27           Europe
28           Europe
29    South America
          ...      
70            Other
71           Europe
72            Other
73           Europe
74           Europe
75           Europe
76           Europe
77           Europe
78            Other
79           Europe
80    South America
81           Europe
82           Europe
83    South America
84           Europe
85    South America
86           Europe
87            Other
88           Europe


In [26]:
geo.map(dict_geo)
#map jest szybsze niż replace

0     South America
1            Europe
2            Europe
3            Europe
4               NaN
5            Europe
6     South America
7     South America
8            Europe
9            Europe
10           Europe
11           Europe
12    South America
13           Europe
14              NaN
15              NaN
16              NaN
17    South America
18              NaN
19              NaN
20           Europe
21           Europe
22              NaN
23           Europe
24           Europe
25           Europe
26           Europe
27           Europe
28           Europe
29    South America
          ...      
70              NaN
71           Europe
72              NaN
73           Europe
74           Europe
75           Europe
76           Europe
77           Europe
78              NaN
79           Europe
80    South America
81           Europe
82           Europe
83    South America
84           Europe
85    South America
86           Europe
87              NaN
88           Europe


# Log transform

In [27]:
data_tail=pd.read_csv('./tail.csv')
data=data_tail.copy()

In [28]:
data.plot(bins=50, kind='hist', title='rozkład ceny produktu')

<matplotlib.axes._subplots.AxesSubplot at 0x7febe8160b90>

Error in callback <function post_execute at 0x7febe8165050> (for post_execute):


ValueError: matplotlib display text must have all code points < 128 or use Unicode strings

In [29]:
import matplotlib.pyplot as plt
price_after_log=np.log1p(data.values)
plt.hist(price_after_log, bins=50)
plt.title('rozkład logarytmu ceny')
plt.show()
# niektóre funkcje wymagają numpy array inne pd.DataFrame

ValueError: matplotlib display text must have all code points < 128 or use Unicode strings

<Figure size 432x288 with 1 Axes>

In [30]:
inv_transform=np.expm1(price_after_log)
plt.hist(inv_transform, bins=50)
plt.title('rozkład ceny')
plt.show()

ValueError: matplotlib display text must have all code points < 128 or use Unicode strings

<Figure size 432x288 with 1 Axes>

# Categorical variables encoding

In [35]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
values = np.array(data)

# integer encode
le = LabelEncoder()
integer_encoded = le.fit_transform(values)
print(integer_encoded)
#invert
print(le.inverse_transform(integer_encoded))

# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

# invert 
inverted = le.inverse_transform([np.argmax(onehot_encoded[0, :])])
print(inverted)

[0 0 2 0 1 1 2 0 2 1]
['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
['cold']


In [34]:
#inny sposób
df=pd.DataFrame({'weather':data})
df['weather'].factorize()

(array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2]),
 Index([u'cold', u'warm', u'hot'], dtype='object'))

## Target encoding
https://medium.com/@venkatasai.katuru/target-encoding-done-the-right-way-b6391e66c19f

In [73]:
! pip install category_encoders



In [74]:
import category_encoders
y=np.random.choice([0,1], 10)
te=category_encoders.target_encoder.TargetEncoder(data)
encoded=te.fit_transform(data,y ) #nie fit a fit_transform
encoded

Unnamed: 0,0
0,0.742886
1,0.742886
2,0.65872
3,0.742886
4,0.365121
5,0.365121
6,0.65872
7,0.742886
8,0.65872
9,0.365121


In [57]:
te.transform(['cold'])

Unnamed: 0,0
0,0.009485


# Scaling
- normalization
- standarization

In [58]:
from sklearn import preprocessing
data = data_tail.copy()
normalized = preprocessing.normalize(data)

In [59]:
from sklearn import preprocessing
names = data.columns
scaler = preprocessing.StandardScaler()

scaled_df = scaler.fit_transform(data)
scaled_df = pd.DataFrame(scaled_df, columns=names)

# Extracting info from date

In [78]:
from datetime import date

data = pd.DataFrame({'date':
['01-01-2017',
'04-12-2008',
'23-06-1988',
'25-08-1999',
'20-02-1993',
]})

#Transform string to date
data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

#Extracting Year
data['year'] = data['date'].dt.year

#Extracting Month
data['month'] = data['date'].dt.month

#Extracting passed years since the date
data['passed_years'] = date.today().year - data['date'].dt.year

#Extracting passed months since the date
data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

#Extracting the weekday name of the date
data['day_name'] = data['date'].dt.day_name()

In [79]:
data

Unnamed: 0,date,year,month,passed_years,passed_months,day_name
0,2017-01-01,2017,1,3,38,Sunday
1,2008-12-04,2008,12,12,135,Thursday
2,1988-06-23,1988,6,32,381,Thursday
3,1999-08-25,1999,8,21,247,Wednesday
4,1993-02-20,1993,2,27,325,Saturday


### z numeru PESEL można wywnioskować o wieku osoby

# Przydatne info - automatyzacja

In [80]:
#choosing categorical data
obj_df = data.select_dtypes(include=['object'])
obj_df

Unnamed: 0,day_name
0,Sunday
1,Thursday
2,Thursday
3,Wednesday
4,Saturday


In [81]:
## convert dtypes into categorical type 
# obj_df["day_name"] = obj_df["day_name"].astype('category')

In [82]:
categorical_cols=obj_df.select_dtypes(include=['object']).columns
print(categorical_cols)

obj_df[categorical_cols] = obj_df[categorical_cols].apply(lambda col: le.fit_transform(col))
obj_df[categorical_cols].head()

Index(['day_name'], dtype='object')


Unnamed: 0,day_name
0,1
1,2
2,2
3,3
4,0


# warto poczytać
pakiet category_encoders:
- https://kiwidamien.github.io/encoding-categorical-variables.html
- https://pbpython.com/categorical-encoding.html