<a href="https://colab.research.google.com/github/tigureis/Notes_and_exercises_on_Data_Structuring/blob/main/Encoding_and_Filters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import plotly.express as px


In [2]:
# Notebook-wide seaborn defaults: 'talk' context, 'ticks' style, a font scale
# of 0.8, and a default figure size of 12x8.
sns.set_theme(context='talk', style='ticks', font_scale=.8, rc={'figure.figsize': (12, 8)})

Load built-in example `'tips'` from `seaborn`

In [3]:
# Load seaborn's example 'tips' dataset into a DataFrame and preview the first rows.
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# One-hot encode every non-numeric column with pandas: the numeric columns
# (total_bill, tip, size) pass through unchanged and each category gets its
# own boolean column.
pd.get_dummies(df)

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,2,False,True,False,True,False,False,False,True,False,True
1,10.34,1.66,3,True,False,False,True,False,False,False,True,False,True
2,21.01,3.50,3,True,False,False,True,False,False,False,True,False,True
3,23.68,3.31,2,True,False,False,True,False,False,False,True,False,True
4,24.59,3.61,4,False,True,False,True,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,True,False,False,True,False,False,True,False,False,True
240,27.18,2.00,2,False,True,True,False,False,False,True,False,False,True
241,22.67,2.00,2,True,False,True,False,False,False,True,False,False,True
242,17.82,1.75,2,True,False,False,True,False,False,True,False,False,True


# [OneHotEncoder](https://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.OneHotEncoder.html)

---



Creating and configuring a `OneHotEncoder`

In [5]:
# drop='if_binary': a feature with exactly two categories keeps only one output
# column; features with more categories keep one column per category.
encoder = OneHotEncoder(drop='if_binary')
encoder

Selecting Categorical Columns

In [6]:
# Treat every non-numeric column as categorical (for 'tips': sex, smoker, day,
# time) and let the encoder learn the categories of each of those columns.
categorical_columns = df.select_dtypes(exclude='number').columns
encoder.fit(df[categorical_columns])

Access the categories for each feature that the encoder was fitted on.

In [7]:
# A list of arrays, one per fitted feature, in the same order as the columns
# passed to fit(). A category's position in its array is its index for that feature.
encoder.categories_

[array(['Female', 'Male'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object),
 array(['Dinner', 'Lunch'], dtype=object)]

Access the index of the dropped category for each feature (if any).

In [8]:
# One entry per feature: the index (into categories_) of the dropped category,
# or None when no category was dropped for that feature.
encoder.drop_idx_

array([0, 0, None, 0], dtype=object)

 Access the names of the features the encoder was fitted on.

In [9]:
# Column names seen during fit(), in the order they were passed.
encoder.feature_names_in_


array(['sex', 'smoker', 'day', 'time'], dtype=object)


Get the output feature names after one-hot encoding.


In [10]:
# Names follow the pattern '<feature>_<category>'; categories that were dropped
# (see drop_idx_) do not get an output column.
encoder.get_feature_names_out()

array(['sex_Male', 'smoker_Yes', 'day_Fri', 'day_Sat', 'day_Sun',
       'day_Thur', 'time_Lunch'], dtype=object)

Creating a new DataFrame with encoded data

In [11]:
# Encode the categorical columns of `df` (the encoder returns a sparse matrix),
# convert the result to a dense NumPy array with .toarray(), and label the
# columns with the feature names generated by the encoder.
pd.DataFrame(
    data=encoder.transform(df[categorical_columns]).toarray(),
    columns=encoder.get_feature_names_out(),
)

Unnamed: 0,sex_Male,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Lunch
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
239,1.0,0.0,0.0,1.0,0.0,0.0,0.0
240,0.0,1.0,0.0,1.0,0.0,0.0,0.0
241,1.0,1.0,0.0,1.0,0.0,0.0,0.0
242,1.0,0.0,0.0,1.0,0.0,0.0,0.0


# [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) / [OrdinalEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)

# Using `Pandas`

In [12]:
# Build a separate frame so `df` keeps its original values; every column listed
# in `categorical_columns` is cast to pandas' 'category' dtype in one astype call.
df_cat = df.astype({col: 'category' for col in categorical_columns})
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [13]:
# Integer code of each row's category; a code is the position of the row's
# value within `.cat.categories`.
df_cat['smoker'].cat.codes

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
239,1
240,0
241,0
242,1


In [14]:
# The category labels in code order (position 0 corresponds to code 0, and so on).
df_cat['smoker'].cat.categories

Index(['Yes', 'No'], dtype='object')

Getting categories

In [15]:
# For each categorical column, build a {code: label} lookup from its categories.
{
    name: {code: label for code, label in enumerate(df_cat[name].cat.categories)}
    for name in categorical_columns
}

{'sex': {0: 'Male', 1: 'Female'},
 'smoker': {0: 'Yes', 1: 'No'},
 'day': {0: 'Thur', 1: 'Fri', 2: 'Sat', 3: 'Sun'},
 'time': {0: 'Lunch', 1: 'Dinner'}}

Modifying columns

In [16]:
# Replace each categorical column of df_cat with its integer category codes.
# Only columns that are still categorical are converted, so re-running this
# cell is a no-op instead of raising AttributeError (the `.cat` accessor is not
# available once a column already holds plain numeric codes).
for col in categorical_columns:
    if isinstance(df_cat[col].dtype, pd.CategoricalDtype):
        df_cat[col] = df_cat[col].cat.codes

In [17]:
# Preview df_cat; the categorical columns are shown as integer codes rather than labels.
df_cat.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,1,1,3,1,2
1,10.34,1.66,0,1,3,1,3
2,21.01,3.5,0,1,3,1,3
3,23.68,3.31,0,1,3,1,2
4,24.59,3.61,1,1,3,1,4


Using [`Factorize`](https://pandas.pydata.org/docs/reference/api/pandas.factorize.html)

In [18]:
# factorize() returns a tuple (codes, uniques). Codes are assigned in order of
# first appearance, so 'Female' (the value in the first row) gets code 0 here.
df['sex'].factorize()

(array([0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
        1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
        1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
        1, 0]),
 CategoricalIndex(['Female', 'Male'], categories=['Male', 'Female'], ordered=False, dtype='category'))

Using `OrdinalEncoder`

In [21]:
# NOTE: this rebinds `encoder`, replacing the OneHotEncoder fitted earlier.
# The OrdinalEncoder learns, per column, the categories it will map to codes.
encoder=OrdinalEncoder().fit(df[categorical_columns])

Access the categories for each feature that the encoder was fitted on.

In [22]:
# Categories learned per column; a value's ordinal code is its position in the
# corresponding array.
encoder.categories_

[array(['Female', 'Male'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object),
 array(['Dinner', 'Lunch'], dtype=object)]

Transforming expected data

In [25]:
# Overwrite the categorical columns of df_cat with the OrdinalEncoder codes
# (returned as floats). These codes follow encoder.categories_, so they can
# differ from the pandas `.cat.codes` values produced earlier.
df_cat[categorical_columns]=encoder.transform(df[categorical_columns])
df_cat.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0.0,0.0,2.0,0.0,2
1,10.34,1.66,1.0,0.0,2.0,0.0,3
2,21.01,3.5,1.0,0.0,2.0,0.0,3
3,23.68,3.31,1.0,0.0,2.0,0.0,2
4,24.59,3.61,0.0,0.0,2.0,0.0,4


# Filters

---



In [27]:
# Preview the tips data used by the filter examples that follow.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


Standard Filter

In [28]:
# Boolean-mask filter: keep only the rows whose tip is greater than 4.
df[df.tip > 4].head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
5,25.29,4.71,Male,No,Sun,Dinner,4
11,35.26,5.0,Female,No,Sun,Dinner,4
20,17.92,4.08,Male,No,Sat,Dinner,2
23,39.42,7.58,Male,No,Sat,Dinner,4
28,21.7,4.3,Male,No,Sat,Dinner,2


 Filter with Query

In [None]:
# The same filter written as a query string; column names are referenced
# directly inside the expression.
df.query('tip > 4').head()

Filter with Query and Variable

In [None]:
# `minimo_aceitavel` (Portuguese for "acceptable minimum") is the tip threshold.
# The '@' prefix lets the query string reference a Python variable.
minimo_aceitavel = 3
df.query('tip>@minimo_aceitavel')

 Multiple Conventional Filters

In [29]:
# Each condition must be wrapped in parentheses: `&` binds more tightly than
# the comparison operators, so without them the expression is grouped differently.
df[ (df.tip>2) & (df.total_bill<20)].head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
9,14.78,3.23,Male,No,Sun,Dinner,2
13,18.43,3.0,Male,No,Sun,Dinner,4
14,14.83,3.02,Female,No,Sun,Dinner,2
17,16.29,3.71,Male,No,Sun,Dinner,3
18,16.97,3.5,Female,No,Sun,Dinner,3


Multiple Filters with Query

In [52]:
# The same two conditions combined inside query(); here they are written
# without parentheses and select the same rows as the mask version.
df.query('tip>2 & total_bill<20').head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
9,14.78,3.23,Male,No,Sun,Dinner,2
13,18.43,3.0,Male,No,Sun,Dinner,4
14,14.83,3.02,Female,No,Sun,Dinner,2
17,16.29,3.71,Male,No,Sun,Dinner,3
18,16.97,3.5,Female,No,Sun,Dinner,3


Filter with Index

In [47]:
# Use the 'sex' column as the row index (labels repeat) so rows can be selected by label.
df_index=df.set_index('sex')
df_index

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,16.99,1.01,No,Sun,Dinner,2
Male,10.34,1.66,No,Sun,Dinner,3
Male,21.01,3.50,No,Sun,Dinner,3
Male,23.68,3.31,No,Sun,Dinner,2
Female,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
Male,29.03,5.92,No,Sat,Dinner,3
Female,27.18,2.00,Yes,Sat,Dinner,2
Male,22.67,2.00,Yes,Sat,Dinner,2
Male,17.82,1.75,No,Sat,Dinner,2


In [48]:
# Position-based slice: rows at positions 1 through 4 (the end position, 5, is excluded).
df_index.iloc[1:5]

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Male,10.34,1.66,No,Sun,Dinner,3
Male,21.01,3.5,No,Sun,Dinner,3
Male,23.68,3.31,No,Sun,Dinner,2
Female,24.59,3.61,No,Sun,Dinner,4


In [49]:
# Label-based selection: every row whose index label is 'Female'.
df_index.loc['Female']

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,16.99,1.01,No,Sun,Dinner,2
Female,24.59,3.61,No,Sun,Dinner,4
Female,35.26,5.00,No,Sun,Dinner,4
Female,14.83,3.02,No,Sun,Dinner,2
Female,10.33,1.67,No,Sun,Dinner,3
...,...,...,...,...,...,...
Female,10.09,2.00,Yes,Fri,Lunch,2
Female,22.12,2.88,Yes,Sat,Dinner,2
Female,35.83,4.67,No,Sat,Dinner,3
Female,27.18,2.00,Yes,Sat,Dinner,2


Filtering with Lists

In [51]:
  # Distinct values of 'day', listed in order of first appearance.
  df.day.unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [54]:
# Keep the rows whose 'day' is any of the values in the list.
df[df.day.isin(['Sat','Sun'])]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2


Filter by date

In [71]:
# Loading aircrash dataset
fatalities_dataset_path = 'https://query.data.world/s/2gfb7bmzhna6kcbpc7admwa6cexprz'
df_datetime = pd.read_csv(fatalities_dataset_path, usecols=['Date','Fatalities'], parse_dates=['Date'], index_col='Date').sort_index()
df_datetime.head()

Unnamed: 0_level_0,Fatalities
Date,Unnamed: 1_level_1
1908-09-17,1.0
1912-07-12,5.0
1913-08-06,1.0
1913-09-09,14.0
1913-10-17,30.0


Using `.loc`

In [78]:
# Slice the datetime index with date strings; a partial date such as '2000-05'
# is accepted as a bound, so the selection starts in May 2000.
df_datetime.loc['2000-05' : '2010-06-20']

Unnamed: 0_level_0,Fatalities
Date,Unnamed: 1_level_1
2000-05-02,2.0
2000-05-03,1.0
2000-05-10,6.0
2000-05-17,3.0
2000-05-21,19.0
...,...
2009-05-20,98.0
2009-05-26,4.0
2009-06-01,228.0
2009-06-07,1.0


In [79]:
# Move 'Date' from the index back into a regular column.
# NOTE(review): this rebinds `df`, which held the tips dataset in earlier cells;
# re-running those cells after this one would operate on the air-crash data.
df=df_datetime.reset_index()
df.head()

Unnamed: 0,Date,Fatalities
0,1908-09-17,1.0
1,1912-07-12,5.0
2,1913-08-06,1.0
3,1913-09-09,14.0
4,1913-10-17,30.0


Using `between`

In [80]:
# between() keeps the rows whose Date lies within the two bounds, which are
# given here as date strings.
df[df.Date.between('2000','2010-05-10')]

Unnamed: 0,Date,Fatalities
4685,2000-01-05,1.0
4686,2000-01-10,10.0
4687,2000-01-13,22.0
4688,2000-01-15,5.0
4689,2000-01-25,4.0
...,...,...
5263,2009-05-20,98.0
5264,2009-05-26,4.0
5265,2009-06-01,228.0
5266,2009-06-07,1.0


Using comparison operators (`>=` and `.le()`)

In [81]:
# A date-range filter written as two comparisons: the `>=` operator for the
# lower bound and Series.le() (the method form of `<=`) for the upper bound.
df[(df.Date>='2000-05-10') & (df.Date.le('2010-05-10'))]

Unnamed: 0,Date,Fatalities
4710,2000-05-10,6.0
4711,2000-05-17,3.0
4712,2000-05-21,19.0
4713,2000-05-25,1.0
4714,2000-05-25,1.0
...,...,...
5263,2009-05-20,98.0
5264,2009-05-26,4.0
5265,2009-06-01,228.0
5266,2009-06-07,1.0
