## Libreta Sesión 15 - Feature Engineering

In [1]:
import pandas as pd

In [3]:
# Raw GitHub URL of the synthetic petroleum dataset used in this session.
DATASET_URL = 'https://raw.githubusercontent.com/xaramillo/ml-bootcamp-labs/refs/heads/main/data/raw/synthetic_petroleum_dataset.csv'

# Read the CSV straight from the URL into the working DataFrame.
df = pd.read_csv(DATASET_URL)

In [5]:
# DataFrame used throughout this session
df

Unnamed: 0,WellID,Time,Pressure,BottomHolePressure,Permeability,Porosity,ReservoirThickness,FlowRate,Viscosity,Density,OilProduction
0,Well_51,38,4743.820615,1822.167535,323.733974,23.831385,35.111987,120.713322,1.426282,823.699096,615.960560
1,Well_92,38,1658.964153,1998.111224,520.998050,19.530632,37.676133,2626.818320,1.549382,829.638798,1080.784762
2,Well_14,53,1168.601668,991.806085,689.692681,14.369382,48.189319,3016.739176,3.782663,830.167337,206.141281
3,Well_71,29,2816.221290,1143.541140,176.392416,26.068552,43.247330,4807.164914,4.636237,710.494304,213.842095
4,Well_60,41,4204.839601,1116.763414,280.885748,29.968939,41.992313,3287.362102,4.556048,924.836238,854.618795
...,...,...,...,...,...,...,...,...,...,...,...
9995,Well_15,13,2414.421105,1203.321826,48.048812,11.693901,38.739181,1051.438045,3.980325,689.673081,2359.433840
9996,Well_16,37,1254.605481,1947.022718,890.807407,20.672304,36.243575,3301.766740,3.009432,868.842501,2874.396629
9997,Well_79,47,2183.932855,2054.700126,798.767009,24.705318,41.639760,789.730872,3.097366,955.371978,1255.942879
9998,Well_18,26,2286.946065,1502.365694,830.715997,11.728822,39.045014,3983.360502,1.058778,608.284971,1579.148432


In [6]:
### Parte 1: Feature Engineering - Transformaciones no lineales

import numpy as np

In [24]:
# Named function
def log_base(n, base=10):
    """Return the logarithm of ``n`` in the given ``base``.

    Parameters
    ----------
    n : scalar or array-like
        Value(s) to take the logarithm of; anything ``np.log`` accepts.
    base : scalar or array-like, default 10
        Base of the logarithm.

    Returns
    -------
    The same kind of object ``np.log(n)`` would return (numpy scalar, array
    or pandas Series), holding the logarithm in the requested base.

    Raises
    ------
    ValueError
        If ``base`` is a real scalar that is not positive or is exactly 1
        (the logarithm is undefined for those bases).
    """
    if isinstance(base, (int, float, np.integer, np.floating)):
        if base <= 0 or base == 1:
            raise ValueError(f"base must be positive and different from 1, got {base!r}")
        # The dedicated routines avoid the rounding error of the change-of-base
        # quotient, e.g. np.log(1000) / np.log(10) == 2.9999999999999996.
        if base == 10:
            return np.log10(n)
        if base == 2:
            return np.log2(n)
    # General case (and non-scalar bases): change-of-base formula.
    return np.log(n) / np.log(base)

In [25]:
# Apply the log_base function to our dataframe:
# Option 1. Anonymous (lambda) functions for feature engineering
 
df['Pressure'].apply( lambda x : log_base(x) )


0       3.676128
1       3.219837
2       3.067667
3       3.449667
4       3.623749
          ...   
9995    3.382813
9996    3.098507
9997    3.339239
9998    3.359256
9999    3.490377
Name: Pressure, Length: 10000, dtype: float64

In [None]:
# Option 2: Named function

def feature_engineering_presion():
    """Return a list with the natural logarithm of every value in ``df['Pressure']``.

    Takes no arguments; reads the notebook-level ``df``.
    """
    return [np.log(presion) for presion in df['Pressure'].to_list()]

feature_engineering_presion()

In [None]:
# Option 3: numpy functions operate directly on a pandas Series
# (natural log of the column divided by ln(10), i.e. a base-10 logarithm).
# NOTE(review): the saved output of this cell does not match the base-10 values
# printed for Option 1 (3.676128, ...); it looks stale — re-run the cell to confirm.
np.log(df.Pressure) / np.log(10)

0       5.259351
1       4.606545
2       4.388839
3       4.935357
4       5.184413
          ...   
9995    4.839711
9996    4.432962
9997    4.777371
9998    4.806009
9999    4.993600
Name: Pressure, Length: 10000, dtype: float64

In [26]:
# Show the column labels currently present in df.
df.columns

Index(['WellID', 'Time', 'Pressure', 'BottomHolePressure', 'Permeability',
       'Porosity', 'ReservoirThickness', 'FlowRate', 'Viscosity', 'Density',
       'OilProduction'],
      dtype='object')

In [28]:
# Univariate feature engineering with non-linear functions.
# The numpy ufuncs are applied to each whole column at once instead of through a
# per-element Python lambda in .apply(); the element-wise math is the same.
# NOTE(review): exp of Porosity reaches magnitudes around 1e13 in the head() output
# below — consider whether this feature needs scaling before modeling.

df['log_pressure'] = np.log(df['Pressure'])            # natural logarithm
df['sqrt_permeability'] = np.sqrt(df['Permeability'])  # square root
df['exp_porosity'] = np.exp(df['Porosity'])            # e ** porosity


In [29]:
# Preview the first rows, now including the three derived columns.
df.head()

Unnamed: 0,WellID,Time,Pressure,BottomHolePressure,Permeability,Porosity,ReservoirThickness,FlowRate,Viscosity,Density,OilProduction,log_pressure,sqrt_permeability,exp_porosity
0,Well_51,38,4743.820615,1822.167535,323.733974,23.831385,35.111987,120.713322,1.426282,823.699096,615.96056,8.464598,17.992609,22378910000.0
1,Well_92,38,1658.964153,1998.111224,520.99805,19.530632,37.676133,2626.81832,1.549382,829.638798,1080.784762,7.413949,22.825382,303420900.0
2,Well_14,53,1168.601668,991.806085,689.692681,14.369382,48.189319,3016.739176,3.782663,830.167337,206.141281,7.063563,26.262001,1739977.0
3,Well_71,29,2816.22129,1143.54114,176.392416,26.068552,43.24733,4807.164914,4.636237,710.494304,213.842095,7.943151,13.281281,209617900000.0
4,Well_60,41,4204.839601,1116.763414,280.885748,29.968939,41.992313,3287.362102,4.556048,924.836238,854.618795,8.343991,16.759646,10359640000000.0


![imagen.png](attachment:imagen.png)

In [31]:
# Multivariate feature engineering: FlowRate divided by the difference between
# Pressure and BottomHolePressure.
# NOTE(review): rows where BottomHolePressure exceeds Pressure produce a negative
# index (e.g. -7.745366 in the head() output further down) — confirm that is intended.
pressure_difference = df['Pressure'].sub(df['BottomHolePressure'])
df['Productivity_Index'] = df['FlowRate'].div(pressure_difference)

In [35]:
# Inspect a random sample of the inputs next to the derived index.
# random_state pins the draw so the same 10 rows appear on every Restart & Run All.
df[['FlowRate','Pressure','BottomHolePressure','Productivity_Index']].sample(10, random_state=42)

Unnamed: 0,FlowRate,Pressure,BottomHolePressure,Productivity_Index
1174,1664.965004,1068.691963,868.377051,8.311738
1215,2067.984678,4814.288895,1909.279639,0.711869
300,1648.585366,4956.297862,2424.52333,0.651158
7374,4940.67866,2347.93249,2100.800043,19.992027
4439,3653.482552,4823.246973,1043.881606,0.966692
8289,366.041715,2323.904278,891.378347,0.255522
845,3399.545042,3331.477445,1065.181665,1.500045
9642,1169.477728,2708.624165,1136.548051,0.743907
2834,3254.410026,4156.019538,2458.138807,1.916748
8183,2690.997051,2657.90691,1370.952185,2.09098


In [36]:
# Multivariate feature: FlowRate * Density / Viscosity, evaluated left to right.
flow_times_density = df['FlowRate'].mul(df['Density'])
df['Reynolds_Index'] = flow_times_density.div(df['Viscosity'])

In [37]:
# Multivariate feature: product of Porosity and ReservoirThickness.
# NOTE(review): the column label is misspelled ("Pososity_Thicknes"); it is kept
# byte-for-byte because later cells and their outputs use this exact label.
df['Pososity_Thicknes_Index'] = df['Porosity'].mul(df['ReservoirThickness'])

In [38]:
# Preview the first rows, now including the multivariate derived columns.
df.head()

Unnamed: 0,WellID,Time,Pressure,BottomHolePressure,Permeability,Porosity,ReservoirThickness,FlowRate,Viscosity,Density,OilProduction,log_pressure,sqrt_permeability,exp_porosity,Productivity_Index,Reynolds_Index,Pososity_Thicknes_Index
0,Well_51,38,4743.820615,1822.167535,323.733974,23.831385,35.111987,120.713322,1.426282,823.699096,615.96056,8.464598,17.992609,22378910000.0,0.041317,69713.73,836.767269
1,Well_92,38,1658.964153,1998.111224,520.99805,19.530632,37.676133,2626.81832,1.549382,829.638798,1080.784762,7.413949,22.825382,303420900.0,-7.745366,1406567.0,735.838666
2,Well_14,53,1168.601668,991.806085,689.692681,14.369382,48.189319,3016.739176,3.782663,830.167337,206.141281,7.063563,26.262001,1739977.0,17.063431,662072.8,692.450751
3,Well_71,29,2816.22129,1143.54114,176.392416,26.068552,43.24733,4807.164914,4.636237,710.494304,213.842095,7.943151,13.281281,209617900000.0,2.87393,736688.7,1127.395284
4,Well_60,41,4204.839601,1116.763414,280.885748,29.968939,41.992313,3287.362102,4.556048,924.836238,854.618795,8.343991,16.759646,10359640000000.0,1.064534,667304.5,1258.465059


In [43]:
# (rows, columns) of df after adding the derived features.
df.shape

(10000, 17)

In [39]:
# Feature engineering with empirical methods (relating variables through a polynomial)

from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial expansion transformer; it is fitted in the next cell.
poly = PolynomialFeatures(degree=3)

In [40]:
# Fit the expansion on three columns and compute the polynomial terms
# (20 output columns, as the Reynolds_Poly_0..19 table further down shows).
features = poly.fit_transform(df[['FlowRate','Density','Viscosity']])

In [51]:
# Wrap the polynomial terms in a DataFrame whose columns are labelled
# 'Reynolds_Poly_<position>' after the positional index of each term.
reynolds_poly_df = pd.DataFrame(
    features,
    columns=[f'Reynolds_Poly_{position}' for position in range(features.shape[1])],
)

In [52]:
reynolds_poly_df.head()

Unnamed: 0,Reynolds_Poly_0,Reynolds_Poly_1,Reynolds_Poly_2,Reynolds_Poly_3,Reynolds_Poly_4,Reynolds_Poly_5,Reynolds_Poly_6,Reynolds_Poly_7,Reynolds_Poly_8,Reynolds_Poly_9,Reynolds_Poly_10,Reynolds_Poly_11,Reynolds_Poly_12,Reynolds_Poly_13,Reynolds_Poly_14,Reynolds_Poly_15,Reynolds_Poly_16,Reynolds_Poly_17,Reynolds_Poly_18,Reynolds_Poly_19
0,1.0,120.713322,823.699096,1.426282,14571.71,99431.45,172.171265,678480.200002,1174.827373,2.034281,1758999.0,12002700.0,20783.37,81901600.0,141817.3,245.564813,558863500.0,967704.2,1675.63539,2.901459
1,1.0,2626.81832,829.638798,1.549382,6900174.0,2179310.0,4069.945607,688300.534506,1285.427604,2.400585,18125500000.0,5724652000.0,10691010.0,1808040000.0,3376585.0,6305.90137,571040800.0,1066441.0,1991.618679,3.719424
2,1.0,3016.739176,830.167337,3.782663,9100715.0,2504398.0,11411.307234,689177.806814,3140.24315,14.308538,27454480000.0,7555117000.0,34424940.0,2079070000.0,9473295.0,43165.128037,572132900.0,2606927.0,11878.48113,54.124376
3,1.0,4807.164914,710.494304,4.636237,23108830.0,3415463.0,22287.153763,504802.15655,3294.019675,21.49469,111088000000.0,16418700000.0,107138000.0,2426667000.0,15834900.0,103328.517267,358659100.0,2340382.0,15271.854473,99.654466
4,1.0,3287.362102,924.836238,4.556048,10806750.0,3040272.0,14977.379752,855322.067085,4213.598355,20.757574,35525700000.0,9994474000.0,49236070.0,2811753000.0,13851620.0,68237.662081,791032800.0,3896888.0,19197.356646,94.572505


In [53]:
# Preview df side by side with the polynomial features (column-wise concat).
# The result is only displayed, not assigned, so df itself is left unchanged.
pd.concat([df,reynolds_poly_df],axis=1)

Unnamed: 0,WellID,Time,Pressure,BottomHolePressure,Permeability,Porosity,ReservoirThickness,FlowRate,Viscosity,Density,...,Reynolds_Poly_10,Reynolds_Poly_11,Reynolds_Poly_12,Reynolds_Poly_13,Reynolds_Poly_14,Reynolds_Poly_15,Reynolds_Poly_16,Reynolds_Poly_17,Reynolds_Poly_18,Reynolds_Poly_19
0,Well_51,38,4743.820615,1822.167535,323.733974,23.831385,35.111987,120.713322,1.426282,823.699096,...,1.758999e+06,1.200270e+07,2.078337e+04,8.190160e+07,1.418173e+05,245.564813,5.588635e+08,9.677042e+05,1675.635390,2.901459
1,Well_92,38,1658.964153,1998.111224,520.998050,19.530632,37.676133,2626.818320,1.549382,829.638798,...,1.812550e+10,5.724652e+09,1.069101e+07,1.808040e+09,3.376585e+06,6305.901370,5.710408e+08,1.066441e+06,1991.618679,3.719424
2,Well_14,53,1168.601668,991.806085,689.692681,14.369382,48.189319,3016.739176,3.782663,830.167337,...,2.745448e+10,7.555117e+09,3.442494e+07,2.079070e+09,9.473295e+06,43165.128037,5.721329e+08,2.606927e+06,11878.481130,54.124376
3,Well_71,29,2816.221290,1143.541140,176.392416,26.068552,43.247330,4807.164914,4.636237,710.494304,...,1.110880e+11,1.641870e+10,1.071380e+08,2.426667e+09,1.583490e+07,103328.517267,3.586591e+08,2.340382e+06,15271.854473,99.654466
4,Well_60,41,4204.839601,1116.763414,280.885748,29.968939,41.992313,3287.362102,4.556048,924.836238,...,3.552570e+10,9.994474e+09,4.923607e+07,2.811753e+09,1.385162e+07,68237.662081,7.910328e+08,3.896888e+06,19197.356646,94.572505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Well_15,13,2414.421105,1203.321826,48.048812,11.693901,38.739181,1051.438045,3.980325,689.673081,...,1.162388e+09,7.624487e+08,4.400337e+06,5.001154e+08,2.886327e+06,16657.920546,3.280423e+08,1.893238e+06,10926.482486,63.060244
9996,Well_16,37,1254.605481,1947.022718,890.807407,20.672304,36.243575,3301.766740,3.009432,868.842501,...,3.599475e+10,9.471829e+09,3.280781e+07,2.492462e+09,8.633202e+06,29903.038357,6.558782e+08,2.271782e+06,7868.826805,27.255452
9997,Well_79,47,2183.932855,2054.700126,798.767009,24.705318,41.639760,789.730872,3.097366,955.371978,...,4.925353e+08,5.958415e+08,1.931749e+06,7.208155e+08,2.336921e+06,7576.421380,8.720020e+08,2.827076e+06,9165.528325,29.715121
9998,Well_18,26,2286.946065,1502.365694,830.715997,11.728822,39.045014,3983.360502,1.058778,608.284971,...,6.320462e+10,9.651755e+09,1.679980e+07,1.473886e+09,2.565439e+06,4465.390726,2.250719e+08,3.917591e+05,681.894111,1.186902


In [59]:
## tsfresh
!pip install tsfresh

Collecting tsfresh
  Downloading tsfresh-0.21.0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting stumpy>=1.7.2 (from tsfresh)
  Downloading stumpy-1.13.0-py3-none-any.whl.metadata (28 kB)
Collecting cloudpickle (from tsfresh)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading tsfresh-0.21.0-py2.py3-none-any.whl (96 kB)
Downloading stumpy-1.13.0-py3-none-any.whl (176 kB)
Downloading cloudpickle-3.1.1-py3-none-any.whl (20 kB)
Installing collected packages: cloudpickle, stumpy, tsfresh
Successfully installed cloudpickle-3.1.1 stumpy-1.13.0 tsfresh-0.21.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [60]:
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute

In [61]:
# Part 1: Extract features
# WellID is passed as the id column and Time as the sort column.
# Slow cell: the recorded run took about 1 min 9 s.

ts_feats = extract_features(df,column_id='WellID',column_sort='Time')

Feature Extraction: 100%|██████████| 10/10 [01:09<00:00,  6.94s/it]


In [62]:
# Part 2: Imputation
# The recorded log below reports that columns without any finite value are filled with zeros.

ts_feats = impute(ts_feats)

 'Pressure__fft_coefficient__attr_"real"__coeff_65'
 'Pressure__fft_coefficient__attr_"real"__coeff_66' ...
 'Pososity_Thicknes_Index__fft_coefficient__attr_"angle"__coeff_98'
 'Pososity_Thicknes_Index__fft_coefficient__attr_"angle"__coeff_99'
 'Pososity_Thicknes_Index__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [66]:
# Confirm the extracted features are held in a pandas DataFrame.
type(ts_feats)

pandas.core.frame.DataFrame

In [67]:
# Persist the extracted features as CSV.
from pathlib import Path

# Create the target folder first: to_csv raises OSError when the directory is missing.
Path('data').mkdir(parents=True, exist_ok=True)

# index=False is the documented boolean form of the original index=None: in both
# cases the index is not written to the file.
# NOTE(review): the index of ts_feats presumably holds the WellID of each row, so the
# saved file cannot be joined back to wells — confirm whether the index should be kept.
ts_feats.to_csv('data/processed_tsfresh_pretroleum_dataset.csv', index=False)