In [1]:
import numpy as np
import pandas as pd
from src.datasets_generator import DatasetsGenerator

## 0. Get Some Data

In [2]:
# Fetch the demo data used by every section below. Judging by the previews that
# follow, x holds the feature columns X_1..X_5 and y holds a single column Y_1.
x, y = DatasetsGenerator.get_generic_regression_dataset()

In [3]:
# Number of rows in the feature frame.
print(len(x))

2000


In [4]:
# Peek at the first three rows of the feature frame.
x.head(3)

Unnamed: 0,X_1,X_2,X_3,X_4,X_5
0,-0.660707,-2.165475,-0.966875,-0.496182,-1.439579
1,-1.488974,0.34389,0.566556,-1.047448,1.306438
2,-1.086959,-1.010035,0.421872,-0.201817,1.787369


In [5]:
# Peek at the first three rows of the target frame.
y.head(3)

Unnamed: 0,Y_1
0,0.172519
1,1.635483
2,0.037336


## 1. pandas.DataFrame.query

In [6]:
# Independent copy for the query examples, so x itself stays as loaded.
x1 = x.copy(deep=True)

#### 1.1. filtering based on columns only

In [7]:
# Inside the query string, bare names such as X_1 and X_2 refer to columns of
# x1; the expression keeps only the rows where it evaluates to True.
temp = x1.query('X_1 >= X_2')
temp.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5
0,-0.660707,-2.165475,-0.966875,-0.496182,-1.439579
3,0.851111,-1.646772,0.489804,0.228755,0.242545
7,-0.090197,-0.157473,-1.794185,1.775847,-0.11373
8,0.995375,0.038744,0.724757,-0.081434,-0.067969
9,0.975762,0.345924,-0.321714,0.455418,1.289285


#### 1.2. filtering based on transformations of columns

In [8]:
# Arithmetic on columns is allowed inside the expression. Comparing the squares
# keeps the rows where |X_1| >= |X_2|, regardless of sign.
temp = x1.query('X_1**2 >= X_2**2')
temp.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5
1,-1.488974,0.34389,0.566556,-1.047448,1.306438
2,-1.086959,-1.010035,0.421872,-0.201817,1.787369
8,0.995375,0.038744,0.724757,-0.081434,-0.067969
9,0.975762,0.345924,-0.321714,0.455418,1.289285
11,0.595217,0.568308,-0.159772,1.572464,2.136379


#### 1.3. filtering based on a variable from the surrounding Python namespace

In [9]:
# The @ prefix tells query to look `threshold` up among the Python variables in
# scope rather than among the columns of x1.
threshold = 1
temp = x1.query('X_1 >= @threshold')
temp.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5
12,1.401164,0.640416,-0.948273,-0.612452,-0.056798
13,1.439026,0.242398,-0.575685,-0.022274,-0.062724
24,1.089922,0.918648,-0.192901,-0.614372,-0.722392
27,1.618858,0.306108,-0.039718,0.583851,-0.49469
32,1.624671,0.508565,1.075318,0.154416,-1.827894


## 2. pandas.DataFrame.assign

In [10]:
# Independent copy for the assign example, so x itself stays as loaded.
x2 = x.copy(deep=True)
x2.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5
0,-0.660707,-2.165475,-0.966875,-0.496182,-1.439579
1,-1.488974,0.34389,0.566556,-1.047448,1.306438
2,-1.086959,-1.010035,0.421872,-0.201817,1.787369
3,0.851111,-1.646772,0.489804,0.228755,0.242545
4,0.163805,0.195555,-0.003411,-0.029576,-0.566782


In [11]:
# Callable form of assign: pandas calls the lambda with the frame being
# assigned to and stores the returned Series under the keyword name.
x2 = x2.assign(
    calculate_col=lambda frame: (
        frame["X_1"] + frame["X_2"] ** 3 + 3 * frame["X_4"] - 20 * frame["X_5"]
    )
)
x2.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,calculate_col
0,-0.660707,-2.165475,-0.966875,-0.496182,-1.439579,16.487793
1,-1.488974,0.34389,0.566556,-1.047448,1.306438,-30.719404
2,-1.086959,-1.010035,0.421872,-0.201817,1.787369,-38.470209
3,0.851111,-1.646772,0.489804,0.228755,0.242545,-7.779338
4,0.163805,0.195555,-0.003411,-0.029576,-0.566782,11.418186


## 3. pandas.DataFrame.pipe

In [12]:
# Independent copy for the pipe example, so x itself stays as loaded.
x3 = x.copy(deep=True)

In [13]:
def square_and_shift(data: pd.DataFrame, col: str, shift: float) -> pd.DataFrame:
    """Return a copy of ``data`` with ``col`` replaced by ``col**2 + shift``.

    The input frame is left untouched, so the function is safe to use in a
    ``DataFrame.pipe`` chain and the calling cell can be re-run without the
    transformation being applied a second time.

    Args:
        data: Frame containing the column to transform.
        col: Label of the column to transform.
        shift: Constant added to every value after squaring.

    Returns:
        A new DataFrame in which only ``col`` differs from ``data``.
    """
    # Copy first: assigning into `data` directly would modify the caller's frame.
    result = data.copy()
    result[col] = result[col] ** 2 + shift
    return result

In [14]:
def sinh(data: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``data`` with ``col`` replaced by its hyperbolic sine.

    The input frame is left untouched, so the function is safe to use in a
    ``DataFrame.pipe`` chain and the calling cell can be re-run without the
    transformation being applied a second time.

    Args:
        data: Frame containing the column to transform.
        col: Label of the column to transform.

    Returns:
        A new DataFrame in which only ``col`` differs from ``data``.
    """
    # Copy first: assigning into `data` directly would modify the caller's frame.
    result = data.copy()
    result[col] = np.sinh(result[col])
    return result

In [15]:
# Preview of x3 before the pipe chain, for comparison with the result below.
x3.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5
0,-0.660707,-2.165475,-0.966875,-0.496182,-1.439579
1,-1.488974,0.34389,0.566556,-1.047448,1.306438
2,-1.086959,-1.010035,0.421872,-0.201817,1.787369
3,0.851111,-1.646772,0.489804,0.228755,0.242545
4,0.163805,0.195555,-0.003411,-0.029576,-0.566782


In [16]:
# Pipe a copy of x3 so the chain can never alter x3 itself. This keeps the cell
# idempotent: re-running it always starts from the same untouched values.
(
    x3.copy()
    .pipe(square_and_shift, col="X_1", shift=10)
    .pipe(sinh, col="X_2")
    .head()
)

Unnamed: 0,X_1,X_2,X_3,X_4,X_5
0,10.436533,-4.302024,-0.966875,-0.496182,-1.439579
1,12.217045,0.350709,0.566556,-1.047448,1.306438
2,11.18148,-1.190746,0.421872,-0.201817,1.787369
3,10.724389,-2.498765,0.489804,0.228755,0.242545
4,10.026832,0.196804,-0.003411,-0.029576,-0.566782
