In [1]:
from semForms.automl_eval.auto_example import handle_transforms
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import statistics
import numpy

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [2]:
from sklearn.datasets import fetch_openml

# Download the OpenML dataset named 'houses' (version 1).
# as_frame=True is requested explicitly so that 'data' is a pandas DataFrame
# and 'target' a pandas Series regardless of the installed scikit-learn's
# default; the .to_frame() call below requires a Series.
dataset      = fetch_openml('houses', version=1, as_frame=True)
X            = dataset['data']
# Keep the target as a single-column DataFrame.
target       = dataset['target'].to_frame()

print(X)
print(target)
print(dataset['target_names'])

       median_income  housing_median_age  total_rooms  total_bedrooms  \
0             8.3252                41.0        880.0           129.0   
1             8.3014                21.0       7099.0          1106.0   
2             7.2574                52.0       1467.0           190.0   
3             5.6431                52.0       1274.0           235.0   
4             3.8462                52.0       1627.0           280.0   
...              ...                 ...          ...             ...   
20635         1.5603                25.0       1665.0           374.0   
20636         2.5568                18.0        697.0           150.0   
20637         1.7000                17.0       2254.0           485.0   
20638         1.8672                18.0       1860.0           409.0   
20639         2.3886                16.0       2785.0           616.0   

       population  households  latitude  longitude  
0           322.0       126.0     37.88    -122.23  
1          2401.0

In [3]:
# Baseline model: a random forest regressor with a fixed random_state
# (any other estimator, e.g. an AutoML one, could be used instead).
estimator = RandomForestRegressor(random_state=1908)

# 3-fold cross-validated r2 on the original, untransformed data.
scores = cross_val_score(
    estimator,
    X,
    numpy.ravel(target),
    cv=3,
    scoring='r2',
)
print("Averaged r2 score on original data:  ", statistics.mean(scores), sep="")

Averaged r2 score on original data:  0.5814812832363954


In [4]:
import requests
import json

# Load the request payload from disk. JSON files are UTF-8 by convention, so
# the encoding is given explicitly instead of relying on the platform default.
with open('example/test.json', encoding='utf-8') as f:
    data = json.load(f)

# POST the payload to the service on localhost:4567.
# - timeout: without one, requests waits indefinitely if the service is not
#   responding. NOTE(review): 120 s is a generous guess; adjust to the
#   service's real response time.
# - raise_for_status: fail loudly on an HTTP error instead of trying to parse
#   an error page as JSON.
response = requests.post('http://localhost:4567/index', json=data, timeout=120)
response.raise_for_status()
json_example = response.json()

json_example

[{'node_number': 17,
  'json_file': 'test.py',
  'code': "lambda df: (df[ 'population' ] / df[ 'households' ])",
  'expr_name': 'expr_12',
  'writtenFields': ['popdf'],
  'csvfiles': ['houses.csv'],
  'fields': ['total_bedrooms', 'total_rooms', 'population', 'households'],
  'source_code': "houses_df['population' ] / houses_df['households']",
  'source_file': 'turtle14001999222773221790.py'},
 {'node_number': 14,
  'json_file': 'test.py',
  'code': "lambda df: (df[ 'total_bedrooms' ] / df[ 'total_rooms' ])",
  'expr_name': 'expr_13',
  'writtenFields': ['beds_to_total'],
  'csvfiles': ['houses.csv'],
  'fields': ['total_bedrooms', 'total_rooms', 'population', 'households'],
  'source_code': "houses_df['total_bedrooms'] / houses_df['total_rooms']",
  'source_file': 'turtle14001999222773221790.py'}]

In [5]:
# Analyze the transforms described in json_example and, where applicable,
# wrap them as scikit-learn function transformers usable as pipeline steps.
# NOTE(review): the meaning of the "both" mode argument is defined by
# handle_transforms and is not visible here - confirm against its docs.
transforms_suggested, correlation = handle_transforms(
    "both",
    json_example,
    target,
    X,
    'houses.csv',
)

------------------------------------------------------
Dataset columns (X): ['median_income', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'latitude', 'longitude']
Dataset columns (Y): ['median_house_value']
correlation matrix:
                    median_income  housing_median_age  total_rooms  \
median_income            1.000000            0.119034     0.198050   
housing_median_age       0.119034            1.000000     0.361262   
total_rooms              0.198050            0.361262     1.000000   
total_bedrooms           0.008093            0.320485     0.929893   
population               0.004834            0.296244     0.857126   
households               0.013033            0.302916     0.918484   
latitude                 0.079809            0.011173     0.036100   
longitude                0.015176            0.108197     0.044568   
expr_12                  0.018766            0.013191     0.024581   

                    total_bedroom

In [6]:
# Inspect the suggested transforms: a list of (name, FunctionTransformer)
# tuples, i.e. the steps a scikit-learn Pipeline can be built from
transforms_suggested

[('expr_12',
  FunctionTransformer(func=<function wrapper_func.<locals>.df_func at 0x7f6bf85d6b80>)),
 ('expr_13',
  FunctionTransformer(func=<function wrapper_func.<locals>.df_func at 0x7f6ad3c05430>))]

In [7]:
# Build the full pipeline: the suggested transform steps followed by the
# estimator as the final step.
# A new list is built instead of appending to transforms_suggested in place,
# so re-running this cell cannot add a second 'estimator' step (duplicate
# step names are rejected by scikit-learn) and transforms_suggested keeps
# containing only the transforms.
pipeline = Pipeline([*transforms_suggested, ('estimator', estimator)])

In [8]:
# Cross-validate the pipeline (function transformers + estimator) on the
# original data, using the same 3-fold r2 scoring; the augmentations are
# applied by the pipeline's transformer steps.
scores = cross_val_score(
    pipeline,
    X,
    numpy.ravel(target),
    cv=3,
    scoring='r2',
)
print("Averaged r2 score with augmentations on original data:  ", statistics.mean(scores), sep="")

Averaged r2 score with augmentations on original data:  0.6524868110538516
