# Set Up

In [16]:
### Packages ###
import os
import pickle
import kagglehub
import pandas as pd 
from pmlb import fetch_data

# ### Append Path ###
# import sys
# sys.path.append('..')

# ### Import functions ###
# from utils.Auxiliary.BurbridgeDGP import *
# from utils.Auxiliary.PreprocessData import *


### Save Path ###
# Directory that receives the processed dataset pickles.
# The original location stays the default so existing runs behave the same;
# set the WGS_SAVE_PATH environment variable to redirect the output on another
# machine without editing the notebook.
save_path = os.environ.get(
    'WGS_SAVE_PATH',
    '/Users/simondn/Documents/WeightedGreedySampling/Data/processed/',
)
os.makedirs(save_path, exist_ok=True)  # exist_ok: re-running the cell is harmless

---
---

---
---

# Concrete Compression

In [4]:
### Concrete ###
# Read the spreadsheet straight from the UCI archive and relabel the response
# column as 'Y'. The trailing space inside the original column name is
# intentional: the rename key has to match the header exactly.
concrete_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls'
df_concrete = (
    pd.read_excel(concrete_url)
    .rename(columns={'Concrete compressive strength(MPa, megapascals) ': 'Y'})
)

# Concrete (CS, Flow, Slump)


In [5]:
### Set up ###
# One raw download feeds three regression tasks; each task keeps the shared
# ingredient columns and exactly one of the three measured responses as 'Y'.
slump_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data'
df_slump_base = pd.read_csv(slump_url)

# The UCI file starts with a running sample number column ('No'). It is a row
# identifier, not a mix property, so it must not be saved as a predictor.
# errors='ignore' keeps the cell working if the column is absent.
df_slump_features = df_slump_base.drop(columns=['No'], errors='ignore')

### Create Version 1: Concrete-CS ###
df_concrete_cs = (
    df_slump_features
    .drop(columns=['FLOW(cm)', 'SLUMP(cm)'])
    .rename(columns={'Compressive Strength (28-day)(Mpa)': 'Y'})
)

### Create Version 2: Concrete-Flow ###
df_concrete_flow = (
    df_slump_features
    .drop(columns=['Compressive Strength (28-day)(Mpa)', 'SLUMP(cm)'])
    .rename(columns={'FLOW(cm)': 'Y'})
)

### Create Version 3: Concrete-Slump ###
df_concrete_slump = (
    df_slump_features
    .drop(columns=['Compressive Strength (28-day)(Mpa)', 'FLOW(cm)'])
    .rename(columns={'SLUMP(cm)': 'Y'})
)

# Yacht

In [6]:
### Yacht ###
# The file is read as whitespace-delimited with no header row, so the column
# labels are supplied here; the last column is labelled 'Y' (the response).
yacht_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data'
yacht_columns = [
    'longitudinal_pos', 'prismatic_coeff', 'length_displacement_ratio',
    'beam_draught_ratio', 'length_beam_ratio', 'froude_number',
    'Y',
]
df_yacht = pd.read_csv(
    yacht_url,
    header=None,
    names=yacht_columns,
    sep=r'\s+',
)


# Housing

In [7]:
### Housing ###
# Read as whitespace-delimited with no header row; names are attached on read
# and the MEDV column is then relabelled as the response 'Y'.
housing_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
housing_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
df_housing = (
    pd.read_csv(housing_url, header=None, names=housing_columns, sep=r'\s+')
    .rename(columns={'MEDV': 'Y'})
)

# Auto MPG

In [8]:
### MPG ###
# Load the whitespace-delimited file, reading '?' as missing, then discard the
# car_name column and every row that still has a missing value.
mpg_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
mpg_column_names = [
    'Y', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name',
]
df_auto_mpg = (
    pd.read_csv(
        mpg_url,
        names=mpg_column_names,
        header=None,
        sep=r'\s+',
        na_values='?',
    )
    .drop(columns='car_name')  # removed before dropna, as in the original order
    .dropna()
)

# Wine (Red and White)

In [9]:
### Red ###
# Both wine files are semicolon-separated; 'quality' becomes the response 'Y'.
url_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
df_wine_red = pd.read_csv(url_red, sep=';').rename(columns={'quality': 'Y'})

### White ###
url_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
df_wine_white = pd.read_csv(url_white, sep=';').rename(columns={'quality': 'Y'})

# CPS

In [None]:
### CPS ###
# Skip the first 27 lines of the StatLib file, parse the remainder as
# whitespace-delimited, attach the eleven variable names and relabel WAGE as
# the response 'Y'.
# NOTE(review): read_csv is called without header=None, so the first line
# remaining after the skip is consumed as the header before the labels are
# replaced. If that line is a data row, one observation is lost; confirm
# against the file layout.
cps_url = 'http://lib.stat.cmu.edu/datasets/CPS_85_Wages'
df_cps = pd.read_csv(cps_url, sep=r'\s+', skiprows=27)
df_cps = (
    df_cps
    .set_axis(
        [
            "EDUCATION", "SOUTH", "SEX", "EXPERIENCE", "UNION", "WAGE",
            "AGE", "RACE", "OCCUPATION", "SECTOR", "MARR",
        ],
        axis=1,
    )
    .rename(columns={'WAGE': 'Y'})
)

KeyboardInterrupt: 

# NO2 and PM10

In [10]:
# Both air-pollution datasets are pulled from PMLB. return_X_y=False makes
# fetch_data return a single frame, whose 'target' column is relabelled as the
# response 'Y'.
# The previous identifiers ('529_pollen' and '560_bodyfat') fetched unrelated
# datasets that were then saved under the pm10 / no2 names; the identifiers
# below are the PMLB entries for the actual PM10 and NO2 data.

### PM10 ###
df_pm10 = fetch_data('522_pm10', return_X_y=False)
df_pm10 = df_pm10.rename(columns={'target': 'Y'})

### NO2 ###
df_no2 = fetch_data('547_no2', return_X_y=False)
df_no2 = df_no2.rename(columns={'target': 'Y'})

# QSAR 

In [11]:
### QSAR ###
# Semicolon-separated file read without a header row; the nine labels are
# supplied on read, with the last column labelled 'Y' (the response).
qsar_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00505/qsar_aquatic_toxicity.csv'
qsar_column_names = [
    'TPSA', 'SAacc', 'H050', 'MLOGP', 'RDCHI',
    'GATS1p', 'nN', 'C040',
    'Y',
]
df_qsar = pd.read_csv(qsar_url, sep=';', header=None, names=qsar_column_names)

# Body Fat

In [12]:
### Body Fat ###
# Download the Kaggle dataset, read bodyfat.csv from the download folder, drop
# the Density column when it is present, and relabel BodyFat as the response 'Y'.
download_path = kagglehub.dataset_download("fedesoriano/body-fat-prediction-dataset")
csv_file_path = os.path.join(download_path, 'bodyfat.csv')
df_bodyfat = (
    pd.read_csv(csv_file_path)
    .drop(columns=['Density'], errors='ignore')  # tolerate a file without Density
    .rename(columns={'BodyFat': 'Y'})
)

# Beer Consumption

In [13]:
### Beer Consumption ####
# Download the Kaggle dataset and read the CSV with ',' as the decimal mark.
# Rows containing any missing value are removed, the columns get English
# labels (last one is the response 'Y'), 'Y' is cast to float, and the Date
# column is dropped.
download_path = kagglehub.dataset_download("dongeorge/beer-consumption-sao-paulo")
csv_file_path = os.path.join(download_path, 'Consumo_cerveja.csv')
df_beer = (
    pd.read_csv(csv_file_path, decimal=',')
    .dropna()
    .set_axis(
        ['Date', 'Temp_Avg_C', 'Temp_Min_C', 'Temp_Max_C',
         'Precipitation_mm', 'Weekend', 'Y'],
        axis=1,
    )
    .astype({'Y': float})
    .drop(columns='Date')
)

# Save

In [23]:
### File Names ###
# Maps each output file stem to the frame built in the cells above.
# This dict has to be live code: the loop below reads it, and with the
# definition commented out the cell raises NameError on a fresh kernel.
# NOTE(review): the recorded output of this cell also lists dgp_correct,
# dgp_misspecified and dgp_low_noise, which no cell in this notebook creates;
# presumably they came from the commented-out BurbridgeDGP import. Confirm and
# restore them explicitly if they are still needed.
datasets_to_save = {
    'concrete_cs': df_concrete_cs,
    'concrete_flow': df_concrete_flow,
    'concrete_slump': df_concrete_slump,
    'yacht': df_yacht,
    'housing': df_housing,
    'mpg': df_auto_mpg,
    'concrete_4': df_concrete,
    'wine_red': df_wine_red,
    'wine_white': df_wine_white,
    # 'cps': df_cps,  # left out: the CPS download cell did not complete in the recorded run
    'no2': df_no2,
    'pm10': df_pm10,
    'qsar': df_qsar,
    'bodyfat': df_bodyfat,
    'beer': df_beer,
}

### Save datasets ###
# Each frame is written to <save_path>/<name>.pkl with pickle; the context
# manager closes the file handle even if dumping fails.
for name, dataframe in datasets_to_save.items():
    file_path = os.path.join(save_path, f"{name}.pkl")
    with open(file_path, 'wb') as file:
        pickle.dump(dataframe, file)
    print(f"Successfully saved: {name}.pkl")

Successfully saved: concrete_cs.pkl
Successfully saved: concrete_flow.pkl
Successfully saved: concrete_slump.pkl
Successfully saved: yacht.pkl
Successfully saved: housing.pkl
Successfully saved: mpg.pkl
Successfully saved: concrete_4.pkl
Successfully saved: wine_red.pkl
Successfully saved: wine_white.pkl
Successfully saved: no2.pkl
Successfully saved: pm10.pkl
Successfully saved: qsar.pkl
Successfully saved: bodyfat.pkl
Successfully saved: beer.pkl
Successfully saved: dgp_correct.pkl
Successfully saved: dgp_misspecified.pkl
Successfully saved: dgp_low_noise.pkl
