In [1]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import altair as alt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
import requests
import zipfile

url = "https://archive.ics.uci.edu/static/public/1/abalone.zip"

request = requests.get(url)
with open("../data/raw/abalone.zip", 'wb') as f:
    f.write(request.content)

with zipfile.ZipFile("../data/raw/abalone.zip", 'r') as zip_ref:
    zip_ref.extractall("../data/raw")

In [8]:
abalone = fetch_ucirepo(id=1) 

In [13]:
import pandas as pd
import pandera.pandas as pa

In [16]:
# Extract features and targets
X = abalone.data.features
y = abalone.data.targets

# Split Data (Same random_state as baseline for comparison)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=522
)

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [24]:
#This schema checks column types, and that there are no NULL values in the feature columns. 
 
schema = pa.DataFrameSchema({
    "Sex": pa.Column(str, nullable=False),
    "Length": pa.Column(float, nullable=False),
    "Diameter": pa.Column(float, nullable=False),
    "Height": pa.Column(float, nullable=False),
    "Whole_weight": pa.Column(float, nullable=False),
    "Shucked_weight": pa.Column(float, nullable=False),
    "Viscera_weight": pa.Column(float, nullable=False),
    "Shell_weight": pa.Column(float, nullable=False),
    "Rings": pa.Column(int)   
}
)

schema.validate(train_df, lazy=True)

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
2194,I,0.430,0.325,0.110,0.3675,0.1355,0.0935,0.1200,13
3996,I,0.315,0.230,0.000,0.1340,0.0575,0.0285,0.3505,6
3329,F,0.545,0.435,0.150,0.6855,0.2905,0.1450,0.2250,10
492,F,0.655,0.510,0.155,1.2895,0.5345,0.2855,0.4100,11
241,I,0.270,0.200,0.070,0.1000,0.0340,0.0245,0.0350,5
...,...,...,...,...,...,...,...,...,...
3956,F,0.515,0.395,0.140,0.6860,0.2810,0.1255,0.2200,12
154,F,0.565,0.450,0.135,0.9885,0.3870,0.1495,0.3100,12
3360,F,0.580,0.440,0.175,1.0730,0.4005,0.2345,0.3350,19
1899,M,0.575,0.450,0.130,0.7850,0.3180,0.1930,0.2265,9


In [25]:
#This checks that there are not more than 5% missing values in the target column

schema = pa.DataFrameSchema(
    {
        "Rings": pa.Column(int,
                                pa.Check(lambda s: s.isna().mean() <= 0.05,
                                    element_wise=False,
                                    error="Too many null values in 'Rings' column."),
                                nullable=True)
    }
)

schema.validate(train_df, lazy=True)

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
2194,I,0.430,0.325,0.110,0.3675,0.1355,0.0935,0.1200,13
3996,I,0.315,0.230,0.000,0.1340,0.0575,0.0285,0.3505,6
3329,F,0.545,0.435,0.150,0.6855,0.2905,0.1450,0.2250,10
492,F,0.655,0.510,0.155,1.2895,0.5345,0.2855,0.4100,11
241,I,0.270,0.200,0.070,0.1000,0.0340,0.0245,0.0350,5
...,...,...,...,...,...,...,...,...,...
3956,F,0.515,0.395,0.140,0.6860,0.2810,0.1255,0.2200,12
154,F,0.565,0.450,0.135,0.9885,0.3870,0.1495,0.3100,12
3360,F,0.580,0.440,0.175,1.0730,0.4005,0.2345,0.3350,19
1899,M,0.575,0.450,0.130,0.7850,0.3180,0.1930,0.2265,9


In [30]:
# checking that numeric features are within range; no extreme outliers

schema = pa.DataFrameSchema(
    {
        "Length": pa.Column(float, pa.Check.between(0, 1)),
        "Diameter": pa.Column(float, pa.Check.between(0, 1)),
        "Height": pa.Column(float, pa.Check.between(0, 1)),
        "Whole_weight": pa.Column(float, pa.Check.between(0, 3)),
        "Shucked_weight": pa.Column(float, pa.Check.between(0, 2)),
        "Viscera_weight": pa.Column(float, pa.Check.between(0, 1)),
        "Shell_weight": pa.Column(float, pa.Check.between(0, 0.5)),
        "Rings": pa.Column(int, pa.Check.between(0, 30))
    }
)

schema.validate(train_df, lazy=True)

SchemaErrors: {
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": null,
                "column": "Shell_weight",
                "check": "in_range(0, 0.5)",
                "error": "Column 'Shell_weight' failed element-wise validator number 0: in_range(0, 0.5) failure cases: 0.51, 0.638, 0.63, 0.58, 0.505, 0.625, 0.5235, 0.7975, 0.885, 0.725, 0.52, 0.665, 0.61, 0.558, 0.62, 0.566, 0.557, 0.59, 0.6, 0.53, 0.512, 0.515, 0.501, 0.547, 0.621, 0.553, 0.612, 0.5215, 0.578, 0.665, 0.575, 0.5085, 0.5285, 0.6095, 0.511, 0.71, 0.815, 0.57, 0.7, 0.595, 0.78, 0.58, 0.5365, 0.5655, 0.516, 0.53, 0.585, 0.555, 0.503, 0.52, 0.6745, 0.55, 0.76, 0.62, 0.6, 0.585, 0.52, 0.505, 0.85, 0.58, 0.635, 0.506, 0.54, 0.58, 0.5175, 0.5295, 0.5015, 0.57, 0.595, 0.726, 0.5855, 0.508, 0.624, 1.005, 0.54, 0.625, 0.5015, 0.5965, 0.502, 0.605, 0.545, 0.5305, 0.725, 0.528, 0.505, 0.6585, 0.69, 0.545, 0.6205, 0.512, 0.565, 0.5675, 0.52, 0.512, 0.586, 0.6855, 0.53, 0.525, 0.54, 0.565, 0.62, 0.57, 0.565, 0.657, 0.6, 0.602, 0.535, 0.885, 0.52, 0.642, 0.52, 0.51, 0.525, 0.515, 0.897, 0.545, 0.6785, 0.515"
            }
        ]
    }
}

In [None]:
#checking that the 'sex' column only has the values M, F, or I

schema = pa.DataFrameSchema(
    {
        "class": pa.Column(str, pa.Check.isin(["Benign", "Malignant"])),
        "mean_radius": pa.Column(float, pa.Check.between(5, 45), nullable=True)
    }
)

In [None]:
#checking for duplicates

schema = pa.DataFrameSchema(
    {
        "class": pa.Column(str, pa.Check.isin(["Benign", "Malignant"])),
        "mean_radius": pa.Column(float, pa.Check.between(5, 45), nullable=True)
    },
    checks=[
        pa.Check(lambda df: ~df.duplicated().any(), error="Duplicate rows found.")
    ]
)

In [None]:
#checking for empty observations

schema = pa.DataFrameSchema(
    {
        "class": pa.Column(str, pa.Check.isin(["Benign", "Malignant"])),
        "mean_radius": pa.Column(float, pa.Check.between(5, 45), nullable=True)
    },
    checks=[
        pa.Check(lambda df: ~df.duplicated().any(), error="Duplicate rows found."),
        pa.Check(lambda df: ~(df.isna().all(axis=1)).any(), error="Empty rows found.")
    ]
)