# Data Card for Soybean Dataset

## Dataset Information

In [55]:
from openml import datasets
from pymfe.mfe import MFE
import pandas as pd
import numpy as np

# Download the dataset
dataset = datasets.get_dataset(42)  
# dataset = datasets.get_dataset(1023)  # Uncomment this line for version v2
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute
)

print("Target attribute:", dataset.default_target_attribute)

# Converting to df
df = pd.DataFrame(X, columns=attribute_names)
df = pd.DataFrame(X, columns=attribute_names)
df['class'] = y

# Saving the dataset (in data directory)
df.to_csv('data/soybean_v1.csv', index=False)
print("\nDataset saved to 'data/soybean_v1.csv'")





Target attribute: class

Dataset saved to 'data/soybean_v1.csv'


In [60]:
import pandas as pd
import numpy as np
from pymfe.mfe import MFE

results = {}

results['nr_instances'] = df.shape[0]
results['nr_attributes'] = df.shape[1] - 1  
results['attr_to_inst'] = (df.shape[1] - 1) / df.shape[0]

# Class information
results['nr_classes'] = df['class'].nunique()
results['freq_class'] = df['class'].value_counts().max() / len(df)

# Categorical data specific information
results['avg_categories'] = df.drop('class', axis=1).nunique().mean()
results['max_categories'] = df.drop('class', axis=1).nunique().max()
results['min_categories'] = df.drop('class', axis=1).nunique().min()

# Most common values
results['most_common_value'] = df.drop('class', axis=1).mode().iloc[0].mode()[0]
results['dna_percentage'] = (df.drop('class', axis=1) == 'dna').mean().mean() * 100
results['absent_percentage'] = (df.drop('class', axis=1) == 'absent').mean().mean() * 100

# Try to extract some MFE features individually
safe_mfe_features = ['class_ent', 'mut_inf']
for feature in safe_mfe_features:
    try:
        mfe = MFE(features=[feature], random_state=42)
        X_values = df.drop('class', axis=1).values
        y_values = df['class'].values  # Convert to numpy array
        mfe.fit(X_values, y_values)
        ft = mfe.extract()
        results[ft[0][0]] = ft[1][0]
    except Exception as e:
        print(f"Couldn't extract {feature}: {str(e)}")

meta_features = pd.DataFrame(list(results.items()), columns=['Feature', 'Value'])
print(meta_features)


              Feature         Value
0        nr_instances  6.830000e+02
1       nr_attributes  3.500000e+01
2        attr_to_inst  5.124451e-02
3          nr_classes  1.900000e+01
4          freq_class  1.346999e-01
5      avg_categories  2.828571e+00
6      max_categories  7.000000e+00
7      min_categories  2.000000e+00
8   most_common_value  0.000000e+00
9      dna_percentage  0.000000e+00
10  absent_percentage  0.000000e+00
11          class_ent  3.835508e+00
12       mut_inf.mean -6.095884e-07
