<a href="https://colab.research.google.com/github/zzc029498-max/nec-/blob/main/A1_part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml


# Fetch the OpenML "house_prices" dataset as a pandas DataFrame.
# Every later statement reads `df`, so a failed download is reported and then
# re-raised: continuing would only surface later as an unrelated NameError.
try:
    housing = fetch_openml(name="house_prices", as_frame=True)
    df = housing.frame
except Exception as e:
    print(f"Unable to load the dataset: {e}")
    raise



# Predictors kept for modelling, listed one per line.
features_to_use = [
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'GrLivArea',
    'FullBath',
    'BedroomAbvGr',
    'MSZoning',
    'Street',
    'LotShape',
    'Neighborhood',
    'BldgType',
]
target_variable = 'SalePrice'

# Narrow the frame to the chosen predictors plus the target; `.copy()` gives
# an independent frame rather than a view of the fetched data.
selected_columns = [*features_to_use, target_variable]
df = df.loc[:, selected_columns].copy()

# Report the sample count and warn when it is below 1,000 rows.
n_samples = len(df)
print(f"Size of the original dataset: {n_samples} ")
if n_samples < 1000:
    print("Warning: The dataset contains fewer than 1,000 samples.")

# Feature matrix and target vector used by the rest of the notebook.
X = df.loc[:, features_to_use]
y = df.loc[:, target_variable]


print("\nstarting processing...\n")

# Partition the predictors by dtype: numeric columns on one side,
# object/category columns on the other.
numerical_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Summarise each group as "<label> (<count>): [<names>]".
column_groups = (
    ("Numeric features", numerical_cols),
    ("Category-type characteristics", categorical_cols),
)
for group_label, group_cols in column_groups:
    print(f"{group_label} ({len(group_cols)}): {list(group_cols)}")

# Per-column count of missing entries in the feature matrix.
missing_per_column = X.isna().sum()
print("\nMissing values in each column:")
print(missing_per_column)


# Numeric branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode. handle_unknown='ignore' is passed so that levels not seen during
# fitting do not raise at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch; columns belonging to neither
# group are passed through unchanged.
# NOTE(review): OneHotEncoder is left at its default output format here, so
# the combined result may be a scipy sparse matrix - confirm what downstream
# consumers expect.
column_branches = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_branches, remainder='passthrough')


# Hold out 20% of the rows as a test set; shuffling with a fixed seed keeps
# the partition reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
X_train_val, X_test, y_train_val, y_test = split_parts

print("\nData Splitting:")
print(f"Training/Validation Set Size: {len(X_train_val)} (80%)")
print(f"Test set size: {len(X_test)} (20%)")




# Fit the preprocessing statistics on the training/validation rows only...
print("\nApplying preprocessing to the training/validation set...")
X_train_val_processed = preprocessor.fit_transform(X_train_val)

# ...and reuse the fitted transformer, without refitting, on the test rows.
print("Applying preprocessing to the test set")
X_test_processed = preprocessor.transform(X_test)

# Natural-log transform of the target for both partitions.
y_train_val_processed, y_test_processed = (
    np.log(target) for target in (y_train_val, y_test)
)

print("\nPreprocessing completed")


def _as_dense_array(matrix):
    """Return ``matrix`` as a plain ndarray, expanding scipy sparse input.

    Anything exposing ``toarray()`` is expanded through it; everything else
    goes through ``np.asarray``.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Persist the processed splits and the output feature names.
#
# Both artifacts are written as plain numeric/unicode arrays so that a default
# `np.load` (allow_pickle=False) can read them back:
#   * the transformed matrices may be scipy sparse, which numpy would otherwise
#     store as a pickled 0-d object array;
#   * `get_feature_names_out()` yields an object-dtype array, which would be
#     pickled as well.
try:
    np.savez_compressed('preprocessed_data.npz',
                        X_train_val=_as_dense_array(X_train_val_processed),
                        y_train_val=np.asarray(y_train_val_processed),
                        X_test=_as_dense_array(X_test_processed),
                        y_test=np.asarray(y_test_processed))

    # Fixed-width unicode dtype instead of object dtype -> no pickling needed.
    feature_names = np.asarray(preprocessor.get_feature_names_out(), dtype=str)
    np.save('feature_names.npy', feature_names)

    print("\nPreprocessing files have been generated.:")
    print("- preprocessed_data.npz (include X_train_val, y_train_val, X_test, y_test)")
    print("- feature_names.npy (Includes processed feature names)")

except Exception as e:
    print(f"An error occurred while saving the file.: {e}")
    # Re-raise so the cell visibly fails instead of appearing to succeed
    # while the output files are missing or incomplete.
    raise

Size of the original dataset: 1460 

starting processing...

Numeric features (7): ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'GrLivArea', 'FullBath', 'BedroomAbvGr']
Category-type characteristics (5): ['MSZoning', 'Street', 'LotShape', 'Neighborhood', 'BldgType']

Missing values in each column:
LotArea         0
OverallQual     0
OverallCond     0
YearBuilt       0
GrLivArea       0
FullBath        0
BedroomAbvGr    0
MSZoning        0
Street          0
LotShape        0
Neighborhood    0
BldgType        0
dtype: int64

Data Splitting:
Training/Validation Set Size: 1168 (80%)
Test set size: 292 (20%)

Applying preprocessing to the training/validation set...
Applying preprocessing to the test set

Preprocessing completed

Preprocessing files have been generated.:
- preprocessed_data.npz (include X_train_val, y_train_val, X_test, y_test)
- feature_names.npy (Includes processed feature names)
