In [30]:
import pandas as pd
import numpy as np 
from scipy.stats import f_oneway
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV, LassoCV

In [40]:
# Load the training and test sets from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [41]:
# Regression target: the sale price column of the training frame
y_train = train.loc[:, "SalePrice"]

In [33]:
# Names of every object-dtype column in the training frame
categorical_feats = list(train.select_dtypes(include='object').columns)

In [34]:
# One-way ANOVA of SalePrice across the levels of each categorical feature.
# Each entry of anova_results is a (feature name, p-value) tuple.
anova_results = []
for col in categorical_feats:
    try:
        # One array of non-missing sale prices per level of the feature.
        groups = [
            prices.dropna().values
            for _, prices in train.groupby(col)["SalePrice"]
        ]
        # Levels left without any observation cannot take part in the test.
        groups = [g for g in groups if len(g) > 0]
        if len(groups) < 2:
            # ANOVA needs at least two groups to compare.
            continue
        _, p_val = f_oneway(*groups)
    except (ValueError, TypeError) as err:
        # Best effort: report the feature that could not be tested and move on,
        # instead of silently swallowing every exception (a bare `except:`
        # would also hide KeyboardInterrupt and genuine bugs).
        print(f"Skipping {col}: {err}")
        continue
    if np.isnan(p_val):
        # A NaN p-value carries no ranking information and would make the
        # later sort by p-value unreliable, so it is not recorded.
        continue
    anova_results.append((col, p_val))


In [35]:
# Rank the features by ascending p-value and show the five smallest
anova_sorted = sorted(anova_results, key=lambda pair: pair[1])
top5 = pd.DataFrame(data=anova_sorted[:5], columns=["Feature", "P-value"])
print(top5)

        Feature        P-value
0  Neighborhood  1.558600e-225
1     ExterQual  1.439551e-204
2   KitchenQual  3.032213e-192
3      BsmtQual  9.610615e-186
4  GarageFinish   1.199117e-93


In [36]:
# Numeric and categorical columns used as model inputs
selected_numerical = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
    'TotalBsmtSF',
]
selected_categorical = [
    'Neighborhood',
    'ExterQual',
    'KitchenQual',
    'BsmtQual',
    'GarageFinish',
]
# Full input list: numeric columns first, then categorical ones
selected_features = [*selected_numerical, *selected_categorical]

In [37]:
# Independent copies of the selected columns, so later edits do not touch train/test
X_train = train.loc[:, selected_features].copy()
X_test = test.loc[:, selected_features].copy()

In [42]:
# Derived features, built the same way for the training and test frames
def add_derived_features(features, raw):
    """Add the Age, RemodAge, TotalBath and TotalSF columns to `features` in place.

    features: frame that receives the four new columns.
    raw: frame that supplies the source columns read below.
    """
    # Years between construction / remodelling and the sale
    features["Age"] = raw["YrSold"] - raw["YearBuilt"]
    features["RemodAge"] = raw["YrSold"] - raw["YearRemodAdd"]

    # Bathroom count, with each half bath weighted 0.5
    features["TotalBath"] = (
        raw["FullBath"] + 0.5 * raw["HalfBath"] +
        raw["BsmtFullBath"] + 0.5 * raw["BsmtHalfBath"]
    )

    # Basement area plus first- and second-floor area
    features["TotalSF"] = raw["TotalBsmtSF"] + raw["1stFlrSF"] + raw["2ndFlrSF"]


add_derived_features(X_train, train)
add_derived_features(X_test, test)


In [43]:
# Register the derived columns as numeric features.
derived_feats = ["Age", "RemodAge", "TotalBath", "TotalSF"]
# Append only the names that are not listed yet, so re-running this cell does
# not duplicate entries (an in-place `+=` grows the list on every execution).
selected_numerical = selected_numerical + [
    feat for feat in derived_feats if feat not in selected_numerical
]
# Full input list: numeric columns first, then categorical ones
selected_features = selected_numerical + selected_categorical

In [44]:
# Categorical columns: fill gaps with the most frequent value of the training column
for col in selected_categorical:
    most_frequent = X_train[col].mode()[0]
    for frame in (X_train, X_test):
        frame[col] = frame[col].fillna(most_frequent)

# Numeric columns: fill gaps with the mean of the training column
for col in selected_numerical:
    train_mean = X_train[col].mean()
    for frame in (X_train, X_test):
        frame[col] = frame[col].fillna(train_mean)

In [45]:
# One-hot encode the categorical features. On the training frame the first
# level of each feature is dropped and acts as the baseline.
X_train_encoded = pd.get_dummies(X_train, columns=selected_categorical, drop_first=True)
# Encode the test frame with every level kept, then align it to the training
# columns. Dropping the first level independently on the test frame could drop
# a different level than the training baseline whenever the two frames do not
# contain the same set of levels; after the reindex those rows would silently
# be encoded as the baseline. Columns absent from the test frame are filled
# with 0, and columns unknown to the training frame are discarded.
X_test_encoded = pd.get_dummies(X_test, columns=selected_categorical)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [46]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training matrix only, then apply it to both matrices
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [47]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

# The model is trained on log1p(SalePrice)
y_train_log = np.log1p(y_train)

# 50 log-spaced candidate penalties from 1e-4 to 1e2, chosen by 5-fold CV
alphas = np.logspace(-4, 2, num=50)
ridge = RidgeCV(alphas=alphas, cv=5).fit(X_train_scaled, y_train_log)

# Error on the training data itself, measured on the log scale
y_pred = ridge.predict(X_train_scaled)
rmse = np.sqrt(mean_squared_error(y_train_log, y_pred))
print(f"Train RMSE (log scale): {rmse:.4f}")

Train RMSE (log scale): 0.1471


In [48]:
# Predict on the log1p scale, then convert back to the original target units
y_test_pred_log = ridge.predict(X_test_scaled)
y_test_pred = np.expm1(y_test_pred_log)  # inverse of the log1p applied to the target at fit time

In [49]:
# Build the submission table: test Ids alongside the predicted prices
submission = pd.DataFrame({"Id": test["Id"]})
submission["SalePrice"] = y_test_pred


In [50]:
# Write the submission to disk without the index column
submission.to_csv(path_or_buf="submission_ridge.csv", index=False)
