In [2]:
# pandas is imported here because this is the first cell that uses it; the shared
# import cell sits further down the notebook, so a fresh-kernel top-to-bottom run
# would otherwise stop with a NameError on `pd`.
import pandas as pd

# Load the training split; the trailing expression displays (rows, columns).
train_df = pd.read_csv('my_train.csv')
train_df.shape

(1314, 81)

In [4]:
# Summarise the training frame: per-column non-null counts and dtypes.
# The output below shows missing values in e.g. LotFrontage (1075 of 1314 present)
# and Alley (84 of 1314 present).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1314 entries, 0 to 1313
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1314 non-null   int64  
 1   MSSubClass     1314 non-null   int64  
 2   MSZoning       1314 non-null   object 
 3   LotFrontage    1075 non-null   float64
 4   LotArea        1314 non-null   int64  
 5   Street         1314 non-null   object 
 6   Alley          84 non-null     object 
 7   LotShape       1314 non-null   object 
 8   LandContour    1314 non-null   object 
 9   Utilities      1314 non-null   object 
 10  LotConfig      1314 non-null   object 
 11  LandSlope      1314 non-null   object 
 12  Neighborhood   1314 non-null   object 
 13  Condition1     1314 non-null   object 
 14  Condition2     1314 non-null   object 
 15  BldgType       1314 non-null   object 
 16  HouseStyle     1314 non-null   object 
 17  OverallQual    1314 non-null   int64  
 18  OverallC

In [6]:
# Read the development split (the same file is later loaded as the validation set);
# the bare last expression shows its (rows, columns) shape.
dev_df = pd.read_csv(filepath_or_buffer="my_dev.csv")
dev_df.shape

(146, 81)

In [7]:
# Summarise the development frame: per-column non-null counts and dtypes.
# The output below shows missing values in e.g. LotFrontage (126 of 146 present)
# and Alley (7 of 146 present).
dev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             146 non-null    int64  
 1   MSSubClass     146 non-null    int64  
 2   MSZoning       146 non-null    object 
 3   LotFrontage    126 non-null    float64
 4   LotArea        146 non-null    int64  
 5   Street         146 non-null    object 
 6   Alley          7 non-null      object 
 7   LotShape       146 non-null    object 
 8   LandContour    146 non-null    object 
 9   Utilities      146 non-null    object 
 10  LotConfig      146 non-null    object 
 11  LandSlope      146 non-null    object 
 12  Neighborhood   146 non-null    object 
 13  Condition1     146 non-null    object 
 14  Condition2     146 non-null    object 
 15  BldgType       146 non-null    object 
 16  HouseStyle     146 non-null    object 
 17  OverallQual    146 non-null    int64  
 18  OverallCon

In [38]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [40]:
# Read the three CSV splits in order: training, validation (dev) and the test set
# that predictions are produced for.
train_df, validation_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("my_train.csv", "my_dev.csv", "test.csv")
)

# Targets: the model is fitted on log(SalePrice), while the validation target is
# kept in its original scale for scoring.
y_train_log = np.log(train_df["SalePrice"])
y_val = validation_df["SalePrice"]

# Feature matrices: drop the identifier everywhere, and the target where present.
X_train = train_df.drop(["Id", "SalePrice"], axis=1)
X_val = validation_df.drop(["Id", "SalePrice"], axis=1)
X_test = test_df.drop("Id", axis=1)

# Partition the training columns by dtype: object columns are handled as
# categorical, every other dtype as numerical.
num_features = X_train.select_dtypes(exclude="object").columns
cat_features = X_train.select_dtypes(include="object").columns

In [34]:
# Numeric branch: replace missing values with the column mean, then standardise.
numeric_branch = Pipeline(steps=[
    ("num_imputer", SimpleImputer(strategy="mean")),
    ("num_scaler", StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent category, then
# one-hot encode; categories not seen during fitting are ignored at transform time.
categorical_branch = Pipeline(steps=[
    ("cat_imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch. The transformer names are looked up
# again when the coefficients are labelled, so they must stay as they are.
preprocess_pipeline = ColumnTransformer(transformers=[
    ("num_pipeline", numeric_branch, num_features),
    ("cat_pipeline", categorical_branch, cat_features),
])

# Full model: preprocessing followed by ordinary least squares.
regression_model = Pipeline(steps=[
    ("preprocessing", preprocess_pipeline),
    ("linear_model", LinearRegression()),
])

# Fit on the training features against the log-transformed target.
regression_model.fit(X_train, y_train_log)

In [35]:
# Inspect the width of the design matrix the linear model was trained on.
# The preprocessor is reached through the fitted pipeline and only `transform` is
# called: Pipeline does not clone its steps, so calling `fit_transform` on
# `preprocess_pipeline` here would refit the very object the model predicts through.
fitted_preprocessor = regression_model.named_steps["preprocessing"]
X_train_transformed = fitted_preprocessor.transform(X_train)
total_features = X_train_transformed.shape[1]
print(f"Total number of features after preprocessing: {total_features}")

Total number of features after preprocessing: 286


In [36]:
# Coefficients learned by the final linear step, one per preprocessed feature.
model_coefficients = regression_model.named_steps["linear_model"].coef_

# Rebuild the feature names in the order the ColumnTransformer emits them:
# the numerical columns first, then the one-hot encoded categorical columns.
numerical_feature_names = num_features.tolist()

fitted_encoder = (
    preprocess_pipeline
    .named_transformers_["cat_pipeline"]
    .named_steps["one_hot_encoder"]
)
categorical_feature_names = fitted_encoder.get_feature_names_out(cat_features)

all_feature_names = [*numerical_feature_names, *categorical_feature_names]

# Pair every feature name with its coefficient.
feature_importance = pd.DataFrame(
    {"Feature": all_feature_names, "Coefficient": model_coefficients}
)

# Ten most positive and ten most negative coefficients.
top_positive_features = feature_importance.nlargest(n=10, columns="Coefficient")
top_negative_features = feature_importance.nsmallest(n=10, columns="Coefficient")

# Report both tables.
print("Top 10 Positive Features:", top_positive_features, sep="\n")
print("\nTop 10 Negative Features:", top_negative_features, sep="\n")

Top 10 Positive Features:
              Feature  Coefficient
126  RoofMatl_Membran     0.639415
127    RoofMatl_Metal     0.491167
100   Condition2_PosA     0.441535
123    RoofStyle_Shed     0.345296
128     RoofMatl_Roll     0.307944
131  RoofMatl_WdShngl     0.303081
247     GarageQual_Ex     0.280938
129  RoofMatl_Tar&Grv     0.276326
125  RoofMatl_CompShg     0.253371
267  MiscFeature_Gar2     0.230099

Top 10 Negative Features:
                 Feature  Coefficient
124     RoofMatl_ClyTile    -2.473988
101      Condition2_PosN    -0.731932
102      Condition2_RRAe    -0.499126
36      MSZoning_C (all)    -0.328501
252        GarageCond_Ex    -0.247543
231       Functional_Sev    -0.208653
227      Functional_Maj2    -0.205660
134  Exterior1st_BrkComm    -0.196562
270     MiscFeature_TenC    -0.174267
208         Heating_Grav    -0.167343


In [37]:
# Validation: predict in log space, then exponentiate to undo the log transform.
y_val_pred_log = regression_model.predict(X_val)
y_val_pred = np.exp(y_val_pred_log)

# RMSLE on the validation set: square root of the mean squared log error.
rmsle_val = np.sqrt(mean_squared_log_error(y_true=y_val, y_pred=y_val_pred))
print(f"RMSLE on validation set: {rmsle_val}")

# Test set: same two steps, log-space prediction followed by exponentiation.
y_test_pred_log = regression_model.predict(X_test)
y_test_pred = np.exp(y_test_pred_log)

# Pair each test Id with its predicted price.
submission_df = test_df[["Id"]].assign(SalePrice=y_test_pred)

# Write the predictions without the index column.
submission_df.to_csv(path_or_buf="prediction_part3.csv", index=False)
print("\nPredicted file 'prediction_part3.csv' created.")

RMSLE on validation set: 0.12394186786345536

Predicted file 'prediction_part3.csv' created.
