### Step 1: The Dataset

ชุดข้อมูล "Boston Housing" เป็นชุดข้อมูลที่รู้จักกันดีสำหรับโจทย์ Regression ชุดข้อมูลนี้ประกอบด้วยราคาของที่อยู่อาศัยพร้อมกับฟีเจอร์ต่างๆ เช่น อัตราอาชญากรรม จำนวนห้องโดยเฉลี่ย ฯลฯ ใช้เพื่อคาดการณ์ราคาที่อยู่อาศัย

### Step 2: Read Data (1 point)
โหลด housing.csv (Boston Housing dataset) ลงใน Pandas DataFrame โดยตั้งชื่อตัวแปรของ DataFrame นั้นว่า housing_data.

In [1]:
### BEGIN SOLUTION
import pandas as pd
housing_data = pd.read_csv('housing.csv')
### END SOLUTION
print(housing_data.head())
print(housing_data.info())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639

In [2]:
assert len(housing_data) == 20640, "Incorrect number of rows"

### Step 3: Exploratory Data Analysis (EDA) and Numerical Results (3 points)
จงคำนวณค่าทางสถิติดังต่อไปนี้
- Mean ของ median_income
- Max ของ housing_median_age
- Sum ของ total_rooms

In [3]:
# Calculate summary statistics

### BEGIN SOLUTION
median_income_mean = housing_data['median_income'].mean()
housing_median_age_max = housing_data['housing_median_age'].max()
total_rooms_sum = housing_data['total_rooms'].sum()
### END SOLUTION

print(median_income_mean)
print(housing_median_age_max)
print(total_rooms_sum)

3.8706710029069766
52.0
54402150.0


In [4]:
assert round(median_income_mean, 2) == 3.87, "Incorrect mean median_income"
assert housing_median_age_max == 52, "Incorrect max housing_median_age"
assert total_rooms_sum == 54402150, "Incorrect total_rooms sum"

### Step 4: Setup Experiment and Model Training (3 points)
สร้าง Regression Model ที่สามารถทำนายราคาที่อยู่อาศัย (median_house_value) สำหรับ บรรทัด ที่ 100, 200, และ 300

In [5]:
# Import important libraries

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Remove the target variable
A = housing_data.drop('median_house_value', axis=1)
y = housing_data['median_house_value']

### สร้าง matrix X ใหม่จาก A โดยแทนที่ข้อมูล ocean_proximity จากเดิมที่เป็น categorical ให้เป็น OneHotEncoding

In [10]:
### BEGIN SOLUTION
categorical_features = ['ocean_proximity']
numeric_features = list(A.columns.difference(categorical_features))
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

X = preprocessor.fit_transform(A)
### END SOLUTION
print(X.shape)

(20640, 13)


In [11]:
assert X.shape[1] == 13, "Incorrect value of shape"

In [12]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess and train the linear regression model
### BEGIN SOLUTION
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])
### END SOLUTION

model.fit(X_train, y_train)
y_predict=model.predict(X_test.iloc[[100,200,300],:])
y_predict

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
def mycheck(yp,yt,error_percent=10):
    p=error_percent/100
    return yt*(1-p)<yp and yp < yt*(1+p)

assert len(X_train) == 16512, "Incorrect number of training samples"
assert len(X_test) == 4128, "Incorrect number of test samples"
assert mycheck(y_predict[0],134173.854), "Incorrect y_prediction[0]"
assert mycheck(y_predict[1],171135.568), "Incorrect y_prediction[1]"
assert mycheck(y_predict[2],215393.475), "Incorrect y_prediction[2]"


### Step 5: Evaluation (3 points)

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE)
### BEGIN SOLUTION
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
### END SOLUTION
mae, mse, r2

(50670.48923565551, 4908290571.346386, 0.6254382675296302)

In [19]:
#step5
def mycheck(yp,yt,error_percent=10):
    p=error_percent/100
    return yt*(1-p)<yp and yp < yt*(1+p)

assert  mycheck(mae,50670.489), "Incorrect Mean Absolute Error (MAE)"
assert mycheck(mse,4908290571.346), "Incorrect Mean Squared Error (MSE)"
assert mycheck(r2,0.625), "Incorrect R-squared (R2) score"