# In [2]:
# CASE STUDY 03 – MACHINE LEARNING
# Prediksi Harga Mobil dengan Linear Regression

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# =======================
# Simulated car dataset
# =======================
# Ten hand-written rows. Brand and EngineSize cycle with period three,
# Transmission and FuelType alternate, and the remaining columns are
# listed value by value.
data = {
    'Brand': ['Toyota', 'BMW', 'Ford'] * 3 + ['Toyota'],
    'Year': [
        2015, 2018, 2017, 2014, 2019,
        2016, 2013, 2020, 2015, 2012,
    ],
    'Transmission': ['Manual', 'Automatic'] * 5,
    'FuelType': ['Petrol', 'Diesel'] * 5,
    'Mileage': [
        50000, 30000, 40000, 60000, 25000,
        45000, 70000, 20000, 55000, 75000,
    ],
    'EngineSize': [1.6, 2.0, 1.8] * 3 + [1.6],
    'Price': [
        12000, 22000, 15000, 11000, 24000,
        16000, 10000, 26000, 14000, 9000,
    ],
}
# One DataFrame column per dict key, in insertion order.
df = pd.DataFrame(data)

# =======================
# Preprocessing and train/test split
# =======================
# The encoder and the scaler are fitted on the training rows only and then
# applied to every row. Fitting them on the full dataset before splitting
# would let statistics of the held-out rows (category vocabulary, column
# means and standard deviations) leak into the features the model trains on.
X = df.drop('Price', axis=1)
y = df['Price']

categorical_cols = ['Brand', 'Transmission', 'FuelType']

# Draw the split on row positions first. The shuffle depends only on the
# number of rows and the seed, so this selects the same rows as splitting
# the feature matrix itself with the same test_size / random_state.
row_positions = np.arange(len(X))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# One-hot encoding, vocabulary learned from the training rows.
# handle_unknown='ignore' encodes a category that appears only in held-out
# rows as all zeros (scikit-learn warns that this coincides with the dropped
# baseline category) instead of raising.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoder.fit(X.iloc[train_pos][categorical_cols])
X_encoded = pd.DataFrame(
    encoder.transform(X[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Join the numeric columns with the encoded ones; the index is reset so both
# frames line up row by row on a plain 0..n-1 index.
X_numeric = X.drop(columns=categorical_cols).reset_index(drop=True)
X_final = pd.concat([X_numeric, X_encoded], axis=1)

# Scaling: mean and standard deviation come from the training rows only.
scaler = StandardScaler()
scaler.fit(X_final.iloc[train_pos])
X_scaled = scaler.transform(X_final)

# Slice the scaled matrix with the positions drawn above.
X_train, X_test = X_scaled[train_pos], X_scaled[test_pos]

# =======================
# Model training
# =======================
# fit() returns the estimator itself, so construction and training can be
# written as a single expression.
model = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# =======================
# Model evaluation
# =======================
# R^2 and RMSE are both computed from the same held-out targets and
# predictions; RMSE is the square root of the mean squared error.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("R² Score:", r2)

# Output recorded from the notebook run:
# RMSE: 1521.6098600091316
# R² Score: 0.8552939646201869
