# Template Skrip Untuk Eksekusi

## 1. Load Library

In [None]:
import time
from datetime import datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

## 2. Load Dataset

In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive can be read by the cells below. This import only
# exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the training set and the public test set (in that order) from the
# mounted Drive folder into two DataFrames.
df_train_data, df_test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/dataset/analisis_prediktif/UTS/train_data.csv',
        '/content/drive/MyDrive/dataset/analisis_prediktif/UTS/public_test_data.csv',
    )
)

## 3. Definisi Fungsi Prediksi

In [None]:
def prediksi(train_df, test_df):
    """Train an XGBoost regressor on ``train_df`` and evaluate it on ``test_df``.

    The function cleans the training data, derives three extra features,
    one-hot encodes ``statezip``, fits the model on the training set and
    reports the root-mean-squared error of its predictions on the test set.

    Neither input DataFrame is modified: every step works on new frames, so
    the function can be called repeatedly with the same objects.

    Args:
        train_df: Training data. Must contain the columns ``street``,
            ``city``, ``yr_renovated``, ``price``, ``bathrooms``,
            ``bedrooms``, ``yr_built``, ``sqft_lot``, ``sqft_living``,
            ``condition``, ``view`` and ``statezip``.
        test_df: Evaluation data. Must contain the same columns as
            ``train_df`` except ``bathrooms`` and ``bedrooms``, which are
            only read from the training data here; ``price`` is required
            because it is used as the ground truth.

    Returns:
        A dict with two keys: ``"execution_time"`` (wall-clock seconds,
        including the 3-second simulated delay) and ``"RMSE"`` (root mean
        squared error on ``test_df``).

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    start_time = time.time()

    # Step 1: Drop initial unnecessary columns.
    # ``drop`` without ``inplace`` returns new frames, so the caller's
    # DataFrames stay untouched from here on.
    unused_columns = ['street', 'city', 'yr_renovated']
    train_df = train_df.drop(columns=unused_columns)
    test_df = test_df.drop(columns=unused_columns)

    # Step 2: Replace zero prices in the training set with the median of the
    # non-zero prices (test prices are the ground truth and are left alone).
    nonzero_prices = train_df.loc[train_df['price'] != 0, 'price']
    train_df['price'] = train_df['price'].replace(0, nonzero_prices.median())

    # Step 3: Remove training rows where both bathrooms and bedrooms are 0.
    no_rooms = (train_df['bathrooms'] == 0) & (train_df['bedrooms'] == 0)
    train_df = train_df.drop(index=train_df.index[no_rooms])

    # Steps 4-6: Derived features, added in a fixed order so that train and
    # test end up with identically ordered columns.
    # The year is read once so both frames use the same reference year.
    current_year = datetime.now().year

    def add_features(df):
        # Step 4: age of the house relative to the current year.
        df['house_age'] = current_year - df['yr_built']
        # Step 5: living area divided by lot area; a zero lot size becomes
        # NaN in the divisor so the ratio is NaN rather than inf.
        lot_size = df['sqft_lot']
        df['lot_living_ratio'] = df['sqft_living'] / lot_size.where(lot_size != 0)
        # Step 6: product of condition and view.
        # NOTE(review): this uses the raw 'view' value, before the +1 shift
        # applied in step 8 - confirm that this ordering is intended.
        df['quality_index'] = df['condition'] * df['view']
        return df

    train_df = add_features(train_df)
    test_df = add_features(test_df)

    # Step 7: One-hot encode 'statezip'. The encoder is fitted on the
    # training set only and reused for the test set, so both frames get the
    # same encoded columns. With handle_unknown='ignore', categories not seen
    # during fitting are encoded as all zeros.
    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    encoder.fit(train_df[['statezip']])
    encoded_columns = encoder.get_feature_names_out(['statezip'])

    def encode_statezip(df):
        encoded = pd.DataFrame(encoder.transform(df[['statezip']]), columns=encoded_columns)
        # The encoded frame has a fresh 0..n-1 index, so the source frame is
        # re-indexed the same way before joining (rows were removed in step 3).
        return df.drop(columns=['statezip']).reset_index(drop=True).join(encoded)

    train_df = encode_statezip(train_df)
    test_df = encode_statezip(test_df)

    # Step 8: Adjust 'view' column by adding 1 to each value.
    train_df['view'] = train_df['view'] + 1
    test_df['view'] = test_df['view'] + 1

    # Step 9: Drop the columns already represented by derived features and
    # split into feature matrices and targets.
    redundant_columns = ['condition', 'sqft_living', 'yr_built']
    train_df = train_df.drop(columns=redundant_columns)
    test_df = test_df.drop(columns=redundant_columns)

    X_train = train_df.drop(columns=['price'])
    y_train = train_df['price']
    X_test = test_df.drop(columns=['price'])
    y_test = test_df['price']

    model = XGBRegressor(random_state=0, n_estimators=178, learning_rate=0.1589, max_depth=3)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Calculate execution time. The sleep is a simulated delay kept from the
    # template; it is included in the reported execution time.
    time.sleep(3)
    execution_time = time.time() - start_time

    return {"execution_time": execution_time, "RMSE": rmse}

## 4. Jalankan

In [None]:
# Run the full pipeline; as the last expression of the cell, the returned
# dict (execution time and RMSE) is displayed as the cell output.
prediksi(train_df=df_train_data, test_df=df_test_data)



{'execution_time': 3.3209104537963867, 'RMSE': 176534.30426220005}