In [28]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import SimpleImputer

In [29]:
# Read test.csv into `df`, the working frame that the following cells transform.
file_path = "test.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [30]:
# Columns to impute, grouped by the fill strategy applied to them.
numeric_columns_with_missing = [
    'SalesID', 'MachineID', 'ModelID', 'datasource',
    'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter',
]
categorical_columns_with_missing = [
    'UsageBand', 'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc',
    'fiModelSeries', 'fiModelDescriptor', 'ProductSize', 'fiProductClassDesc',
    'state', 'ProductGroup', 'ProductGroupDesc', 'Drive_System',
    'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
    'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
    'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
    'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control',
    'Tire_Size', 'Coupler', 'Coupler_System', 'Grouser_Tracks',
    'Hydraulics_Flow', 'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length',
    'Thumb', 'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting',
    'Blade_Type', 'Travel_Controls', 'Differential_Type', 'Steering_Controls',
]

# Numeric gaps are filled with the column mean, categorical gaps with the
# column's most frequent value.
# NOTE(review): both imputers are fit on this frame itself, so the fill values
# are derived from this data -- confirm that matches how the training data was
# prepared.
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

_imputation_plan = (
    (numeric_imputer, numeric_columns_with_missing),
    (categorical_imputer, categorical_columns_with_missing),
)
for _imputer, _columns in _imputation_plan:
    # Fit on the selected columns and write the filled values back in place.
    df[_columns] = _imputer.fit_transform(df[_columns])

In [31]:
# Split the sale date into calendar parts, then discard the raw column.
sale_dates = pd.to_datetime(df['saledate'])
df = df.drop(columns='saledate').assign(
    sale_year=sale_dates.dt.year,
    sale_month=sale_dates.dt.month,
    sale_day=sale_dates.dt.day,
    sale_dayofweek=sale_dates.dt.dayofweek,  # pandas convention: Monday=0 ... Sunday=6
)

In [32]:
features_to_encode = [
    'UsageBand', 'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc',
    'fiModelSeries', 'fiModelDescriptor', 'ProductSize', 'fiProductClassDesc',
    'state', 'ProductGroup', 'ProductGroupDesc', 'Drive_System',
    'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
    'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
    'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
    'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control',
    'Tire_Size', 'Coupler', 'Coupler_System', 'Grouser_Tracks',
    'Hydraulics_Flow', 'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length',
    'Thumb', 'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting',
    'Blade_Type', 'Travel_Controls', 'Differential_Type', 'Steering_Controls',
]

# Report every column whose values are of more than one Python type and cast
# that whole column to str so it holds a single type.
for column in features_to_encode:
    unique_types = df[column].apply(type).unique()
    if len(unique_types) <= 1:
        # Only one type present: nothing to report or convert.
        continue
    print(f"Column '{column}' has mixed data types: {unique_types}")
    df[column] = df[column].astype(str)
            

In [33]:
features_to_encode = [
    'UsageBand', 'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc',
    'fiModelSeries', 'fiModelDescriptor', 'ProductSize', 'fiProductClassDesc',
    'state', 'ProductGroup', 'ProductGroupDesc', 'Drive_System',
    'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
    'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
    'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
    'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control',
    'Tire_Size', 'Coupler', 'Coupler_System', 'Grouser_Tracks',
    'Hydraulics_Flow', 'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length',
    'Thumb', 'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting',
    'Blade_Type', 'Travel_Controls', 'Differential_Type', 'Steering_Controls',
]

# Add an integer-coded copy of every categorical feature under the name
# "<feature>_label_encoded". A single encoder object is re-fit per column, so
# after the loop it only retains the classes of the last feature.
# NOTE(review): the encoder is fit on this frame, so the integer codes depend
# on the categories present here -- confirm they agree with the encoding the
# loaded model was trained with.
label_encoder = LabelEncoder()
for feature in features_to_encode:
    encoded_name = feature + '_label_encoded'
    df[encoded_name] = label_encoder.fit_transform(df[feature])

In [34]:
# Remove the raw categorical columns listed in features_to_encode.
df = df.drop(columns=features_to_encode)

In [35]:
# Restore the trained model persisted in 'trained_model.pkl'.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading -- only open model files from a trusted source.
loaded_model = joblib.load(filename='trained_model.pkl')


In [36]:
# Reference prices that the model's predictions are scored against.
# NOTE(review): the file name suggests these may be benchmark predictions
# rather than observed sale prices -- confirm before interpreting the score.
benchmark = pd.read_csv(filepath_or_buffer="random_forest_benchmark_test.csv")
y_bench = benchmark.loc[:, "SalePrice"]

In [37]:
# Run the loaded model on the prepared frame.
y_pred = loaded_model.predict(df)

# RMSLE: square root of the mean squared logarithmic error between the
# reference prices and the predictions.
msle = mean_squared_log_error(y_true=y_bench, y_pred=y_pred)
rmsle_score = msle ** 0.5
rmsle_score

0.46678797077379036