In [10]:
# Random Forest regression baseline: fits a model for `selling_price` on the training split and exports test-set predictions to CSV.


import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Label used both in the output file name and to tag the exported rows.
dataname = "randomforest_regression"

# Relative paths to the dataset splits.
TRAIN_PATH = "../../Datasets/Vehice dataset/Downsampled/Train/train.csv"
VALID_PATH = "../../Datasets/Vehice dataset/Downsampled/Valid/valid.csv"
TEST_PATH = "../../Datasets/Vehice dataset/Downsampled/Test/test_sampled.csv"

# The rows that receive predictions come from the sampled test split.
DATA_PATH = TEST_PATH

# Destination of the evaluation CSV.
OUTPUT_PATH = f"../../Datasets/Evaluations/Regression/{dataname}.csv"




In [11]:
# The sampled test file is loaded twice: df_samples keeps the raw values and is
# what gets exported later, df_test is the copy that is cleaned and scored.
# The 'prediction' and 'prompt' columns stored in that file are discarded.
df_samples = pd.read_csv(DATA_PATH).drop(columns=['prediction', 'prompt'])
df_test = pd.read_csv(DATA_PATH).drop(columns=['prediction', 'prompt'])
df_train = pd.read_csv(TRAIN_PATH)

print(df_samples.head())

                                    name  year  selling_price  km_driven  \
0     Ford Figo Aspire 1.5 TDCi Titanium  2017         670000      70000   
1  Mahindra Scorpio VLX 2WD AIRBAG BSIII  2012         525000     120000   
2                 Maruti Swift Dzire VDI  2014         438999      81000   
3              Ford Figo Diesel Titanium  2010         144000      50000   
4                 Hyundai i10 Magna 1.1L  2008         185000     110000   

     fuel seller_type transmission                 owner     mileage   engine  \
0  Diesel  Individual       Manual           First Owner  25.83 kmpl  1498 CC   
1  Diesel  Individual       Manual           First Owner  12.05 kmpl  2179 CC   
2  Diesel      Dealer       Manual           First Owner   23.4 kmpl  1248 CC   
3  Diesel  Individual       Manual          Second Owner   20.0 kmpl  1399 CC   
4  Petrol  Individual       Manual  Fourth & Above Owner  19.81 kmpl  1086 CC   

   max_power               torque  seats  
0     99 bhp 

In [12]:

# Keep only fully populated rows, then append the bookkeeping columns.
# dropna() must run first: 'prediction' starts out as None and would otherwise
# cause every row to be dropped.
df_samples = df_samples.dropna(how='any').assign(
    prediction=None,
    prompt=f'{dataname}',
    prompt_name=f'{dataname}',
    model_name=f'{dataname}',
)


In [13]:
def pre_process(df, reference_year=2020):
    """Turn the unit-suffixed text columns into numbers and drop incomplete rows.

    The work is done on a copy, so the frame passed in is left untouched.

    Steps:
      * ``year`` is replaced by ``reference_year - year``.
      * ``mileage`` has ' kmpl' / ' km/kg' removed and is converted to numeric.
        The two units are not converted into each other; only the label is
        stripped.
      * ``engine`` has ' CC' removed and is converted to numeric.
      * ``max_power`` has ' bhp' removed and is converted to numeric.
      * Rows with a missing value in any column are removed.

    Args:
        df: DataFrame with a numeric ``year`` column and string-valued
            ``mileage``, ``engine`` and ``max_power`` columns.
        reference_year: Year that ``year`` is subtracted from. Defaults to
            2020, the value that used to be hard-coded.

    Returns:
        A new DataFrame; surviving rows keep their original index labels.

    Raises:
        AttributeError: If one of the text columns is not string-valued, e.g.
            when an already processed frame is passed in again.
        ValueError: If a value cannot be parsed as a number after the unit
            label has been removed.
    """
    # Copy first: assigning columns on the argument itself would silently
    # rewrite the caller's DataFrame.
    df = df.copy()

    df['year'] = reference_year - df['year']

    # column -> unit labels to strip before the numeric conversion
    unit_labels = {
        'mileage': (' kmpl', ' km/kg'),
        'engine': (' CC',),
        'max_power': (' bhp',),
    }
    for column, labels in unit_labels.items():
        values = df[column]
        for label in labels:
            values = values.str.replace(label, '', regex=False)
        df[column] = pd.to_numeric(values)

    return df.dropna(how='any')

# Clean both modelling frames with the same routine. df_samples is not
# processed, so the preview below still shows the raw values.
df_train, df_test = pre_process(df_train), pre_process(df_test)

print(df_samples.head())

                                    name  year  selling_price  km_driven  \
0     Ford Figo Aspire 1.5 TDCi Titanium  2017         670000      70000   
1  Mahindra Scorpio VLX 2WD AIRBAG BSIII  2012         525000     120000   
2                 Maruti Swift Dzire VDI  2014         438999      81000   
3              Ford Figo Diesel Titanium  2010         144000      50000   
4                 Hyundai i10 Magna 1.1L  2008         185000     110000   

     fuel seller_type transmission                 owner     mileage   engine  \
0  Diesel  Individual       Manual           First Owner  25.83 kmpl  1498 CC   
1  Diesel  Individual       Manual           First Owner  12.05 kmpl  2179 CC   
2  Diesel      Dealer       Manual           First Owner   23.4 kmpl  1248 CC   
3  Diesel  Individual       Manual          Second Owner   20.0 kmpl  1399 CC   
4  Petrol  Individual       Manual  Fourth & Above Owner  19.81 kmpl  1086 CC   

   max_power               torque  seats prediction     

In [14]:
import math

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Separate the regression target from the features.
X_train = df_train.drop('selling_price', axis=1)
y_train = df_train['selling_price']
X_test = df_test.drop('selling_price', axis=1)
y_test = df_test['selling_price']

# String-valued columns that are one-hot encoded; every other column is passed
# through to the model unchanged.
# NOTE(review): 'name' and 'torque' look like free-text fields and are likely
# high-cardinality, which would make the encoded matrix very wide — TODO confirm.
categorical_features = ['name', 'fuel', 'seller_type', 'transmission', 'owner', 'torque']

# handle_unknown='ignore' lets transform() accept categories that only occur in
# the test split instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough'
)

# The encoder is fitted on the training split only and re-used for the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train_transformed, y_train)

y_train_pred = model.predict(X_train_transformed)
y_test_pred = model.predict(X_test_transformed)

# RMSE is taken as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# this form keeps the cell runnable on old and new releases alike.
train_rmse = math.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = math.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Training RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Training R^2: {train_r2}")
print(f"Test R^2: {test_r2}")



Training RMSE: 57999.56384305948
Test RMSE: 122177.78364181585
Training R^2: 0.9949433445000825
Test R^2: 0.9715010165973164




In [15]:
# Attach the predictions by index label rather than by position. df_samples was
# filtered with dropna() on the raw columns, df_test with dropna() after the
# numeric conversion in pre_process, so the two frames are not guaranteed to
# contain the same rows. A plain array assignment would then either raise on a
# length mismatch or pair predictions with the wrong rows. Rows of df_samples
# that have no prediction are left as NaN.
df_samples['prediction'] = pd.Series(y_test_pred, index=df_test.index)

In [16]:
# Preview the first five rows, now including the 'prediction' column.
print(df_samples.head(5))

                                    name  year  selling_price  km_driven  \
0     Ford Figo Aspire 1.5 TDCi Titanium  2017         670000      70000   
1  Mahindra Scorpio VLX 2WD AIRBAG BSIII  2012         525000     120000   
2                 Maruti Swift Dzire VDI  2014         438999      81000   
3              Ford Figo Diesel Titanium  2010         144000      50000   
4                 Hyundai i10 Magna 1.1L  2008         185000     110000   

     fuel seller_type transmission                 owner     mileage   engine  \
0  Diesel  Individual       Manual           First Owner  25.83 kmpl  1498 CC   
1  Diesel  Individual       Manual           First Owner  12.05 kmpl  2179 CC   
2  Diesel      Dealer       Manual           First Owner   23.4 kmpl  1248 CC   
3  Diesel  Individual       Manual          Second Owner   20.0 kmpl  1399 CC   
4  Petrol  Individual       Manual  Fourth & Above Owner  19.81 kmpl  1086 CC   

   max_power               torque  seats  prediction    

In [17]:
# Write the tagged samples with their predictions to the evaluation CSV;
# the DataFrame index is not written.
df_samples.to_csv(path_or_buf=OUTPUT_PATH, index=False)
