<a href="https://colab.research.google.com/github/ulmkat/ulmkat.github.io/blob/main/Heinin_data_lightgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd

# Show where the kernel is running and which files it can see, so that a
# wrong upload location is obvious from the cell output.
print("Current working directory:", os.getcwd())
print("Files in current directory:", os.listdir())

# Path to the preprocessed dataset, relative to the working directory.
# If the file lives elsewhere, replace this with its full path,
# e.g. "/home/user/data/prepro_data_heini.csv".
file_path = "prepro_data_heini.csv"

# Fail fast with an actionable message instead of printing a hint and then
# crashing on a second, unguarded read of the same path.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"'{file_path}' not found in {os.getcwd()}. Please provide the correct path."
    )

# Read the file exactly once; `df` is the frame the following cells work on.
df = pd.read_csv(file_path)
print("File successfully read!")

# Last expression of the cell: preview the first two rows.
df.head(2)

Current working directory: /content
Files in current directory: ['.config', 'prepro_data_heini.csv', 'sample_data']
File successfully read!


Unnamed: 0,body_type,city_fuel_economy,daysonmarket,engine_cylinders,engine_displacement,frame_damaged,franchise_make,front_legroom,fuel_tank_volume,fuel_type,...,Convenience Package,Bluetooth,Sunroof/Moonroof,Adaptive Cruise Control,Android Auto,Navigation System,Heated Seats,Third Row Seating,CarPlay,Blind Spot Monitoring
0,6,27.0,55,6,1500.0,0.0,8,42.0,15.8,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,21.0,76,6,2400.0,0.0,8,41.2,18.8,5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Features: everything except the target ('daysonmarket') and 'price'.
# 'Unnamed: 0' looks like a row index that was saved into the CSV (it is
# 0, 1, ... in the preview of df) rather than a vehicle attribute, so it is
# excluded from the features as well. errors='ignore' keeps the cell working
# if the CSV is re-exported without that column.
X = (
    df.drop(columns=['price', 'daysonmarket'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Regression target.
y = df['daysonmarket']

# Hold out 25% of the rows; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=2023
)

# Last expression of the cell: shapes of the two feature matrices.
X_train.shape, X_test.shape

((119976, 49), (39993, 49))

In [4]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [5]:
# Wrap the training split (features X_train, labels y_train) in a LightGBM Dataset.
train_data = lgb.Dataset(data=X_train, label=y_train)

# Build the evaluation Dataset (features X_test, labels y_test) from the
# training Dataset, so that train_data is used as its reference.
test_data = train_data.create_valid(X_test, label=y_test)

In [6]:
# LightGBM settings for the regression model; this dictionary is what gets
# passed to lgb.train.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)


In [8]:
# Upper bound on the number of boosting rounds.
num_round = 100

# Train the booster, evaluating on test_data after each round. The
# early-stopping callback ends training once the validation score has not
# improved for 10 rounds.
bst = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.104663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2829
[LightGBM] [Info] Number of data points in the train set: 119976, number of used features: 46
[LightGBM] [Info] Start training from score 43.236072
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 35.7886


In [15]:
# Import necessary libraries for calculating mean squared error and using the LightGBM regressor.
from sklearn.metrics import mean_squared_error as mse
from lightgbm import LGBMRegressor

# Build a LightGBM regressor with RMSE as its metric and fit it on the
# training split; fit() hands back the estimator, so the two steps chain.
model = LGBMRegressor(metric='rmse').fit(X_train, y_train)

# Predictions for the training split and for the held-out split.
train_pred, test_pred = model.predict(X_train), model.predict(X_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037496 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2829
[LightGBM] [Info] Number of data points in the train set: 119976, number of used features: 46
[LightGBM] [Info] Start training from score 43.236117


In [16]:
# Report the root mean squared error of the predictions on each split.
for label, actual, predicted in (
    ("Training RMSE: ", y_train, train_pred),
    ("Validation RMSE: ", y_test, test_pred),
):
    print(label, np.sqrt(mse(actual, predicted)))

Training RMSE:  3.33922535585517
Validation RMSE:  3.242094742761743


In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, max_error

# Regression metrics for the held-out predictions (y_test vs. test_pred).
# The target is 'daysonmarket', so the error metrics are reported in days.

# Mean absolute error
mae_2 = mean_absolute_error(y_test, test_pred)

# Root mean squared error
rmse_2 = np.sqrt(mean_squared_error(y_test, test_pred))
# Legacy name kept so existing references keep working; note that the value
# stored here is the RMSE, not the MSE.
mse_2 = rmse_2

# Explained variance
exp_var_2 = explained_variance_score(y_test, test_pred)

# R^2
r2_2 = r2_score(y_test, test_pred)

# Maximum error
max_err_2 = max_error(y_test, test_pred)

print(f'Mean absolute error: {mae_2:.3f} days')
print(f'Root mean squared error: {rmse_2:.3f} days')
print(f'Explained variance: {exp_var_2:.3f}')
print(f'R^2: {r2_2:.3f}')
print(f'Maximum error: {max_err_2:.3f} days')

2.2218405393715797
Mean absolute error: 2.222 $
Root mean squared error: 3.242 $
Explained variance: 0.971
R^2: 0.971
Maximum error: 35.549 $
