##Set-Up

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
path = "drive/MyDrive/Methoden: Big Data und Data Science/"

###Import Libraries

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

###Load and Prepare Data

In [23]:
data = pd.read_excel(path + "carsales_data_prediction.xlsx")

In [24]:
# Define the categorical columns
categorical_columns = ['Marke', 'Modell', 'color', 'interior_color', 'interior']

In [25]:
# Perform one-hot encoding on the categorical variables
data_one_hot_encoded = pd.get_dummies(data, columns=categorical_columns, dtype = int)

In [26]:
y = np.log(data_one_hot_encoded["price"])
X = data_one_hot_encoded.drop("price", axis=1)

##Gradient Boosting Regressor

In [70]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [83]:
carsale_gb = GradientBoostingRegressor(random_state=42,max_depth=5, n_estimators=199, learning_rate=0.7011150117432088)
carsale_gb.fit(X,y)

In [84]:
y_pred_gb = carsale_gb.predict(X_test)

In [85]:
data_gb = data.copy()
data_gb["value_difference_gb"] = np.exp(y_test) - np.exp(y_pred_gb)

In [86]:
data_gb["value_difference_gb"].min()

-25767.285879678544

In [87]:
data_gb["value_difference_gb"].max()

39291.650572152095

In [88]:
r2_test = r2_score(y_test, y_pred_gb)
mse_test = mean_squared_error(y_test, y_pred_gb)

print("Test R2 Score:", r2_test)
print("Test MSE Score:", mse_test)

Test R2 Score: 0.9842281467409936
Test MSE Score: 0.004990909701513964


###Save Model

In [89]:
import pickle

In [90]:
filename_gb_regressor = 'carsale_gb_regressor.sav'

In [91]:
pickle.dump(carsale_gb, open(filename_gb_regressor, 'wb'))

##Load Model and Score New Data

In [92]:
data_predicted = pd.get_dummies(data, columns=categorical_columns, dtype = int)
data_predicted = data_predicted.drop("price", axis=1)

In [93]:
data_predicted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24587 entries, 0 to 24586
Columns: 209 entries, warranty to interior_Vollleder
dtypes: float64(2), int64(207)
memory usage: 39.2 MB


In [94]:
data_predicted.to_csv(path+"data_predicted.csv",index=False)

In [95]:
loaded_gb = pickle.load(open(filename_gb_regressor, 'rb'))

In [96]:
loaded_gb.predict(data_predicted)

array([11.38349447, 11.80876132, 10.72341138, ..., 10.70380657,
       10.11274809, 10.11274809])