# Neural Network Model using TensorFlow and Keras to perform Regression

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError

# Load the data from the CSV file
data = pd.read_csv("vehicles_dataset_from_advertisement.csv")

# Select specific columns as features and target variable
features = ['model_year', 'odometer', 'condition', 'cylinders', 'fuel', 'transmission', 'type', 'paint_color']
target = 'price'

X = data[features]  # Features
y = data[target]    # Target variable

# Data Preprocessing
# Convert categorical variables into numerical representations (e.g., one-hot encoding)
X = pd.get_dummies(X, columns=['condition', 'fuel', 'transmission', 'type', 'paint_color'])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss=MeanSquaredError())

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=10, validation_split=0.2, verbose=1)

# Evaluate the model
train_loss = model.evaluate(X_train_scaled, y_train, verbose=0)
test_loss = model.evaluate(X_test_scaled, y_test, verbose=0)

print(f"Train Loss: {train_loss}")
print(f"Test Loss: {test_loss}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1031/1031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 242273632.0000 - val_loss: 261659424.0000
Epoch 2/10
[1m1031/1031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 245472096.0000 - val_loss: 256638512.0000
Epoch 3/10
[1m1031/1031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 233595712.0000 - val_loss: 249454576.0000
Epoch 4/10
[1m1031/1031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 222788656.0000 - val_loss: 240709248.0000
Epoch 5/10
[1m1031/1031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 219170768.0000 - val_loss: 230782512.0000
Epoch 6/10
[1m1031/1031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 205360720.0000 - val_loss: 219923952.0000
Epoch 7/10
[1m1031/1031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 198633472.0000 - val_loss: 208425552.0000
Epoch 8/10
[1m1031/1031[0m [32m

# Random Forest Regression

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Load the data from the CSV file
data = pd.read_csv("vehicles_dataset_from_advertisement.csv")

# Select specific columns as features and target variable
features = ['model_year', 'odometer', 'condition', 'cylinders', 'fuel', 'transmission', 'type', 'paint_color']
target = 'price'

X = data[features]  # Features
y = data[target]    # Target variable

# Data Preprocessing
# Convert categorical variables into numerical representations (e.g., one-hot encoding)
X = pd.get_dummies(X, columns=['condition', 'fuel', 'transmission', 'type', 'paint_color'])

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Train the Random Forest Regression model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Number of trees in the Random Forest model
num_trees = len(rf_model.estimators_)

# Evaluate the model
train_loss_rf = mean_squared_error(y_train, y_train_pred)
test_loss_rf = mean_squared_error(y_test, y_test_pred)

print(f"Number of trees (estimators) in Random Forest: {num_trees}")
print(f"Train Loss (Random Forest Regression): {train_loss_rf}")
print(f"Test Loss (Random Forest Regression): {test_loss_rf}")


Number of trees (estimators) in Random Forest: 100
Train Loss (Random Forest Regression): 4853161.560541999
Test Loss (Random Forest Regression): 32926363.587604124


# Linear Regression

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Load the data from the CSV file
data = pd.read_csv("vehicles_dataset_from_advertisement.csv")

# Select specific columns as features and target variable
features = ['model_year', 'odometer', 'condition', 'cylinders', 'fuel', 'transmission', 'type', 'paint_color']
target = 'price'

X = data[features]  # Features
y = data[target]    # Target variable

# Data Preprocessing
# Convert categorical variables into numerical representations (e.g., one-hot encoding)
X = pd.get_dummies(X, columns=['condition', 'fuel', 'transmission', 'type', 'paint_color'])

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

# Evaluate the model
train_loss_lr = mean_squared_error(y_train, y_train_pred)
test_loss_lr = mean_squared_error(y_test, y_test_pred)

print(f"Train Loss (Linear Regression): {train_loss_lr}")
print(f"Test Loss (Linear Regression): {test_loss_lr}")


Train Loss (Linear Regression): 50733877.99016937
Test Loss (Linear Regression): 56897539.181750946


#### Training Loss: 
This represents the error of the model on the training dataset. It is calculated as the difference between the actual target values and the predicted values of the model, typically using a loss function (e.g., mean squared error for regression problems). A lower training loss indicates that the model is fitting well to the training data.
#### Testing Loss: 
This represents the error of the model on the testing dataset, which contains data that the model has not seen during training. It is calculated in the same way as the training loss. The testing loss provides an estimate of how well the model generalizes to new, unseen data. A similar (but not necessarily identical) testing loss compared to the training loss suggests that the model is not overfitting.

# Decision Tree Regression

In [11]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("vehicles_dataset_from_advertisement.csv")

# Drop rows with missing values
data.dropna(inplace=True)

# Selecting features and target variable
X = data[['model_year', 'odometer', 'condition', 'cylinders', 'fuel', 'transmission', 'type', 'paint_color']]
y = data['price']

# One-hot encoding categorical variables
X = pd.get_dummies(X)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Decision Tree Regression model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
y_train_pred = dt_model.predict(X_train)
y_test_pred = dt_model.predict(X_test)

# Calculate Mean Squared Error
train_loss = mean_squared_error(y_train, y_train_pred)
test_loss = mean_squared_error(y_test, y_test_pred)

print("Train Loss:", train_loss)
print("Test Loss:", test_loss)


Train Loss: 539748.4444241101
Test Loss: 84770266.99811314


# Gradient Boosting Regression

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("vehicles_dataset_from_advertisement.csv")

# Drop rows with missing values
data = data.dropna()

# Select features and target variable
X = data[['model_year', 'odometer', 'condition', 'cylinders', 'fuel', 'transmission', 'type', 'paint_color']]
y = data['price']

# Perform one-hot encoding for categorical features
X = pd.get_dummies(X)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Instantiate the Gradient Boosting Regression model
gb_model = GradientBoostingRegressor(random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Make predictions
y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

# Calculate Mean Squared Error
train_loss = mean_squared_error(y_train, y_train_pred)
test_loss = mean_squared_error(y_test, y_test_pred)

print("Train Loss:", train_loss)
print("Test Loss:", test_loss)


Train Loss: 40811233.93412305
Test Loss: 39573611.107459165


# Support Vector Regression


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("vehicles_dataset_from_advertisement.csv")

# Drop rows with missing values
data = data.dropna()

# Select features and target variable
X = data[['model_year', 'odometer', 'condition', 'cylinders', 'fuel', 'transmission', 'type', 'paint_color']]
y = data['price']

# Perform one-hot encoding for categorical features
X = pd.get_dummies(X)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Instantiate the SVR model
svr_model = SVR()

# Train the model
svr_model.fit(X_train, y_train)

# Make predictions
y_train_pred = svr_model.predict(X_train)
y_test_pred = svr_model.predict(X_test)

# Calculate Mean Squared Error
train_loss = mean_squared_error(y_train, y_train_pred)
test_loss = mean_squared_error(y_test, y_test_pred)

print("Train Loss:", train_loss)
print("Test Loss:", test_loss)


Train Loss: 129060095.99088883
Test Loss: 133831685.37310076
