In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from category_encoders import MEstimateEncoder
from sklearn.preprocessing import MinMaxScaler,PolynomialFeatures
import regression as reg
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import SVR 

In [2]:
df = pd.read_csv('data/cleaned_cars.csv')
df = pd.get_dummies(df, columns = ['country'], drop_first=True, dtype=np.int8)
df.head()

Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,seats,brand,price,year,country_egypt,country_ksa,country_kuwait,country_oman,country_qatar,country_uae
0,2.0,4,180,205.0,8,peugeot,37955.25,2021,0,1,0,0,0,0
1,1.5,4,102,145.0,4,suzuki,26671.95,2021,0,1,0,0,0,0
2,2.3,4,420,173.0,4,ford,53460.0,2021,0,1,0,0,0,0
3,1.8,4,140,190.0,5,honda,28179.975,2021,0,1,0,0,0,0
4,1.8,4,140,190.0,5,honda,25740.45,2021,0,1,0,0,0,0


## Target encoding

In [None]:
X = df.copy()
y = X.pop('price')

X_encode = X.sample(frac=0.2, random_state=42)
y_encode = y[X_encode.index]
X = X.drop(X_encode.index)
y = y[X.index]

# Create the encoder instance. Choose m to control noise.
encoder = MEstimateEncoder(cols=["brand"], m=5.0)

# Fit the encoder on the encoding split.
encoder.fit(X_encode, y_encode)

# Encode the Zipcode column to create the final training data
X = encoder.transform(X)
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)  

## Frequency Encoding with top_speed as the target variable

In [3]:
X = df.copy()
y = X.pop('top_speed')

X_encode = X.sample(frac=0.2, random_state=42)
y_encode = y[X_encode.index]

X = X.drop(X_encode.index)
y = y[X.index]

brand_counts = X_encode['brand'].value_counts()  
X['brand'] = X['brand'].map(brand_counts) 


X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

X.dropna(inplace=True)
y = y[X.index]


engine_capacity    0
cylinder           0
horse_power        0
seats              0
brand              0
price              0
year               0
country_egypt      0
country_ksa        0
country_kuwait     0
country_oman       0
country_qatar      0
country_uae        0
dtype: int64

In [None]:
X.head()

In [None]:
y.head()

In [4]:
# Split the dataset into training (60%) and temporary (40%) sets with a fixed random_state
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Split the temporary set into validation (50% of temporary, i.e., 20% of total) and test (50% of temporary, i.e., 20% of total) sets with the same random_state
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [5]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [None]:
# scaler = MinMaxScaler()
# columns_to_scale = [col for col in X_train.columns if col != 'brand']
# X_train_scaled = X_train[columns_to_scale]
# X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
# X_train_scaled = pd.concat([X_train_scaled, X_train[['brand']]], axis=1)
# X_val[columns_to_scale] = scaler.transform(X_val[columns_to_scale])
# X_val_scaled = pd.concat([X_val[columns_to_scale], X_val[['brand']]], axis=1)
# X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])
# X_test_scaled = pd.concat([X_test[columns_to_scale], X_test[['brand']]], axis=1)

In [None]:
# Print the sizes of the resulting datasets
print(f"Training set size: {(X_train_scaled.shape[0])}")
print(f"Validation set size: {(X_val_scaled.shape[0])}")
print(f"Test set size: {(X_test_scaled.shape[0])}")

# Optionally, print the datasets
print("\nTraining Set:")
X_train_scaled
df_ok = pd.DataFrame(X_train_scaled, columns=X_train.columns)
df_ok.head()    

## No API part

In [None]:
w = reg.linear_regression_fit(X_train_scaled, y_train)
y_predict = reg.linear_regression_predict(X_test_scaled, w)


In [None]:
# Calculate mean squared error
mse = mean_squared_error(y_predict, y_test)
print(f"Mean Squared Error: {mse}")

In [None]:
num_iterations = 1000
w, errors = reg.gradient_descent(X_train_scaled, y_train, 0.01, num_iterations)
print(w.shape)
y_predict = reg.linear_regression_predict(X_test_scaled, w)
mean_squared_error(y_predict, y_test)

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(range(num_iterations), errors)
plt.xlabel('Iteration')
plt.ylabel('Error')
plt.title('Gradient Descent Error')
plt.legend()
plt.show()

## Lasso Implementation(API)

In [6]:
lasso = Lasso()
lasso.fit(X_train_scaled, y_train)
y_train_pred = lasso.predict(X_train_scaled)
mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
r2 = r2_score(y_train,y_train_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Absolute Error: 28.071106360081934
Mean Squared Error: 1267.7953113010064
R2 Score: 0.35394121882106466


In [7]:
y_val_pred = lasso.predict(X_val_scaled)
mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val,y_val_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")


Mean Absolute Error: 28.979087600774026
Mean Squared Error: 1324.5781105079673
R2 Score: 0.3484346104564914


In [8]:
# Hyperparamater Values
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100]
}

In [9]:
#Using Grid Search to find the optimal value for alpha

lasso_cv = GridSearchCV(lasso,param_grid,cv = 5,n_jobs = -1)
lasso_cv.fit(X_train_scaled, y_train)

In [10]:
#Testing Lasso on the validation set
y_val_pred = lasso_cv.predict(X_val_scaled)
mae_lasso_val = mean_absolute_error(y_val, y_val_pred)
mse_lasso_val = mean_squared_error(y_val, y_val_pred)
r2_lasso_val = r2_score(y_val, y_val_pred) 
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Absolute Error: 28.979087600774026
Mean Squared Error: 1324.5781105079673
R2 Score: 0.3484346104564914


In [None]:
y_test_pred = lasso_cv.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

## Ridge Implementation (API)

In [None]:
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)

In [None]:
#Using Grid Search to find the optimal value for alpha

ridge_cv = GridSearchCV(ridge,param_grid,cv = 5,n_jobs = -1)
ridge_cv.fit(X_train_scaled, y_train)

In [None]:
#Testing Ridge on the validation set
y_val_pred = ridge_cv.predict(X_val_scaled)
mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred) 
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

In [None]:
y_test_pred = ridge_cv.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

## Linear Regression

In [None]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_val_pred = model.predict(X_val_scaled)
mae_lin_val = mean_absolute_error(y_val, y_val_pred)
mse_lin_val = mean_squared_error(y_val, y_val_pred)
r2_lin_val = r2_score(y_val, y_val_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

In [None]:
y_test_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

    

In [None]:
ratio = ((mse_lin_val / mse_lasso_val))
print(f"Ratio of MSE: {ratio}")

## Polynomial Regression

In [None]:
max_degree = 5
n = len(y_train)
k = X_train_scaled.shape[1]

In [None]:
bic_scores = []  # Ensure this list is empty at the start
degrees = range(2, max_degree + 1)

print("Degree | MSE         | MAE         | R2         | BIC")
print("-" * 50)

for degree in degrees:
    # Generate polynomial features
    poly = PolynomialFeatures(degree=degree)
    x_train_poly = poly.fit_transform(X_train_scaled)
    x_val_poly = poly.fit_transform(X_val_scaled)

    # Fit the model
    model = LinearRegression()
    model.fit(x_train_poly, y_train)

    # Predictions
    y_val_pred = model.predict(x_val_poly)

    # Calculate metrics
    mse = mean_squared_error(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)

    # Calculate RSS and BIC
    rss = np.sum((y_val - y_val_pred) ** 2)
    n = len(y_val)  # Number of samples in validation set
    k = x_val_poly.shape[1]  # Number of parameters (including intercept)
    bic = k * np.log(rss / n) + k * np.log(n)
    bic_scores.append(bic)

    # Print metrics for the current degree
    print(f"{degree:6} | {mse:10.6f} | {mae:10.6f} | {r2:10.6f} | {bic:10.6f}")

# Ensure lengths of degrees and bic_scores are the same
if len(degrees) != len(bic_scores):
    print(f"Mismatch: degrees({len(degrees)}), bic_scores({len(bic_scores)})")

# Plot
plt.figure(figsize=(10, 6))
plt.plot(degrees, bic_scores, marker='o', label='BIC Score')
plt.xlabel('Polynomial Degree')
plt.ylabel('BIC Score')
plt.title('Polynomial Degree vs. BIC Score')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
poly = PolynomialFeatures(degree=2)    
x_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.fit_transform(X_test_scaled)

# Fit the model
model = LinearRegression()
model.fit(x_train_poly, y_train)

    # Predictions
y_test_pred = model.predict(X_test_poly)

    # Calculate metrics
mse = mean_squared_error(y_test, y_test_pred)
mae = mean_absolute_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

In [None]:
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

## Forward Feature Selection

In [None]:
sfs = SequentialFeatureSelector(lasso_cv,n_features_to_select="auto",direction='forward')
sfs.fit(X_train_scaled,y_train)
X_train_selected = sfs.transform(X_train_scaled)
X_val_selected = sfs.transform(X_val_scaled)
lasso_cv.fit(X_train_selected,y_train)
y_val_pred = lasso_cv.predict(X_val_selected)

mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")


In [None]:
X_test_selected = sfs.transform(X_test_scaled)
y_test_pred = lasso_cv.predict(X_test_selected)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")


In [None]:
def forward_feature_selection_with_validation(X_train, y_train, X_val, y_val,model,scoring='mse', max_features=None):

    selected_features = []
    remaining_features = list(range(X_train.shape[1]))
    validation_scores = []

    if max_features is None:
        max_features = X_train.shape[1]

    best_score = float('inf') if scoring in ['mse', 'mae'] else float('-inf')

    for _ in range(max_features):
        best_feature = None
        best_feature_score = best_score

        for feature in remaining_features:
            features_to_test = selected_features + [feature]
            X_train_subset = X_train[:, features_to_test]
            X_val_subset = X_val[:, features_to_test]

            model.fit(X_train_subset, y_train)

            y_val_pred = model.predict(X_val_subset)

            if scoring == 'mse':
                score = mean_squared_error(y_val, y_val_pred)
                is_better = score < best_feature_score  
            elif scoring == 'mae':
                score = mean_absolute_error(y_val, y_val_pred)
                is_better = score < best_feature_score
            elif scoring == 'r2':
                score = model.score(X_val_subset, y_val)
                is_better = score > best_feature_score  
            else:
                raise ValueError("Unsupported scoring metric. Use 'mse' or 'r2'.")

            if is_better:
                best_feature_score = score
                best_feature = feature

        if best_feature is not None:
            selected_features.append(best_feature)
            validation_scores.append(best_feature_score)
            remaining_features.remove(best_feature)
            best_score = best_feature_score 
        else:
            break  

    return selected_features, validation_scores

In [None]:
selected_features, validation_scores = forward_feature_selection_with_validation(
    X_train_scaled, y_train, X_val_scaled, y_val, scoring='mae', max_features=None,model=lasso_cv
)

# Output the results
print("Selected Features:", selected_features)
print("Validation Scores:", validation_scores)

## Support Vector Regression (RBF Kernel)

In [None]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
              }  

grid_search = GridSearchCV(SVR(kernel='rbf'), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get best model from GridSearchCV
svr_best = grid_search.best_estimator_

# Predict using the best model
y_val_pred = svr_best.predict(X_val_scaled)

# Evaluation metrics
mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")