In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Load the raw used-car listings.
# The CSV location can be overridden with the USED_CARS_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used unchanged.
import os

DATA_PATH = os.environ.get("USED_CARS_CSV", r"C:\Users\jaden\Downloads\used_cars.csv")
df = pd.read_csv(DATA_PATH)





In [None]:
# Convert the raw 'milage' text into a numeric column: strip thousands
# separators and a trailing "mi" / "mi." unit, then parse.
# The astype(str) cast makes this cell safe to re-run: once the column has
# been converted to numbers, a bare `.str` accessor raises AttributeError.
# Values that still cannot be parsed (including missing ones) become NaN.
df['milage'] = pd.to_numeric(
    df['milage']
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace(r"\s*mi\.?", "", regex=True),
    errors='coerce',
)


In [None]:
# Convert the raw 'price' text into a numeric column by removing the "$"
# sign and thousands separators before parsing.
# The astype(str) cast makes this cell safe to re-run: once the column has
# been converted to numbers, a bare `.str` accessor raises AttributeError.
# Values that still cannot be parsed (including missing ones) become NaN.
df['price'] = pd.to_numeric(
    df['price']
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False),
    errors='coerce',
)

In [None]:
# Force 'model_year' to a numeric dtype; entries that cannot be parsed
# are turned into NaN rather than raising.
df = df.assign(model_year=pd.to_numeric(df['model_year'], errors='coerce'))

In [None]:
# Discard rows where any of the three numeric fields failed to parse
# (the coercion steps above leave NaN in those positions).
df = df.dropna(subset=['milage', 'model_year', 'price'])
print("Data shape after dropping NaNs:", df.shape)

In [None]:
# Keep the ten most frequent brands as-is and collapse every other brand
# into a single 'Other' bucket, limiting how many dummy columns the later
# one-hot encoding creates for 'brand'.
top_brands = df['brand'].value_counts().nlargest(10).index
df['brand'] = df['brand'].where(df['brand'].isin(top_brands), 'Other')

In [None]:
# Remove the 'engine' column when it exists; errors='ignore' makes this a
# no-op if the column is absent.
df = df.drop(columns=['engine'], errors='ignore')

In [None]:
# Candidate categorical columns to one-hot encode.
categorical_cols = [
    'brand',
    'fuel_type',
    'clean_title',
    'transmission',
    'ext_col',
    'int_col',
]

# Only encode the candidates that are actually present in the frame.
cols_to_encode = [name for name in categorical_cols if name in df.columns]

# Replace each selected column with its indicator columns; every category
# level is kept (drop_first=False).
df = pd.get_dummies(
    df,
    columns=cols_to_encode,
    drop_first=False,
)
print("Data shape after one-hot encoding:", df.shape)

In [None]:
# IQR rule on the target: keep only rows whose price lies inside
# [Q1 - multiplier * IQR, Q3 + multiplier * IQR], bounds included.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1
multiplier = 1.5
lower_bound = Q1 - multiplier * IQR
upper_bound = Q3 + multiplier * IQR

# between() is inclusive on both ends by default.
df = df[df['price'].between(lower_bound, upper_bound)]
print("Data shape after removing outliers:", df.shape)

In [None]:
# Columns that must not be used as predictors: the target itself plus
# 'model' and 'accident', which were not one-hot encoded above.
excluded_cols = ['price', 'model', 'accident']

# Feature matrix = every remaining column, in its existing order.
X = df.drop(columns=excluded_cols, errors='ignore')
feature_cols = list(X.columns)

# Regression target.
y = df['price']

print("Final feature count:", len(feature_cols))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a ridge regression (alpha=1.0) on the scaled training features.
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train_scaled, y_train)

# Score on the held-out rows: RMSE is the square root of the test MSE.
y_pred_ridge = ridge_reg.predict(X_test_scaled)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
print(f"Ridge Regression RMSE: {rmse_ridge:.2f}")

# Overlay actual and predicted prices against the unscaled test mileage.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_test['milage'], y_test, alpha=0.5, label='Actual Price')
ax.scatter(X_test['milage'], y_pred_ridge, alpha=0.5, label='Predicted (Ridge)')
ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
ax.set_title('Ridge Regression: Actual vs. Predicted')
ax.legend()
plt.show()

In [None]:
# Fit a 5-nearest-neighbours regressor on the scaled training features.
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X_train_scaled, y_train)

# Score on the held-out rows: RMSE is the square root of the test MSE.
y_pred_knn = knn_reg.predict(X_test_scaled)
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
print(f"KNN Regression RMSE: {rmse_knn:.2f}")

# Overlay actual and predicted prices against the unscaled test mileage.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_test['milage'], y_test, alpha=0.5, label='Actual Price')
ax.scatter(X_test['milage'], y_pred_knn, alpha=0.5, label='Predicted (KNN)')
ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
ax.set_title('KNN Regression: Actual vs. Predicted')
ax.legend()
plt.show()