# Task
 In this assignment, we will work with the used car dataset, applying data
 cleansing, linear regression, and K-Nearest Neighbors regression to
 predict car prices.

## Data loading


In [None]:
import pandas as pd

# Load the raw listings into `df`.
# Every later cell needs this frame, so a missing file is reported right here
# with an actionable message instead of continuing with `df = None` and
# failing later with an unrelated-looking AttributeError.
try:
    df = pd.read_csv('used_cars.csv')
except FileNotFoundError as exc:
    raise FileNotFoundError(
        "Error: 'used_cars.csv' not found. "
        "Put the file in the notebook's working directory and re-run this cell."
    ) from exc

display(df.head())

## Data cleaning


In [None]:
import numpy as np

# Rename the 'milage' column to 'mileage'
df = df.rename(columns={'milage': 'mileage'})

# Parse 'mileage' and 'model_year' as numbers: strip every character that is
# not a digit or '.', coerce anything still unparseable to NaN, then fill the
# gaps with the column median.
for col in ['mileage', 'model_year']:
    digits_only = df[col].astype(str).str.replace(r'[^\d.]', '', regex=True)
    df[col] = pd.to_numeric(digits_only, errors='coerce')
    df[col] = df[col].fillna(df[col].median())

# Handle missing values in the remaining columns (simple imputation for
# demonstration): numeric columns get their mean, all other columns get their
# most frequent value. The numeric check is done with is_numeric_dtype rather
# than `dtype == 'object'` so that text stored under a non-object dtype
# (string, category) is not sent to mean(), which would raise.
for col in df.columns:
    if not df[col].isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())
    else:
        most_frequent = df[col].mode()
        # mode() is empty when the column holds no values at all; there is
        # nothing sensible to impute in that case, so leave it untouched.
        if not most_frequent.empty:
            df[col] = df[col].fillna(most_frequent.iloc[0])

# Remove duplicate rows
df = df.drop_duplicates()

display(df.head())

In [None]:
# Columns to one-hot encode. drop_first=True keeps k-1 indicator columns for a
# column with k distinct values.
categorical_cols = ["fuel_type", "clean_title", "engine", "transmission", "ext_col", "int_col"]

# Check the inputs up front and fail loudly. The previous blanket
# `except Exception` printed the error and replaced df with None, which threw
# the data away and only postponed the crash to the next cell.
missing_cols = [col for col in categorical_cols if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"An error occurred during one-hot encoding: columns not found in df: {missing_cols}"
    )

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
display(df.head())

## Data exploration


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Parse 'price' as a number: keep only digits and '.', and let anything that
# still cannot be parsed become NaN.
price_text = df['price'].astype(str).str.replace(r'[^\d.]', '', regex=True)
df['price'] = pd.to_numeric(price_text, errors='coerce')

# (column, display label) pairs shared by the side-by-side figures below
panels = (('price', 'Price'), ('mileage', 'Mileage'))

# Distributions: histogram with a KDE overlay for each column
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (col, label) in zip(axes, panels):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Boxplots of the same two columns
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (col, label) in zip(axes, panels):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f'{label} Boxplot')
fig.tight_layout()
plt.show()

# Descriptive statistics
print(df[['price', 'mileage']].describe())

# Price against mileage, then price against model year
for col, label in (('mileage', 'Mileage'), ('model_year', 'Model Year')):
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x=col, y='price', data=df, ax=ax)
    ax.set_title(f'Price vs. {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Price')
    plt.show()

## Data cleaning

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Outlier bounds: 'price' is trimmed at its 5th and 95th percentiles,
# 'mileage' at its 1st and 99th percentiles.
price_lower_bound = df['price'].quantile(0.05)
price_upper_bound = df['price'].quantile(0.95)
mileage_lower_bound = df['mileage'].quantile(0.01)
mileage_upper_bound = df['mileage'].quantile(0.99)

# Keep rows that fall inside both ranges (bounds inclusive). A NaN price or
# mileage never satisfies the range test, so those rows are dropped as well.
in_price_range = df['price'].between(price_lower_bound, price_upper_bound)
in_mileage_range = df['mileage'].between(mileage_lower_bound, mileage_upper_bound)
filtered_df = df[in_price_range & in_mileage_range]

# (column, display label) pairs for the side-by-side figures below
panels = (('price', 'Price'), ('mileage', 'Mileage'))

# Histograms of the filtered data
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (col, label) in zip(axes, panels):
    sns.histplot(filtered_df[col], kde=True, ax=ax)
    ax.set_title(f'{label} Distribution (Filtered)')
fig.tight_layout()
plt.show()

# Boxplots of the filtered data
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (col, label) in zip(axes, panels):
    sns.boxplot(y=filtered_df[col], ax=ax)
    ax.set_title(f'{label} Boxplot (Filtered)')
fig.tight_layout()
plt.show()

# Report how many rows the filter removed
print(f"Original df shape: {df.shape}")
print(f"Filtered df shape: {filtered_df.shape}")

## Data preparation


In [None]:
# Select features (independent variables): the two numeric columns plus every
# indicator column produced by one-hot encoding, recognised by its prefix.
features = ['mileage', 'model_year']
one_hot_encoded_cols = [col for col in filtered_df.columns if col.startswith(('fuel_type_', 'clean_title_', 'engine_', 'transmission_', 'ext_col_', 'int_col_'))]
features.extend(one_hot_encoded_cols)
X = filtered_df[features]

# Select target variable (dependent variable)
y = filtered_df['price']

# Hand scikit-learn plain float arrays. `.values` on a frame that mixes float
# columns with boolean indicator columns yields an object-dtype array, so the
# dtype is requested explicitly.
X = X.to_numpy(dtype=float)
y = y.to_numpy(dtype=float)

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

## Data splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each piece
for part_name, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{part_name} shape: {part.shape}")

## Model training

### Subtask:
Train a linear regression model.


In [None]:
from sklearn.linear_model import LinearRegression

# Build the linear regression model and fit it on the training split
# (fit() returns the fitted estimator).
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the test split, then raise every prediction below 7500 up to 7500
# (no upper limit is applied).
y_pred_lr = lr_model.predict(X_test)
y_pred_lr = np.clip(y_pred_lr, 7500, None)

## Model evaluation


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# MSE of the linear regression predictions on the test split
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)

# RMSE is the square root of the MSE, in the same units as price
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse}")

## Data visualization


In [None]:
import matplotlib.pyplot as plt

# Actual (blue) and predicted (red) prices plotted against the first feature
# column of X_test, which is labelled as mileage on the x-axis.
fig, ax = plt.subplots(figsize=(10, 6))
test_mileage = X_test[:, 0]
ax.scatter(test_mileage, y_test, color='blue', label='Actual Prices')
ax.scatter(test_mileage, y_pred_lr, color='red', label='Predicted Prices')
ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
ax.set_title('Actual vs. Predicted Prices')
ax.legend()
plt.show()

## Model training

Train a K-Nearest Neighbors (KNN) regression model.


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


# Candidate neighbour counts, with one RMSE and one prediction array collected
# per candidate (same order as n_neighbors_values).
n_neighbors_values = [3, 5, 7]
rmse_values = []
y_pred_knn_values = []

for n_neighbors in n_neighbors_values:
    # Fit a KNN regressor for this k (fit() returns the fitted estimator)
    knn_model = KNeighborsRegressor(n_neighbors=n_neighbors).fit(X_train, y_train)

    # Predict the test split; predictions below 0 are raised to 0
    y_pred_knn = np.clip(knn_model.predict(X_test), 0, None)
    y_pred_knn_values.append(y_pred_knn)

    # Score this k by RMSE on the test split
    mse = mean_squared_error(y_test, y_pred_knn)
    rmse = np.sqrt(mse)
    rmse_values.append(rmse)

    print(f"For n_neighbors = {n_neighbors}: RMSE = {rmse}")

## Model evaluation

Evaluate the KNN regression model's performance using Root Mean Squared Error (RMSE).


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Report the KNN run with 7 neighbours. Its predictions are looked up by k
# rather than by a hard-coded list position, so the printed label cannot drift
# from the data if n_neighbors_values is edited; a k that was never fitted
# raises ValueError instead of silently scoring another model.
k_to_report = 7
knn_predictions = y_pred_knn_values[n_neighbors_values.index(k_to_report)]

# Calculate the mean squared error (MSE)
mse = mean_squared_error(y_test, knn_predictions)

# Calculate the root mean squared error (RMSE)
rmse = np.sqrt(mse)

# Print the RMSE value
print(f"Root Mean Squared Error (RMSE) for KNN (n_neighbors={k_to_report}): {rmse}")

## Data visualization

Visualize actual vs. predicted prices for the KNN model.


In [None]:
import matplotlib.pyplot as plt

# Plot the KNN run with 7 neighbours. Its predictions are looked up by k rather
# than by a hard-coded list position, so the title always matches the data
# being plotted; a k that was never fitted raises ValueError.
k_to_plot = 7
knn_predictions = y_pred_knn_values[n_neighbors_values.index(k_to_plot)]

# Actual (blue) and predicted (red) prices plotted against the first feature
# column of X_test, which is labelled as mileage on the x-axis.
plt.figure(figsize=(10, 6))
plt.scatter(X_test[:, 0], y_test, color='blue', label='Actual Prices')
plt.scatter(X_test[:, 0], knn_predictions, color='red', label='Predicted Prices')
plt.xlabel('Mileage')
plt.ylabel('Price')
plt.title(f'Actual vs. Predicted Prices (KNN, k={k_to_plot})')
plt.legend()
plt.show()

## Summary:

### 1. Q&A

* **What is the RMSE of the linear regression model?** 11911
* **What is the RMSE of the KNN model with 7 neighbors?** 18015
* **Which model performed better based on RMSE?**  The linear regression model (RMSE = 11911) performed better than the KNN model with 7 neighbors (RMSE = 18015).

**Note:** A floor of 7,500 was applied to the linear regression predictions because the model began predicting a car worth of $0 for some data points.

### 2. Data Analysis Key Findings

* **Outlier Handling:** Outliers in 'price' were removed using the 5th and 95th percentiles, and outliers in 'mileage' were removed using the 1st and 99th percentiles, reducing the dataset size from 4009 to 2844 rows.

**Note:** The initial model produced an RMSE of 20247, so we increased the number of parameters used in order to decrease the RMSE.

**Google Gemini was used to help produce code**