Patrick Hollenbach & William Halm
Finance 4931/5931 HW3

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math


In [None]:
# Load the used-car listings into a DataFrame. The path is relative, so the CSV
# has to be in the notebook's working directory.
used_car_df = pd.read_csv('used_cars.csv')

In [None]:
# Preview the first rows to check the raw column names and value formats before cleaning.
used_car_df.head()

In [None]:
#1.1 Correct typo
# Normalise every header (trim surrounding whitespace, lowercase) and, in the
# same pass, map the misspelled 'milage' header to 'mileage'. Headers that are
# not in the typo map are kept as normalised.
used_car_df.columns = [
    {'milage': 'mileage'}.get(header, header)
    for header in (raw.strip().lower() for raw in used_car_df.columns)
]
# Print the resulting header list so the rename can be verified by eye.
print(used_car_df.columns.tolist())

In [None]:
#1.2 Convert the mileage and price text columns to numbers
def _leading_number(text_col):
    """Parse the first run of digits/commas in each string of ``text_col`` as a float.

    The commas (thousands separators) are removed before the cast, so a value
    such as '51,000 mi.' becomes 51000.0. Entries with no digits come back as NaN.
    """
    digit_run = text_col.str.extract(r'([\d,]+)')[0]
    return digit_run.str.replace(',', '').astype(float)


# Keep the original text columns and store the parsed values next to them.
used_car_df['mileage_num'] = _leading_number(used_car_df['mileage'])
used_car_df['price_num'] = _leading_number(used_car_df['price'])

In [None]:
#1.3 Encode categorical variables
# One-hot encode the listed columns. drop_first=True leaves out the first level
# of each column, so that level acts as the baseline for the remaining dummies.
# NOTE(review): with drop_first=True a column that has only one non-null level
# produces no dummy column at all - confirm each listed column has >= 2 levels.
categorical_columns = [
    'fuel_type',
    'clean_title',
    'engine',
    'transmission',
    'ext_col',
    'int_col',
]
used_car_df = pd.get_dummies(
    used_car_df,
    columns=categorical_columns,
    drop_first=True,
)

In [None]:
#1.4 Remove anomalies
print(f'before getting rid of anomalies, there are {used_car_df.shape[0]} rows')
# Keep rows whose price lies strictly between 0 and 300,000 and whose mileage
# lies strictly between 0 and 250,000. Rows with a missing price or mileage
# fail the comparisons and are dropped as well.
price_ok = (used_car_df['price_num'] > 0) & (used_car_df['price_num'] < 300000)
mileage_ok = (used_car_df['mileage_num'] > 0) & (used_car_df['mileage_num'] < 250000)
used_car_df = used_car_df[price_ok & mileage_ok]
print(f'after getting rid of anomalies, there are {used_car_df.shape[0]} rows')

In [None]:
#2.1 Linear Regression Model - Feature Selection
# Features: the two numeric columns plus every column whose name begins with one
# of the categorical column names (the dummy columns created by get_dummies).
dummy_prefixes = tuple(categorical_columns)
feature_names = ['mileage_num', 'model_year']
feature_names.extend(
    name for name in used_car_df.columns if name.startswith(dummy_prefixes)
)
X = used_car_df[feature_names]
# Target: the numeric price.
y = used_car_df['price_num']

In [None]:
#2.2 Split data set - 80% training 20% testing
# test_size=0.2 holds out 20% of the rows for evaluation; random_state=42 fixes
# the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
#2.3 Train linear regression model
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
model

In [None]:
#2.4 Evaluate the model -RMSE
# Predict prices for the held-out rows.
y_pred = model.predict(X_test)
# RMSE is the square root of the mean squared error, in the same units as price.
test_mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(test_mse)
print(f'RMSE: {rmse}')


In [None]:
#2.5 Visualization
# Plot actual and predicted prices of the test rows against mileage on one axes.
fig, ax = plt.subplots(figsize=(8, 6))

test_mileage = X_test['mileage_num']
for prices, legend_text, colour in (
    (y_test, "Actual Price", 'blue'),
    (y_pred, "Predicted Price", 'red'),
):
    ax.scatter(test_mileage, prices, label=legend_text, color=colour, alpha=0.6)

ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
ax.set_title('Actual vs Predicted Prices')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
#3.1 KNN Regression
# Number of neighbours whose prices are averaged for each prediction
k = 5

# Create and train the KNN model on the same training split as the linear model.
# NOTE(review): the features are passed in unscaled, so neighbour distances are
# presumably dominated by the column with the largest numeric range (likely
# mileage_num) - consider standardising the features before fitting.
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, y_train)

# Generate predictions for the held-out test rows
y_knn_pred = knn.predict(X_test)

#3.2 RMSE
# Same metric as the linear model, so the two numbers are directly comparable.
knn_rmse = math.sqrt(mean_squared_error(y_test, y_knn_pred))
print(f'KNN RMSE: {knn_rmse}')

#3.3 Visualization
# Open a fresh figure at the same size as the linear-regression plot.
plt.figure(figsize=(8, 6))

plt.scatter(X_test['mileage_num'], y_test, label="Actual Price", color='blue', alpha=0.6)
plt.scatter(X_test['mileage_num'], y_knn_pred, label=f"KNN Predicted Price (k={k})", color='red', alpha=0.6)

plt.xlabel('Mileage')
plt.ylabel('Price')
plt.title('Actual vs Predicted Prices')
plt.legend()
plt.grid(True)
plt.show()