In [3]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.base import clone
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [4]:
# Read the CSV that every later cell works from.
csv_path = 'data/df_imputing.csv'
df = pd.read_csv(csv_path)

In [5]:
# Peek at the first five rows.
df.head(n=5)

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,33590.0,2014.0,gmc,good,8 cylinders,gas,57923.0,clean,other,,,pickup,white,al
1,22590.0,2010.0,chevrolet,good,8 cylinders,gas,71229.0,clean,other,,,pickup,blue,al
2,39590.0,2020.0,chevrolet,good,8 cylinders,gas,19160.0,clean,other,,,pickup,red,al
3,30990.0,2017.0,toyota,good,8 cylinders,gas,41124.0,clean,other,,,pickup,red,al
4,15000.0,2013.0,ford,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,full-size,truck,black,al


In [6]:
# Column dtypes and non-null counts; used to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258940 entries, 0 to 258939
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         258940 non-null  float64
 1   year          258937 non-null  float64
 2   manufacturer  258940 non-null  object 
 3   condition     167071 non-null  object 
 4   cylinders     160807 non-null  object 
 5   fuel          257606 non-null  object 
 6   odometer      257861 non-null  float64
 7   title_status  255548 non-null  object 
 8   transmission  257814 non-null  object 
 9   drive         184571 non-null  object 
 10  size          83481 non-null   object 
 11  type          201080 non-null  object 
 12  paint_color   188093 non-null  object 
 13  state         258940 non-null  object 
dtypes: float64(3), object(11)
memory usage: 27.7+ MB


In [7]:
# Distinct `year` values in ascending order (missing entries sort last).
pd.unique(df['year'].sort_values())

array([1900., 1905., 1913., 1915., 1916., 1918., 1921., 1922., 1923.,
       1924., 1925., 1926., 1927., 1928., 1929., 1930., 1931., 1932.,
       1933., 1934., 1935., 1936., 1937., 1938., 1939., 1940., 1941.,
       1942., 1943., 1944., 1945., 1946., 1947., 1948., 1949., 1950.,
       1951., 1952., 1953., 1954., 1955., 1956., 1957., 1958., 1959.,
       1960., 1961., 1962., 1963., 1964., 1965., 1966., 1967., 1968.,
       1969., 1970., 1971., 1972., 1973., 1974., 1975., 1976., 1977.,
       1978., 1979., 1980., 1981., 1982., 1983., 1984., 1985., 1986.,
       1987., 1988., 1989., 1990., 1991., 1992., 1993., 1994., 1995.,
       1996., 1997., 1998., 1999., 2000., 2001., 2002., 2003., 2004.,
       2005., 2006., 2007., 2008., 2009., 2010., 2011., 2012., 2013.,
       2014., 2015., 2016., 2017., 2018., 2019., 2020., 2021., 2022.,
         nan])

In [8]:
# Distinct `manufacturer` labels, alphabetically sorted.
pd.unique(df['manufacturer'].sort_values())

array(['acura', 'alfa-romeo', 'aston-martin', 'audi', 'bmw', 'buick',
       'cadillac', 'chevrolet', 'chrysler', 'datsun', 'dodge', 'ferrari',
       'fiat', 'ford', 'gmc', 'harley-davidson', 'honda', 'hyundai',
       'infiniti', 'jaguar', 'jeep', 'kia', 'land rover', 'lexus',
       'lincoln', 'mazda', 'mercedes-benz', 'mercury', 'mini',
       'mitsubishi', 'morgan', 'nissan', 'pontiac', 'porsche', 'ram',
       'saturn', 'subaru', 'tesla', 'toyota', 'volkswagen', 'volvo'],
      dtype=object)

In [9]:
# Distinct `condition` labels, sorted (missing entries sort last).
pd.unique(df['condition'].sort_values())

array(['excellent', 'fair', 'good', 'like new', 'new', 'salvage', nan],
      dtype=object)

In [10]:
# Distinct `cylinders` labels, sorted as strings (missing entries sort last).
pd.unique(df['cylinders'].sort_values())

array(['10 cylinders', '12 cylinders', '3 cylinders', '4 cylinders',
       '5 cylinders', '6 cylinders', '8 cylinders', 'other', nan],
      dtype=object)

In [11]:
# Distinct `fuel` labels, sorted (missing entries sort last).
pd.unique(df['fuel'].sort_values())

array(['diesel', 'electric', 'gas', 'hybrid', 'other', nan], dtype=object)

In [12]:
# Three lowest odometer readings, to check the lower bound of the column.
df['odometer'].sort_values().iloc[:3]

178273    0.0
33307     0.0
18909     0.0
Name: odometer, dtype: float64

In [13]:
# Distinct `title_status` labels, sorted (missing entries sort last).
pd.unique(df['title_status'].sort_values())

array(['clean', 'lien', 'missing', 'parts only', 'rebuilt', 'salvage',
       nan], dtype=object)

In [14]:
# Distinct `transmission` labels, sorted (missing entries sort last).
pd.unique(df['transmission'].sort_values())

array(['automatic', 'manual', 'other', nan], dtype=object)

In [15]:
# Distinct `drive` labels, sorted (missing entries sort last).
pd.unique(df['drive'].sort_values())

array(['4wd', 'fwd', 'rwd', nan], dtype=object)

In [16]:
# Distinct `size` labels, sorted (missing entries sort last).
pd.unique(df['size'].sort_values())

array(['compact', 'full-size', 'mid-size', 'sub-compact', nan],
      dtype=object)

In [17]:
# Distinct `type` labels, sorted (missing entries sort last).
pd.unique(df['type'].sort_values())

array(['SUV', 'bus', 'convertible', 'coupe', 'hatchback', 'mini-van',
       'offroad', 'other', 'pickup', 'sedan', 'truck', 'van', 'wagon',
       nan], dtype=object)

In [18]:
# Distinct `paint_color` labels, sorted (missing entries sort last).
pd.unique(df['paint_color'].sort_values())

array(['black', 'blue', 'brown', 'custom', 'green', 'grey', 'orange',
       'purple', 'red', 'silver', 'white', 'yellow', nan], dtype=object)

In [32]:
# Distinct `state` codes, alphabetically sorted.
pd.unique(df['state'].sort_values())

array(['ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
       'hi', 'ia', 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me',
       'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm',
       'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
       'ut', 'va', 'vt', 'wa', 'wi', 'wv', 'wy'], dtype=object)

In [19]:
# Column-name groups by kind: numeric names are listed by hand, categorical
# names are every object-dtype column of `df`.
num_features = ['year', 'odometer']
cat_features = df.select_dtypes(include='object').columns.to_numpy()

In [20]:
# Numeric branch: fill missing values with KNNImputer, then standardise.
pipe_cont = Pipeline([
    ('imp_knn', KNNImputer()),
    ('ss', StandardScaler())
])

# Categorical branch: one-hot encode (dropping the first level of each feature),
# then run KNNImputer on the encoded matrix.
#
# handle_unknown='ignore' makes a category that was not seen during fit encode as
# all zeros instead of raising, so the fitted pipeline can still transform rows
# containing a level that was absent from the training split.
#
# NOTE(review): OneHotEncoder treats NaN as a category of its own, so the encoded
# matrix presumably contains no missing values and the imputer step below has
# nothing to fill. Confirm whether imputing *before* encoding was intended.
pipe_cats = Pipeline(
    [
        ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
        ('imp_knn', KNNImputer())
    ]
)

In [21]:
# Route columns to a preprocessing branch by dtype: numeric columns go through
# `pipe_cont`, object columns through `pipe_cats`.
numeric_selector = make_column_selector(dtype_include=np.number)
object_selector = make_column_selector(dtype_include=object)

col_trans = ColumnTransformer(
    transformers=[
        ('num_ct', pipe_cont, numeric_selector),
        ('cat_ct', pipe_cats, object_selector),
    ]
)

In [22]:
# Preprocessing followed by a random-forest regressor.
# random_state is fixed (matching the seed used for the train/test split) so that
# refitting on the same data reproduces the same forest and the same scores.
rf_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rf', RandomForestRegressor(
            n_estimators=150,
            min_samples_leaf=3,
            min_samples_split=5,
            random_state=42,
        ))
    ]
)

In [23]:
# Separate the regression target from the predictor columns.
target_col = 'price'
y = df[target_col]
X = df.drop(columns=[target_col])

In [24]:
# Seeded train/test partition of the features and the target.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [25]:
# Fit the preprocessing steps and the random forest on the training split.
rf_pipe.fit(X=X_train, y=y_train)

In [26]:
# (train score, test score) for the random-forest pipeline.
tuple(
    rf_pipe.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.9245763258087301, 0.836315178672695)

In [27]:
# Preprocessing followed by an extra-trees regressor.
#
# `col_trans` is also the 'ct' step of `rf_pipe`, and a Pipeline fits the step
# objects it is given in place. Passing the same instance here would make
# `etr_pipe.fit` refit the transformer that `rf_pipe` holds, so this pipeline
# gets its own unfitted copy via clone().
#
# random_state is fixed so that refitting on the same data reproduces the same
# ensemble and the same scores.
etr_pipe = Pipeline(
    [
        ('ct', clone(col_trans)),
        ('etr', ExtraTreesRegressor(
            n_estimators=100,
            min_samples_leaf=3,
            min_samples_split=5,
            random_state=42,
        ))
    ]
)

In [28]:
# Fit the preprocessing steps and the extra-trees model on the training split.
etr_pipe.fit(X=X_train, y=y_train)

In [29]:
# (train score, test score) for the extra-trees pipeline.
tuple(
    etr_pipe.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.9410951832061604, 0.8413400814669261)

In [30]:
# Saving the fitted extra-trees pipeline is disabled (left commented out).
# Uncommenting the two lines below pickles `etr_pipe` to
# 'streamlit_app/etr_pipe.pkl'. Only unpickle such files from a trusted source:
# pickle.load can execute arbitrary code.
# with open('streamlit_app/etr_pipe.pkl', 'wb') as f:
#     pickle.dump(etr_pipe, f)

In [31]:
# Saving the fitted random-forest pipeline is disabled (left commented out).
# Uncommenting the two lines below pickles `rf_pipe` to
# 'streamlit_app/rf_pipe.pkl'. Only unpickle such files from a trusted source:
# pickle.load can execute arbitrary code.
# with open('streamlit_app/rf_pipe.pkl', 'wb') as f:
#     pickle.dump(rf_pipe, f)