In [1]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

In [2]:
df = pd.read_csv("../data/clean/data.csv")
print(df.shape)
df.head(3)

(725139, 12)


Unnamed: 0,date_sold_new,make,model,year,sale_price_new,car_age_sold_new,fuel,gear,hp,used_mileage,sale_price_used,price_difference_sold_used
0,2022-05-13,chevrolet,silverado,2019,41216,3,gasoline,automatic,355.0,14310.0,68900.0,27684.0
1,2022-05-25,honda,civic,2011,17997,11,gasoline,manual,99.0,61999.0,7949.0,-10048.0
2,2022-05-25,honda,civic,2011,17997,11,gasoline,manual,99.0,221300.0,4450.0,-13547.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725139 entries, 0 to 725138
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   date_sold_new               725139 non-null  object 
 1   make                        725139 non-null  object 
 2   model                       725139 non-null  object 
 3   year                        725139 non-null  int64  
 4   sale_price_new              725139 non-null  int64  
 5   car_age_sold_new            725139 non-null  int64  
 6   fuel                        725139 non-null  object 
 7   gear                        725139 non-null  object 
 8   hp                          725139 non-null  float64
 9   used_mileage                725139 non-null  float64
 10  sale_price_used             725139 non-null  float64
 11  price_difference_sold_used  725139 non-null  float64
dtypes: float64(4), int64(3), object(5)
memory usage: 66.4+ MB


In [11]:
df['year'].value_counts()

year
2014    121534
2020    114865
2013     92952
2019     92647
2021     77830
2015     70272
2011     46662
2012     38955
2016     30794
2018     23202
2017     15426
Name: count, dtype: int64

In [4]:
X = df.drop(columns=['date_sold_new', 'car_age_sold_new', 'sale_price_used'])
y = df['price_difference_sold_used']

In [5]:
numeric_features = ['hp', 'used_mileage', 'year', 'sale_price_new']
numeric_transformer = StandardScaler()

In [6]:
categorical_features = ['make', 'model', 'fuel', 'gear']
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [8]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())  # Change to SVM or other models as needed
])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Evaluate the model on the test set
score = pipeline.score(X_test, y_test)
print(f'Model R^2 score on test set: {score}')

Model R^2 score on test set: 0.9790382603745849


In [10]:
joblib.dump(pipeline, '../data/models/pipeline.pkl')

['../data/models/pipeline.pkl']