In [13]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

In [14]:
df = pd.read_csv("../data/autoscout24.csv")
print(df.shape)
df.head(3)

(39822, 8)


Unnamed: 0,mileage,make,model,fuel,gear,price,hp,year
0,235000,bmw,316,diesel,manual,6800,116.0,2011
1,92800,volkswagen,golf,gasoline,manual,6877,122.0,2011
2,149300,seat,exeo,gasoline,manual,6900,160.0,2011


In [15]:
df['make'].unique()

array(['bmw', 'volkswagen', 'seat', 'renault', 'peugeot', 'toyota',
       'opel', 'mazda', 'ford', 'mercedes-benz', 'chevrolet', 'audi',
       'fiat', 'kia', 'dacia', 'mini', 'hyundai', 'skoda', 'citroen',
       'infiniti', 'suzuki', 'ssangyong', 'smart', 'volvo', 'jaguar',
       'porsche', 'nissan', 'honda', 'mitsubishi', 'lexus', 'cupra',
       'maserati', 'bentley', 'land', 'alfa', 'jeep', 'subaru', 'dodge',
       'microcar', 'lamborghini', 'lada', 'tesla', 'chrysler', 'mclaren',
       'aston', 'rolls-royce', 'lancia', 'abarth', 'ds', 'daihatsu',
       'ligier', 'aixam', 'morgan', 'maybach', 'ram', 'ferrari', 'alpina',
       'polestar', 'brilliance', 'piaggio', 'fisker', 'others',
       'cadillac', 'iveco', 'isuzu', 'corvette', 'baic', 'dfsk',
       'estrima'], dtype=object)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39822 entries, 0 to 39821
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   mileage  39822 non-null  int64  
 1   make     39822 non-null  object 
 2   model    39822 non-null  object 
 3   fuel     39822 non-null  object 
 4   gear     39822 non-null  object 
 5   price    39822 non-null  int64  
 6   hp       39822 non-null  float64
 7   year     39822 non-null  int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 2.4+ MB


In [17]:
df['year'].value_counts()

year
2013    4236
2016    4225
2017    4179
2012    4177
2015    4172
2018    4152
2014    4140
2011    4095
2019    3761
2020    2083
2021     602
Name: count, dtype: int64

In [18]:
X = df.drop(columns=['price'])
y = df['price']

In [19]:
numeric_features = ['hp', 'mileage', 'year']
numeric_transformer = StandardScaler()

In [20]:
categorical_features = ['make', 'model', 'fuel', 'gear']
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [22]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())  # Change to SVM or other models as needed
])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Evaluate the model on the test set
score = pipeline.score(X_test, y_test)
print(f'Model R^2 score on test set: {score}')

Model R^2 score on test set: 0.8654461022353152




In [24]:
joblib.dump(pipeline, '../data/models/pipeline.pkl')

['../data/models/pipeline.pkl']