Imports

In [70]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

In [6]:
for dirname, _, filenames in os.walk('cars.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [7]:
pd.set_option('display.max_columns', 100)

In [None]:
cars = pd.read_csv('cars.csv')
cars.head()
cars = cars.drop(columns=['feature_0','feature_1','feature_2','feature_3','feature_4','feature_5','feature_6','feature_7','feature_8','feature_9'], axis=1)
cars.head()
cars.shape
cars.describe()
cars.columns
cars.dtypes
cars.isnull().sum()
cars = cars.dropna()
cars.isnull().sum()

In [9]:
cars['age'] = 2023 - cars['year_produced']
cars.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,location_region,number_of_photos,up_counter,duration_listed,age
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,universal,False,owned,all,10900.0,False,Минская обл.,9,13,16,13
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,universal,False,owned,all,5000.0,True,Минская обл.,12,54,83,21
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,suv,False,owned,all,2800.0,True,Минская обл.,4,72,151,22
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,sedan,False,owned,all,9999.0,True,Минская обл.,9,42,86,24
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,universal,False,owned,all,2134.11,True,Гомельская обл.,14,7,7,22


In [10]:
cars_numeric = cars.select_dtypes(include=['float64', 'int64'])
cars_numeric.head()

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd,number_of_photos,up_counter,duration_listed,age
0,190000,2010,2.5,10900.0,9,13,16,13
1,290000,2002,3.0,5000.0,12,54,83,21
2,402000,2001,2.5,2800.0,4,72,151,22
3,10000,1999,3.0,9999.0,9,42,86,24
4,280000,2001,2.5,2134.11,14,7,7,22


In [13]:
cars['price_usd'] = cars['price_usd'].astype('float64')
temp = cars.copy()
table = temp.groupby(['manufacturer_name'])['price_usd'].mean()
temp = temp.merge(table.reset_index(), how='left', on='manufacturer_name')
table

manufacturer_name
Acura            12772.885909
Alfa Romeo        2688.550097
Audi              7154.944923
BMW               9532.098405
Buick            12876.319149
Cadillac         11093.124651
Chery             4545.978103
Chevrolet         8863.759885
Chrysler          4995.491780
Citroen           4433.126204
Dacia             5342.751525
Daewoo            1576.608733
Dodge             5608.011953
Fiat              2979.752689
Ford              4993.888044
Geely             7769.232535
Great Wall        6423.760278
Honda             6515.096474
Hyundai           7926.190394
Infiniti         13794.604938
Iveco            10052.317122
Jaguar           17813.000000
Jeep             10912.489439
Kia               8156.095526
LADA              7598.681370
Lancia            2901.508370
Land Rover       15195.300380
Lexus            17130.560845
Lifan             8280.536170
Lincoln           9737.472222
Mazda             4731.527101
Mercedes-Benz     9389.817072
Mini             13133

In [14]:
cars.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,location_region,number_of_photos,up_counter,duration_listed,age
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,universal,False,owned,all,10900.0,False,Минская обл.,9,13,16,13
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,universal,False,owned,all,5000.0,True,Минская обл.,12,54,83,21
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,suv,False,owned,all,2800.0,True,Минская обл.,4,72,151,22
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,sedan,False,owned,all,9999.0,True,Минская обл.,9,42,86,24
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,universal,False,owned,all,2134.11,True,Гомельская обл.,14,7,7,22


In [15]:
scaler = MinMaxScaler()
num_vars = ['price_usd','year_produced']
cars[num_vars] = scaler.fit_transform(cars[num_vars])
cars.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,location_region,number_of_photos,up_counter,duration_listed,age
0,Subaru,Outback,automatic,silver,190000,0.883117,gasoline,False,gasoline,2.5,universal,False,owned,all,0.217984,False,Минская обл.,9,13,16,13
1,Subaru,Outback,automatic,blue,290000,0.779221,gasoline,False,gasoline,3.0,universal,False,owned,all,0.099982,True,Минская обл.,12,54,83,21
2,Subaru,Forester,automatic,red,402000,0.766234,gasoline,False,gasoline,2.5,suv,False,owned,all,0.055981,True,Минская обл.,4,72,151,22
3,Subaru,Impreza,mechanical,blue,10000,0.74026,gasoline,False,gasoline,3.0,sedan,False,owned,all,0.199964,True,Минская обл.,9,42,86,24
4,Subaru,Legacy,automatic,black,280000,0.766234,gasoline,False,gasoline,2.5,universal,False,owned,all,0.042663,True,Гомельская обл.,14,7,7,22


In [18]:
X = cars.drop(['price_usd'], axis=1)
y = cars['price_usd']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 21)

In [22]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(26964, 20)
(11557, 20)
(26964,)
(11557,)


In [24]:
cars.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,location_region,number_of_photos,up_counter,duration_listed,age
0,Subaru,Outback,automatic,silver,190000,0.883117,gasoline,False,gasoline,2.5,universal,False,owned,all,0.217984,False,Минская обл.,9,13,16,13
1,Subaru,Outback,automatic,blue,290000,0.779221,gasoline,False,gasoline,3.0,universal,False,owned,all,0.099982,True,Минская обл.,12,54,83,21
2,Subaru,Forester,automatic,red,402000,0.766234,gasoline,False,gasoline,2.5,suv,False,owned,all,0.055981,True,Минская обл.,4,72,151,22
3,Subaru,Impreza,mechanical,blue,10000,0.74026,gasoline,False,gasoline,3.0,sedan,False,owned,all,0.199964,True,Минская обл.,9,42,86,24
4,Subaru,Legacy,automatic,black,280000,0.766234,gasoline,False,gasoline,2.5,universal,False,owned,all,0.042663,True,Гомельская обл.,14,7,7,22


In [27]:
cars = cars[cars["engine_fuel"].isin(cars["engine_type"])]

In [28]:
cars['engine_fuel'].value_counts()

gasoline    24065
diesel      12872
Name: engine_fuel, dtype: int64

In [31]:
cars['location_region'].value_counts()

Минская обл.        23398
Гомельская обл.      3014
Витебская обл.       2896
Брестская обл.       2784
Могилевская обл.     2527
Гродненская обл.     2318
Name: location_region, dtype: int64

In [50]:
cars_df = pd.read_csv("modified_cars.csv")
cars_df.head()

Unnamed: 0,transmission,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_capacity,has_warranty,drivetrain,price_usd,is_exchangeable,number_of_photos,up_counter,duration_listed,age
0,1,190000,0.883117,0,False,2.5,False,2,0.217984,False,9,13,16,13
1,1,290000,0.779221,0,False,3.0,False,2,0.099982,True,12,54,83,21
2,1,402000,0.766234,0,False,2.5,False,2,0.055981,True,4,72,151,22
3,0,10000,0.74026,0,False,3.0,False,2,0.199964,True,9,42,86,24
4,1,280000,0.766234,0,False,2.5,False,2,0.042663,True,14,7,7,22


In [51]:

value_map_ehg = {'FALSE':0, 'TRUE':1}
cars_df["engine_has_gas"] = cars_df["engine_has_gas"].map(value_map_ehg).fillna(cars_df["engine_has_gas"])

value_map_hw = {'FALSE':0, 'TRUE':1}
cars_df["has_warranty"] = cars_df["has_warranty"].map(value_map_hw).fillna(cars_df["has_warranty"])


value_map_ex = {'FALSE':0, 'TRUE':1}
cars_df["is_exchangeable"] = cars_df["is_exchangeable"].map(value_map_ex).fillna(cars_df["is_exchangeable"])

new_path = "final_cars.csv"
cars_df.to_csv(new_path,index=False)




In [52]:

cars_df = pd.read_csv("modified_cars.csv")

In [53]:
cars_df.head()

Unnamed: 0,transmission,odometer_value,engine_fuel,engine_capacity,drivetrain,price_usd,number_of_photos,up_counter,duration_listed,age
0,1,190000,0,2.5,2,0.217984,9,13,16,13
1,1,290000,0,3.0,2,0.099982,12,54,83,21
2,1,402000,0,2.5,2,0.055981,4,72,151,22
3,0,10000,0,3.0,2,0.199964,9,42,86,24
4,1,280000,0,2.5,2,0.042663,14,7,7,22


In [54]:
cars_df["engine_fuel"].value_counts()

0    24065
1    12872
Name: engine_fuel, dtype: int64

In [58]:
X = cars_df.drop(['price_usd'], axis=1)
y = cars_df['price_usd']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 21)

Linear Regression Model

In [60]:
lm = LinearRegression()
lm.fit(X_train, y_train)
lm_y_pred = lm.predict(X_test)
print(lm.coef_)
print(lm.intercept_)
lm_mse = mean_squared_error(lm_y_pred, y_test)
lm_r2 = r2_score(lm_y_pred, y_test)
print(f"Mean Squared Error: {lm_mse}")
print(f"R2 Score: {lm_r2}")

[ 2.19633230e-02 -1.33915326e-07  2.66805508e-02  2.60083676e-02
  4.02410406e-02  1.99388665e-03  2.79073310e-05  1.51082507e-05
 -8.92541403e-03]
0.23766474482673003
Mean Squared Error: 0.005671291513385428
R2 Score: 0.49523719512611986


Random Forest

In [61]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

rfr_y_pred = rfr.predict(X_test)
rfr_mse = mean_squared_error(rfr_y_pred, y_test)
rfr_r2 = r2_score(rfr_y_pred, y_test)
print(f"Mean Squared Error: {rfr_mse}")
print(f"R2 Score: {rfr_r2}")

Mean Squared Error: 0.0022291783365670508
R2 Score: 0.8529616738260144


KNN

In [71]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

knn_y_pred = knn.predict(X_test)
knn_mse = mean_squared_error(knn_y_pred, y_test)
knn_r2 = r2_score(knn_y_pred, y_test)
print(f"Mean Squared Error: {knn_mse}")
print(f"R2 Score: {knn_r2}")

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [62]:
lm_cv_score = cross_val_score(lm, X_train, y_train, cv=5)
print(f"Linear Regression cross validation score: {lm_cv_score}")

Linear Regression cross validation score: [0.65876463 0.66995463 0.66555075 0.64407869 0.6481265 ]


In [63]:
rfr_cv_score = cross_val_score(rfr, X_train, y_train, cv=5)
print(f"Random Forest Regression cross validation: {rfr_cv_score}")

Random Forest Regression cross validation: [0.87295541 0.87234069 0.86663064 0.86044573 0.85792035]


In [65]:
if lm_cv_score.mean() > rfr_cv_score.mean():
    mld_select = lm
    print("Linear Regression Model is selected.")
else:
    mld_select = rfr
    print("Random Forest Regression is selected.")
    
    selected_model = mld_select.fit(X_train, y_train)
print(selected_model)

Random Forest Regression is selected.
RandomForestRegressor()


In [67]:
final_pred = selected_model.predict(X_test)
final_pred
fnl_rfr_mse = mean_squared_error(final_pred, y_test)
fnl_rfr_r2 = r2_score(final_pred, y_test)
print(f"Mean Squared Error: {fnl_rfr_mse}")
print(f"R2 Score: {fnl_rfr_r2}")

lm_acc = lm.score(X_test, y_test)
print(f"Linear model accurcy score : {lm_acc}")
rfr_acc = rfr.score(X_test,y_test)
print(f"Random Forest Regression accurcy score : {lm_acc}")

Mean Squared Error: 0.002216637795493246
R2 Score: 0.8535683202327871
Linear model accurcy score : 0.6612307585923843
Random Forest Regression accurcy score : 0.6612307585923843


Try here

In [83]:
new_data = [1,190000,0,2.5,2,0.21798436,9,13,16,13]
new_data = np.array(new_data)
new_data.reshape(-1,1)


array([[1.0000000e+00],
       [1.9000000e+05],
       [0.0000000e+00],
       [2.5000000e+00],
       [2.0000000e+00],
       [2.1798436e-01],
       [9.0000000e+00],
       [1.3000000e+01],
       [1.6000000e+01],
       [1.3000000e+01]])

In [None]:
# Assuming you have new data in the variable 'new_data'
linear_predictions = lm.predict(new_data)

In [None]:
# Assuming you have new data in the variable 'new_data'
rf_predictions = rfr.predict(new_data)


In [None]:
linear_mse = mean_squared_error(true_labels, linear_predictions)
linear_r2 = r2_score(true_labels, linear_predictions)

rf_mse = mean_squared_error(true_labels, rf_predictions)
rf_r2 = r2_score(true_labels, rf_predictions)

print(f"Linear Regression MSE: {linear_mse}, R-squared: {linear_r2}")
print(f"Random Forest MSE: {rf_mse}, R-squared: {rf_r2}")