# **Cars 4 You | Machine Learning Project**

### **Import Libraries**

In [35]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

### **Import Data**

In [3]:
df = pd.read_csv('../data/train.csv')

### **Data Understanding**

In [4]:
df.head()

Unnamed: 0,carID,Brand,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners,hasDamage
0,69512,VW,Golf,2016.0,22290,Semi-Auto,28421.0,Petrol,,11.417268,2.0,63.0,4.0,0.0
1,53000,Toyota,Yaris,2019.0,13790,Manual,4589.0,Petrol,145.0,47.9,1.5,50.0,1.0,0.0
2,6366,Audi,Q2,2019.0,24990,Semi-Auto,3624.0,Petrol,145.0,40.9,1.5,56.0,4.0,0.0
3,29021,Ford,FIESTA,2018.0,12500,anual,9102.0,Petrol,145.0,65.7,1.0,50.0,-2.340306,0.0
4,10062,BMW,2 Series,2019.0,22995,Manual,1000.0,Petrol,145.0,42.8,1.5,97.0,3.0,0.0


In [5]:
df.describe(include='all')

Unnamed: 0,carID,Brand,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners,hasDamage
count,75973.0,74452,74456,74482.0,75973.0,74451,74510.0,74462,68069.0,68047.0,74457.0,74449.0,74423.0,74425.0
unique,,72,735,,,40,,34,,,,,,
top,,Ford,Focus,,,Manual,,Petrol,,,,,,
freq,,14808,6353,,,38050,,37995,,,,,,
mean,37986.0,,,2017.096611,16881.889553,,23004.184088,,120.329078,55.152666,1.660136,64.590667,1.99458,0.0
std,21931.660338,,,2.208704,9736.926322,,22129.788366,,65.521176,16.497837,0.573462,21.021065,1.472981,0.0
min,0.0,,,1970.0,450.0,,-58540.574478,,-91.12163,-43.421768,-0.103493,1.638913,-2.34565,0.0
25%,18993.0,,,2016.0,10200.0,,7423.25,,125.0,46.3,1.2,47.0,1.0,0.0
50%,37986.0,,,2017.0,14699.0,,17300.0,,145.0,54.3,1.6,65.0,2.0,0.0
75%,56979.0,,,2019.0,20950.0,,32427.5,,145.0,62.8,2.0,82.0,3.0,0.0


#### **Data Quality**
- **Price**: we have outliers, since we have a really big jump from the 4th Quartile and the Max value and we have a slightly left skewed distribution
- **Mileage**: we have outliers, since we have a really big jump from the 4th Quartile and the Max value and we have a left skewed distribution, we also have negative values, so we need to handle these values
- **Tax**: we have negative values, we need to investigate why these values are negative and If they make any sense in this context
- **MPG**: We have negative values, we need to handle these values
- **Engine Size**: We have negative values, needing to handle them
- **PaintQuality%**: We have values above 100%, we need to check this cases and see if they make any kind of sense
- **previousOwners**: We have negative and float values, we need to handle this cases as well
- **hasDamage**: Seems like a constant value feature, we need to check if thats the case


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75973 entries, 0 to 75972
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   carID           75973 non-null  int64  
 1   Brand           74452 non-null  object 
 2   model           74456 non-null  object 
 3   year            74482 non-null  float64
 4   price           75973 non-null  int64  
 5   transmission    74451 non-null  object 
 6   mileage         74510 non-null  float64
 7   fuelType        74462 non-null  object 
 8   tax             68069 non-null  float64
 9   mpg             68047 non-null  float64
 10  engineSize      74457 non-null  float64
 11  paintQuality%   74449 non-null  float64
 12  previousOwners  74423 non-null  float64
 13  hasDamage       74425 non-null  float64
dtypes: float64(8), int64(2), object(4)
memory usage: 8.1+ MB


In [7]:
df.isna().sum()

carID                0
Brand             1521
model             1517
year              1491
price                0
transmission      1522
mileage           1463
fuelType          1511
tax               7904
mpg               7926
engineSize        1516
paintQuality%     1524
previousOwners    1550
hasDamage         1548
dtype: int64

In [8]:
# check for duplicates in car ID
df['carID'].nunique() == df.shape[0]

True

In [9]:
#check for duplicates in the whole dataframe 
df.duplicated().sum()

np.int64(0)

### **Data Preparation**

#### **Checking for constant value features**

In [10]:
# check for constant values features
constant_features = [col for col in df.columns if df[col].nunique() == 1]
print(f'Constant features: {constant_features}')

Constant features: ['hasDamage']


Since hasDamage is a constant feature, we can safely remove it since theres no meaning if its keeped

In [11]:
df= df.drop(columns='hasDamage')

#### **Dealing with missing values**

First, lets check if we have any entries with every feature missing

In [12]:
# create a mask for rows where all columns except 'carID' and 'price' are NaN
mask = df.drop(columns=['carID', 'price']).isna().all(axis=1)
df_missing_rows = df[mask]

df_missing_rows.size

0

We don't have any row that is completely null

##### **Let's now check the percentages of missing values for each column**

In [13]:
# check the percentage of missing values for each column
missing_percent = df.isna().mean() * 100
missing_percent = missing_percent[missing_percent > 0].sort_values(ascending=False)
df_missing_percent = pd.DataFrame({'column': missing_percent.index, 'missing_percent': missing_percent.values})
df_missing_percent

Unnamed: 0,column,missing_percent
0,mpg,10.432654
1,tax,10.403696
2,previousOwners,2.040198
3,paintQuality%,2.005976
4,transmission,2.003343
5,Brand,2.002027
6,model,1.996762
7,engineSize,1.995446
8,fuelType,1.988864
9,year,1.962539


We have some percentage of missing values but nothing too bad.

#### **Fill Missing Values in Numeric Features**

For the numeric features, we will perform median imputation first, since the proportion of missing data is relatively small (2–10%). 

In [14]:
# fill missing values in numeric features with median
num_cols = ['year', 'mileage', 'tax', 'mpg', 'engineSize', 'paintQuality%', 'previousOwners']
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

#### **Fill Missing Values in Categorical Features**

For the categorical variables we will use a Random Forest Classifier.  
We will train this RFC with the non null entries and use them to predict the null entries.  
We will do this sequentially in this order:  
Brand -> Model -> fuelType -> transmission

In [46]:
def impute_categorical_rfc(df, target_col, ignore_cols=['carID', 'price'], n_estimators=200):
    """
    Impute missing values in a categorical column using a Random Forest Classifier.

    Parameters:
        df (pd.DataFrame): The dataframe containing missing values.
        target_col (str): Column to impute.
        ignore_cols (list): Columns to exclude from predictors.
        n_estimators (int): Number of trees for the Random Forest.

    Returns:
        pd.DataFrame: DataFrame with imputed column.
    """
    #split into rows with and without missing target
    df_notnull = df[df[target_col].notnull()]
    df_null = df[df[target_col].isnull()]

    #print(df_null.head())
    
    # predictors: list of all columns except target and ignored ones
    predictors = df.columns.drop([target_col] + ignore_cols)
    #print(predictors)
    
    #encode predictors, creates a column for every value of the target col and sets True/False for each row
    X_train = pd.get_dummies(df_notnull[predictors], drop_first=True)
    print(X_train)
    y_train = df_notnull[target_col]
    
    # train RFC
    rfc = RandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=-1)
    rfc.fit(X_train, y_train)
    
    # predict missing
    if not df_null.empty:
        X_null = pd.get_dummies(df_null[predictors], drop_first=True)
        # we need to reindex the columns of X_null so we have the same columns as X_train
        # if therese a column in X_train that is not in X_null, we fill it with 0
        # if theres a column in X_null that is not in X_train, it will random guess
        X_null = X_null.reindex(columns=X_train.columns, fill_value=0)
        
        df.loc[df[target_col].isnull(), target_col] = rfc.predict(X_null)
    
    print(f"Imputed '{target_col}' using Random Forest with {len(X_train)} training rows.")
    return df

In [47]:
impute_categorical_rfc(df, 'Brand')

         year  mileage    tax        mpg  engineSize  paintQuality%  \
0      2016.0  28421.0  145.0  11.417268         2.0           63.0   
1      2019.0   4589.0  145.0  47.900000         1.5           50.0   
2      2019.0   3624.0  145.0  40.900000         1.5           56.0   
3      2018.0   9102.0  145.0  65.700000         1.0           50.0   
4      2019.0   1000.0  145.0  42.800000         1.5           97.0   
...       ...      ...    ...        ...         ...            ...   
75968  2015.0  14480.0  125.0  53.300000         2.0           78.0   
75969  2013.0  52134.0  200.0  47.900000         2.0           38.0   
75970  2017.0  11304.0  145.0  67.000000         1.0           57.0   
75971  2015.0  69072.0  125.0  60.100000         2.0           74.0   
75972  2018.0  16709.0  145.0  64.200000         1.1           38.0   

       previousOwners  model_ 1 SERIES  model_ 1 Serie  model_ 1 Series  ...  \
0            4.000000            False           False            F

Unnamed: 0,carID,Brand,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners
0,69512,VW,Golf,2016.0,22290,Semi-Auto,28421.0,Petrol,145.0,11.417268,2.0,63.0,4.000000
1,53000,Toyota,Yaris,2019.0,13790,Manual,4589.0,Petrol,145.0,47.900000,1.5,50.0,1.000000
2,6366,Audi,Q2,2019.0,24990,Semi-Auto,3624.0,Petrol,145.0,40.900000,1.5,56.0,4.000000
3,29021,Ford,FIESTA,2018.0,12500,anual,9102.0,Petrol,145.0,65.700000,1.0,50.0,-2.340306
4,10062,BMW,2 Series,2019.0,22995,Manual,1000.0,Petrol,145.0,42.800000,1.5,97.0,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75968,37194,Mercedes,C Class,2015.0,13498,Manual,14480.0,etrol,125.0,53.300000,2.0,78.0,0.000000
75969,6265,Audi,Q3,2013.0,12495,Semi-Auto,52134.0,Diesel,200.0,47.900000,2.0,38.0,2.000000
75970,54886,Toyota,Aygo,2017.0,8399,Automatic,11304.0,Petrol,145.0,67.000000,1.0,57.0,3.000000
75971,860,Audi,Q3,2015.0,12990,Manual,69072.0,iesel,125.0,60.100000,2.0,74.0,2.000000


In [48]:
'''cat_cols = ['Brand', 'model', 'fuelType', 'transmission']

for col in cat_cols:
    df = impute_categorical_rfc(df, col)
'''

"cat_cols = ['Brand', 'model', 'fuelType', 'transmission']\n\nfor col in cat_cols:\n    df = impute_categorical_rfc(df, col)\n"