In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from rapidfuzz import process, fuzz
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import mutual_info_regression
from scipy import stats
from joblib import load
import importlib
import utils
importlib.reload(utils)
import os

In [8]:
# Reference tables: the catalogue of known brand/model names (used by
# preprocess_df to correct spelling) and the average price per model.
df_models = pd.read_csv(filepath_or_buffer='../../data/all_car_models.csv')
df_avg_price = pd.read_csv(filepath_or_buffer='../../data/avg_model_prices.csv')


In [9]:
# Raw, uncleaned test listings.
df_test = pd.read_csv(filepath_or_buffer='../../data/test.csv')

In [10]:
# Quick look at the first five raw rows before any cleaning.
df_test.head(n=5)

Unnamed: 0,carID,Brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners,hasDamage
0,89856,Hyundai,I30,2022.878006,Automatic,30700.0,petrol,205.0,41.5,1.6,61.0,3.0,0.0
1,106581,VW,Tiguan,2017.0,Semi-Auto,-48190.655673,Petrol,150.0,38.2,2.0,60.0,2.0,0.0
2,80886,BMW,2 Series,2016.0,Automatic,36792.0,Petrol,125.0,51.4,1.5,94.0,2.0,0.0
3,100174,Opel,Grandland X,2019.0,Manual,5533.0,Petrol,145.0,44.1,1.2,77.0,1.0,0.0
4,81376,BMW,1 Series,2019.0,Semi-Auto,9058.0,Diesel,150.0,51.4,2.0,45.0,4.0,0.0


In [11]:
def preprocess_df(df, reference_year=None, verbose=True):
    """Clean a raw car-listing frame and return the model-ready feature frame.

    The steps run in this order:

    1. drop ``hasDamage`` (if present) and blank out engine sizes <= 0.99;
    2. fill numeric gaps with ``utils.impute_numerical`` and categorical gaps
       with ``utils.predict_missing_test``;
    3. cap ``paintQuality%`` at 100, take absolute values of the columns that
       must not be negative, and round the numeric columns;
    4. correct brand/model spelling against the notebook-level ``df_models``
       table and harmonize brand names;
    5. normalise and fuzzy-match ``fuelType`` / ``transmission`` against their
       canonical value lists, folding electric and hybrid into ``'other'``;
    6. replace ``year`` by ``age``; turn transmission ``'other'`` into NaN and
       impute it again;
    7. drop ``previousOwners``, ``carID``, ``paintQuality%`` and ``tax`` and
       rename ``engineSize`` to ``engine_size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings containing ``carID``, ``Brand``, ``model``, ``year``,
        ``transmission``, ``mileage``, ``fuelType``, ``tax``, ``mpg``,
        ``engineSize``, ``paintQuality%`` and ``previousOwners``
        (``hasDamage`` is optional). The caller's frame is never modified.
    reference_year : int, optional
        Year that ``age`` is measured from (``age = reference_year - year``).
        Defaults to the current calendar year; pass a fixed value to make the
        output reproducible.
    verbose : bool, default True
        When true, print the first rows of the frame just before the final
        column drop.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Notes
    -----
    Reads the notebook-level ``df_models`` table and (re)writes its
    ``brand_clean`` / ``model_clean`` helper columns on every call.
    """
    num_cols = ['year', 'mileage', 'tax', 'mpg', 'engineSize', 'paintQuality%', 'previousOwners']
    cat_cols = ['Brand', 'model', 'fuelType', 'transmission']
    columns_negative = ["previousOwners", "mileage", "mpg", "engineSize", "tax"]
    int_cols = ['mileage', 'paintQuality%', 'year', 'previousOwners']
    # Canonical values of fuel type and transmission used for fuzzy matching
    canonical_fuels = ['petrol', 'diesel', 'hybrid', 'electric', 'other']
    canonical_transmissions = ['manual', 'automatic', 'semi-auto', 'other']

    if reference_year is None:
        reference_year = datetime.now().year

    # Work on a private copy: every write below would otherwise land in the
    # caller's frame whenever 'hasDamage' is absent and no drop-copy is made.
    df = df.copy()
    if 'hasDamage' in df.columns:
        df = df.drop(columns='hasDamage')

    # Engine sizes <= 0.99 are treated as invalid and blanked out before the
    # numeric imputation step runs.
    df.loc[df['engineSize'] <= 0.99, 'engineSize'] = np.nan

    # Impute numerical, then categorical, missing values.
    # NOTE(review): negative values are only made positive after this step, so
    # they are still present while imputing -- confirm this matches training.
    df = utils.impute_numerical(df, num_cols)
    for col in cat_cols:
        df = utils.predict_missing_test(df, target_col=col)

    # Paint quality is a percentage: cap it at 100.
    df.loc[df['paintQuality%'] > 100, 'paintQuality%'] = 100

    # These quantities cannot be negative; keep the magnitude.
    for col in columns_negative:
        df[col] = df[col].abs()

    # Fix decimals: whole-number columns become nullable integers.
    df[int_cols] = df[int_cols].round().astype('Int64')
    df[['tax', 'mpg']] = df[['tax', 'mpg']].round(2)
    df[['engineSize']] = df[['engineSize']].round(1)

    # Standardize brand/model text on both the listings and the reference
    # table so the correction helpers compare like with like.
    df['brand_clean'] = df['Brand'].apply(utils.clean_text)
    df['model_clean'] = df['model'].apply(utils.clean_text)
    df_models['brand_clean'] = df_models['brand'].apply(utils.clean_text)
    df_models['model_clean'] = df_models['model'].apply(utils.clean_text)

    # Correct typos row by row; the model is corrected before the brand.
    df['model_fixed'] = df.apply(utils.correct_model, axis=1, df_models=df_models)
    df['brand_fixed'] = df.apply(utils.correct_brand, axis=1, df_models=df_models)

    # Replace the raw brand/model columns with the corrected ones.
    df = (
        df.drop(columns=['brand_clean', 'model_clean', 'Brand', 'model'])
        .rename(columns={'brand_fixed': 'brand', 'model_fixed': 'model'})
    )

    # Harmonize brand
    df = utils.harmonize_brand(df)

    # Normalize fuel type / transmission strings before fuzzy matching.
    df['fuel_type_clean'] = df['fuelType'].astype(str).str.strip().str.lower()
    df['transmission_clean'] = df['transmission'].astype(str).str.strip().str.lower()

    df['fuel_type_fixed'] = utils.fuzzy_match_column(df['fuel_type_clean'], canonical_fuels)
    df['transmission_fixed'] = utils.fuzzy_match_column(df['transmission_clean'], canonical_transmissions)

    # Replace the raw columns with the matched ones.
    df = (
        df.drop(columns=['fuelType', 'fuel_type_clean', 'transmission', 'transmission_clean'])
        .rename(columns={'fuel_type_fixed': 'fuel_type', 'transmission_fixed': 'transmission'})
    )

    # previousOwners is not kept as a feature.
    df = df.drop(columns=['previousOwners'])

    # Electric and hybrid are folded into the 'other' fuel type.
    df['fuel_type'] = df['fuel_type'].replace(['electric', 'hybrid'], 'other')

    # Turn the registration year into the age of the car.
    df['age'] = reference_year - df['year']
    df = df.drop(columns='year')

    # Treat transmission 'other' as missing and impute it again.
    df['transmission'] = df['transmission'].replace('other', np.nan)
    df = utils.predict_missing_test(df, target_col='transmission')

    if verbose:
        print(df.head())

    # Drop noise columns and use the snake_case name for engine size.
    df = df.drop(columns=['carID', 'paintQuality%', 'tax'])
    df = df.rename(columns={'engineSize': 'engine_size'})

    # Encoding is intentionally not applied here (utils.encode_test is unused).

    return df

In [12]:
# Overwrites the raw frame with its cleaned version. Re-running this cell
# without reloading df_test fails, because preprocess_df expects the raw
# column names (e.g. 'engineSize') that it renames or drops.
df_test = preprocess_df(df=df_test)

Test: Imputed 'Brand' with 649 missing values
Test: Imputed 'model' with 650 missing values
Test: Imputed 'fuelType' with 656 missing values
Test: Imputed 'transmission' with 623 missing values
Test: Imputed 'transmission' with 357 missing values
    carID  mileage    tax   mpg  engineSize  paintQuality%        model  \
0   89856    30700  205.0  41.5         1.6             61          i30   
1  106581    48191  150.0  38.2         2.0             60       tiguan   
2   80886    36792  125.0  51.4         1.5             94     2 series   
3  100174     5533  145.0  44.1         1.2             77  grandland x   
4   81376     9058  150.0  51.4         2.0             45     1 series   

        brand fuel_type transmission  age  
0     hyundai    petrol    automatic    2  
1  volkswagen    petrol    semi-auto    8  
2         bmw    petrol    automatic    9  
3        opel    petrol       manual    6  
4         bmw    diesel    semi-auto    6  


In [13]:
# Attach the average price of each (brand, model) pair as the `avg_price` feature.
avg_price_path = os.path.join('..', '..', 'data', 'avg_model_prices.csv')
merge_keys = ['brand', 'model']

# Discard an `avg_price` left over from an earlier run of this cell: merging a
# second time would otherwise yield avg_price_x / avg_price_y and the fillna
# below would raise a KeyError.
df_test = df_test.drop(columns='avg_price', errors='ignore')

avg_lookup = pd.read_csv(avg_price_path) if os.path.exists(avg_price_path) else None
lookup_usable = (
    avg_lookup is not None
    and {'brand', 'model', 'avg_price'}.issubset(avg_lookup.columns)
)

if lookup_usable:
    # Keep only the keys and the feature, one row per key, so the left join
    # can neither add unrelated columns nor multiply test rows.
    avg_lookup = avg_lookup[merge_keys + ['avg_price']].drop_duplicates(subset=merge_keys)
    df_test = df_test.merge(avg_lookup, on=merge_keys, how='left')
    # (brand, model) pairs absent from the lookup get 0.
    df_test['avg_price'] = df_test['avg_price'].fillna(0)
else:
    # Lookup file missing or without the expected columns: constant fallback.
    df_test['avg_price'] = 0


In [14]:
# Persist the processed test set next to the notebook, without the index column.
df_test.to_csv(path_or_buf='test_processed.csv', index=False)