In [1]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import re
from scipy import stats
from datetime import datetime

# Default size, in inches, for every figure created later in the notebook.
matplotlib.rcParams.update({'figure.figsize': (10, 7)})

In [2]:
class DataLoader:
    """Read a parquet file and normalise its columns.

    ``load_parquet`` runs four steps in order: read the file at ``path``,
    replace ``None`` with ``np.nan``, rename columns, and drop the columns
    listed in ``_drop_unused_columns``.
    """

    def __init__(self, path):
        # Location handed to ``pd.read_parquet`` by ``load_parquet``.
        self.path = path

    @staticmethod
    def _fill_empty_with_nan(df):
        """Return ``df`` with every ``None`` value replaced by ``np.nan``."""
        none_to_nan = {None: np.nan}
        return df.replace(none_to_nan)

    @staticmethod
    def _rename_columns(df):
        """Return ``df`` with dashes in column names turned into underscores.

        The misspelled column ``dellerswebsite`` is also renamed to
        ``sellerwebsite``.
        """
        renames = {name: name.replace('-', '_') for name in df.columns if "-" in name}
        renames['dellerswebsite'] = 'sellerwebsite'
        return df.rename(columns=renames)

    @staticmethod
    def _drop_unused_columns(df):
        """Return ``df`` without the columns listed below.

        Every listed column must be present; ``DataFrame.drop`` raises
        ``KeyError`` otherwise.
        """
        unused = [
            # 'model' is intentionally left out of this list, so it is kept.
            'trim',
            'body_type',
            'generation',
            'manufactured_year',
            'service_history',
            'vrm',
            'co2Emissions',
            'adverttitle',
            'advert',
            'mainimage',
            'images',
            'sellerpostcode',
            'sellerwebsite',
            'year',
            'todaysdate',
            'owners',
            'priceindicators',
            'mileage',
        ]
        return df.drop(columns=unused)

    def load_parquet(self, use_nullable=False):
        """Load the parquet file at ``self.path`` and return the tidied frame.

        ``use_nullable`` is forwarded to ``pd.read_parquet`` as
        ``use_nullable_dtypes``.
        """
        # NOTE(review): newer pandas releases deprecate ``use_nullable_dtypes``
        # in favour of ``dtype_backend`` - confirm against the installed version.
        raw = pd.read_parquet(self.path, use_nullable_dtypes=use_nullable)
        filled = self._fill_empty_with_nan(raw)
        renamed = self._rename_columns(filled)
        return self._drop_unused_columns(renamed)

In [3]:
# Read the listings file through the DataLoader pipeline (default, non-nullable dtypes).
df = DataLoader("car_data.parquet.gzip").load_parquet(use_nullable=False)

In [4]:
class DataCleaner():
    """Clean the 'price' column of a frame and trim price outliers."""

    def __init__(self, df):
        # Frame to clean; ``clean_data`` overwrites its 'price' column in place.
        self.df = df

    @staticmethod
    def clean_price(x):
        """Convert a price string such as ``'£12,500'`` to an ``int``.

        Currency symbols (£, $, €) and thousands separators are stripped
        before parsing. This is a best-effort conversion: a non-string value
        is returned unchanged, and a string that still cannot be parsed is
        returned with the symbols and commas removed.
        """
        try:
            for symbol in ('£', '$', '€'):
                x = x.replace(symbol, '')
            x = x.replace(',', '')
            return int(x)
        except (AttributeError, TypeError, ValueError):
            # AttributeError: x has no ``replace`` (already numeric, None, NaN).
            # TypeError / ValueError: ``int`` could not parse what was left.
            return x

    def remove_skewed_outliers(self, col, multiplier=1.5):
        """Return the rows of ``self.df`` whose ``col`` value is not an outlier.

        A row is kept when its value lies inside
        ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` (bounds inclusive).
        For a Gaussian, the default multiplier of 1.5 puts the fences at
        roughly +-2.7 standard deviations.

        Parameters
        ----------
        col : column label of the numeric column to filter on.
        multiplier : width of the fences in units of the IQR.
        """
        values = np.asarray(self.df[col])
        q3, q1 = np.percentile(values, [75, 25])
        iqr_const = (q3 - q1) * multiplier
        upper_bound = q3 + iqr_const
        lower_bound = q1 - iqr_const

        # Keep the inliers. Requiring a value to be below the lower bound AND
        # above the upper bound at once can never hold and drops every row.
        mask = (values >= lower_bound) & (values <= upper_bound)
        # Index the instance's own frame rather than a notebook-level global.
        return self.df[mask]

    def clean_data(self):
        """Parse the 'price' column, then return the frame without price outliers."""
        ## Clean target: Price
        self.df['price'] = [self.clean_price(x) for x in self.df['price']]
        return self.remove_skewed_outliers('price')

In [5]:
# Parse every raw price through DataCleaner.clean_price and keep a handle on the column.
data_cleaner = DataCleaner(df)
df['price'] = list(map(data_cleaner.clean_price, df['price']))
price = df['price']

In [6]:
# Outlier fences for price: 1.5 interquartile ranges beyond the quartiles.
q1, q3 = np.percentile(price, [25, 75])
iqr_const = 1.5 * (q3 - q1)
lower_bound = q1 - iqr_const
upper_bound = q3 + iqr_const

In [7]:
# Report the quartiles and the resulting outlier fences, one per line.
print(
    f"Q1: {q1}",
    f"Q3: {q3}",
    f"IQR: {q3 - q1}",
    f"lower: {lower_bound}",
    f"upper: {upper_bound}",
    sep="\n",
)

Q1: 6995.0
Q3: 22491.0
IQR: 15496.0
lower: -16249.0
upper: 45735.0


In [8]:
# Remember the row count before filtering so the removed fraction can be reported.
og_len = len(df)
# A row is kept unless its price falls strictly outside the fences.
mask = [not (x < lower_bound or x > upper_bound) for x in df['price']]
df = df[mask]

In [9]:
# Fraction of the original rows discarded by the outlier filter.
rows_removed = og_len - df.shape[0]
rows_removed / og_len

0.06393830133535566