In [333]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

In [334]:
from matplotlib import rcParams
rcParams["figure.figsize"] = 20, 6
sns.set(font_scale=1.5)

In [335]:
DF_PATH = os.getcwd() + "/csv/used_cars.csv"
df = pd.read_csv(DF_PATH, sep=";", encoding = "ISO-8859-1")
df.head()

Unnamed: 0,name,year,mileage,engine_power_kW,engine_power_hp,fuel,consumption,ecology,gearbox,price,currency,country,capacity,other,engine_size,nr_of_accidents,color_exterior,color_interior
0,Daewoo Rezzo,2003,200000 km,89 kW,121 hp,Gasoline,9 l/100 km,236 g CO2/km,Manual,19900,EUR,Germany,,,,,,
1,Volkswagen Golf,1994,222275 km,55 kW,75 hp,Gasoline,0 l/100 km,0 g CO2/km,Manual,20000,EUR,Germany,,,,,,
2,Ford Fiesta,2001,153000 km,44 kW,60 hp,Gasoline,7.3 l/100 km,171 g CO2/km,Manual,29000,EUR,Germany,,,,,,
3,Volkswagen Polo,1999,154000 km,44 kW,60 hp,Gasoline,6.3 l/100 km,151 g CO2/km,Manual,35000,EUR,Germany,,,,,,
4,Nissan Micra,1998,140000 km,40 kW,54 hp,Gasoline,6 l/100 km,152 g CO2/km,Manual,38000,EUR,Germany,,,,,,


In [336]:
df.shape

(1816, 18)

Lowercase cars' `name`.

In [337]:
df.name = df.name.str.lower()

Fill missing years with 0 and convert to int.

In [338]:
df.loc[df.year == "-", ["year"]] = 0
df.loc[df.year == '0', ["year"]] = 0

In [339]:
df.year = df.year.astype(int)

Substitute missing values for `ecology` and `consumption`  with nan values.

In [340]:
df = df[df.engine_power_kW != "#NAZWA?"]
df.loc[df.ecology == '-/- (CO2/km)', ["ecology"]] = np.nan
df.loc[df.consumption == '-/- (l/100 km)', ["consumption"]] = np.nan

Convert `engine_power_kW`, `engine_power_hp`, `consumption` and `ecology` to float.

Rename columns.

In [341]:
df.engine_power_kW = df.engine_power_kW.str.replace(" kW", "", regex=False).astype(float)
df.engine_power_hp = df.engine_power_hp.str.replace(" hp", "", regex=False).astype(float)
df.ecology = df.ecology.str.replace(" g CO2/km", "", regex=False).astype(float)
df.consumption = df.consumption.str.replace(" l/100 km", "", regex=False).astype(float)
df.rename(columns={"ecology": "ecology_(gCO2/km)", "consumption":"consumption_(l/100 km)", "fuel":"fuel_type"}, inplace=True)

Convert miles in `mileage` to km. Convert `mileage` to float.

In [342]:
miles_to_km = 1.609344

miles_df = df[df.mileage.str.contains("mi")].mileage.str.replace(r"[a-z]+", "")
miles_df = round(miles_df.str.replace(',', "", regex=False).astype(float) * miles_to_km, 0)

km_df = df[~df.mileage.str.contains("mi")].mileage.str.replace(r"[a-z]+", "")
km_df = km_df.str.replace(',', "", regex=False).astype(float)

contains_miles = df.mileage.str.contains("mi")
df.loc[contains_miles, ['mileage']] = miles_df

contains_miles = ~contains_miles
df.loc[contains_miles, ['mileage']] = km_df

Convert all prices from original currencies to EUR.

In [343]:
from currency_converter import CurrencyConverter
c = CurrencyConverter()
currencies = df.currency.unique()
# currencies = [c for c in currencies if c != "EUR"]

df.price = df.price.str.replace(",", ".").astype(float)

for currency in currencies:
    price = df[df.currency == currency].price
    curr = [round(c.convert(p, currency, "EUR"), 2) for p in (list(price))]
    df.loc[df.currency == currency, "price_eur"] = curr

df = df.drop(columns=["price", "currency"])

Combine `capacity` and `engine_size` into one column, clean and convert to int.

In [344]:
df.engine_size = df.engine_size.combine_first(df.capacity)

In [345]:
df.engine_size = df.engine_size.str.replace(" cc", "", regex=False)
df.engine_size = df.engine_size.str.replace(" cm33", "", regex=False)
df.engine_size = df.engine_size.str.replace(",", "", regex=False)
df.engine_size = df.engine_size.fillna(value=0)
df.engine_size = df.engine_size.astype(int)

In [346]:
df.drop(columns=["capacity", 'other'], inplace=True)

Change NaN in object columns to `"unknown"`.

In [347]:
df.fuel_type.fillna("unknown", inplace=True)
df.gearbox.fillna("unknown", inplace=True)
df.color_exterior.fillna("unknown", inplace=True)
df.color_interior.fillna("unknown", inplace=True)

In [348]:
df.dtypes

name                       object
year                        int32
mileage                    object
engine_power_kW           float64
engine_power_hp           float64
fuel_type                  object
consumption_(l/100 km)    float64
ecology_(gCO2/km)         float64
gearbox                    object
country                    object
engine_size                 int32
nr_of_accidents           float64
color_exterior             object
color_interior             object
price_eur                 float64
dtype: object

Add, analyze and clean column `mark`.

In [349]:
marks = []
for i in range(len(df)):
    name = df.iloc[i][0]
    name = name.lower().replace(".", "").split(" ")
    for word in name:
        if not word.isdigit():
            marks.append(word)
            break

In [350]:
df.insert(1, 'mark', marks)

In [351]:
incorrect_vals = ['very', 'low', 'green', 'lag', 'new', 'case', 'wiola', 'land',
                 'door', 'reluctant', 'indian', 'tmt', 'hamm', 'atlas']man
# non_car_manufacturer = ['jcb', 'kverneland' goldhofer laverda kobelco  bomag fendt
# merlo  kubota bobcat manitou komatsu caterpillar terex takeuchi john wacker case
# ] carnehl vaderstad skuter schaffer iveco
# delete wiola
# scion -> toyota
# ds -> citroen?


In [331]:
# df.loc[df.mark.isin(incorrect_vals)]["name"]
df.loc[df.mark.str.contains("cooper") & df.name.str.contains("mini"), 'mark'] = "cooper"
df.loc[df.mark == "land", 'mark'] = "rover"
df.loc[df.mark == "door", 'mark'] = "rover"


# df.loc[df.mark == "mini"]["name"]

In [374]:
df.loc[df.mark == "case"]["name"]

744     case ih ct5080  pierwszy wâ³aâciciel / bardzo...
775                    case 888  koparka koâ³owa case 888
850                                       case ih jxu 105
886                                       case ih mxu 135
892                                      case ih jx95 4wd
909                                    case ih maxxum 125
917                                  case ih farmall 105u
1016                                            case 821e
Name: name, dtype: object

In [303]:
df.mark.unique()

array(['daewoo', 'volkswagen', 'ford', 'nissan', 'opel', 'mitsubishi',
       'renault', 'audi', 'fiat', 'bmw', 'saab', 'skoda', 'mercedes',
       'lancia', 'seat', 'toyota', 'volvo', 'peugeot', 'mazda', 'citroen',
       'kia', 'land', 'chevrolet', 'smart', 'mini', 'hyundai', 'dacia',
       'suzuki', 'honda', 'iveco', 'daihatsu', 'alfa', 'porsche',
       'bentley', 'maserati', 'lexus', 'jeep', 'maybach', 'infiniti',
       'triumph', 'wacker', 'case', 'vaderstad', 'skuter', 'schaffer',
       'wiola', 'chrysler', 'jcb', 'doosan', 'caterpillar', 'terex',
       'komatsu', 'man', 'carnehl', 'massey', 'claas', 'takeuchi', 'john',
       'manitou', 'bobcat', 'kubota', 'mccormick', 'merlo', 'new',
       'fendt', 'hitachi', 'bomag', 'atlas', 'hamm', 'kobelco',
       'lamborghini', 'laverda', 'ssangyong', 'mg', 'tmt', 'goldhofer',
       'jaguar', 'kymco', 'lag', 'stokota', 'yamaha', 'cf', 'can-am',
       'indian', 'kverneland', 'green', 'low', 'vauxhall', 'mgtf', 'very',
       'reluc

In [302]:
df.mark.value_counts()[-10:]

ferrari        1
lamborghini    1
very           1
low            1
takeuchi       1
mccormick      1
schaffer       1
late           1
green          1
lag            1
Name: mark, dtype: int64

In [285]:
df.loc[df.mark.str.contains("oda"), 'mark'] = "skoda"
df.loc[df.mark.str.contains("skodia"), 'mark'] = "skoda"
df.loc[df.mark.str.contains("citr"), 'mark'] = "citroen"
df.loc[df.mark.str.contains("mercedes-benz"), 'mark'] = "mercedes"
df.loc[df.mark.str.contains("vw"), 'mark'] = "volkswagen"
df.loc[df.mark.str.contains("derstad"), 'mark'] = "vaderstad"
# df.loc[df.mark.str.contains("vw"), 'mark'] = "volkswagen"

In [250]:
df.to_csv("csv/new_csv/cleaned_data.csv", index=False)