Practice cleaning data from German car sale listings.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw listings file, decoding it as Latin-1.
autos = pd.read_csv(filepath_or_buffer='autos.csv', encoding='Latin-1')

In [3]:
# Print column dtypes and non-null counts, then preview the first five rows.
autos.info()
autos.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
dateCrawled            50000 non-null object
name                   50000 non-null object
seller                 50000 non-null object
offerType              50000 non-null object
price                  50000 non-null object
abtest                 50000 non-null object
vehicleType            44905 non-null object
yearOfRegistration     50000 non-null int64
gearbox                47320 non-null object
powerPS                50000 non-null int64
model                  47242 non-null object
odometer               50000 non-null object
monthOfRegistration    50000 non-null int64
fuelType               45518 non-null object
brand                  50000 non-null object
notRepairedDamage      40171 non-null object
dateCreated            50000 non-null object
nrOfPictures           50000 non-null int64
postalCode             50000 non-null int64
lastSeen               50000 non-null obj

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [4]:
# Update column names to succinct snake_case.
# `columns=` names the axis being renamed explicitly, and the result is
# rebound instead of using inplace=True.
snake_case_columns = {
    'dateCrawled': 'date_crawled',
    'offerType': 'offer_type',
    'vehicleType': 'vehicle_type',
    'yearOfRegistration': 'registration_year',
    'powerPS': 'power_ps',
    'monthOfRegistration': 'registration_month',
    'fuelType': 'fuel_type',
    'notRepairedDamage': 'unrepaired_damage',
    'dateCreated': 'ad_created',
    'nrOfPictures': 'num_of_pictures',
    'postalCode': 'postal_code',
    'lastSeen': 'last_seen',
}
autos = autos.rename(columns=snake_case_columns)


In [5]:
# Examine stats on columns to look for lack of structure.
# info() prints its report itself, whereas describe() is only rendered when it
# is the last expression of the cell, so it has to come last to be shown.
autos.info()
autos.describe(include='all')
# Observations:
# - cols to convert to int/float: price, odometer
# - num_of_pictures: no values (all zero)
# - seller and offer_type: all the same
# - unrepaired_damage: contains German ('nein')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
date_crawled          50000 non-null object
name                  50000 non-null object
seller                50000 non-null object
offer_type            50000 non-null object
price                 50000 non-null object
abtest                50000 non-null object
vehicle_type          44905 non-null object
registration_year     50000 non-null int64
gearbox               47320 non-null object
power_ps              50000 non-null int64
model                 47242 non-null object
odometer              50000 non-null object
registration_month    50000 non-null int64
fuel_type             45518 non-null object
brand                 50000 non-null object
unrepaired_damage     40171 non-null object
ad_created            50000 non-null object
num_of_pictures       50000 non-null int64
postal_code           50000 non-null int64
last_seen             50000 non-null object
dtypes: int64(5)

In [6]:
# Change price and odometer values to numeric.
def _strip_to_int(text_column, unit):
    """Return `text_column` as integers after removing formatting characters.

    Every occurrence of `unit`, "," and "'" is removed from each string and
    the result is cast to int. Patterns are matched literally (regex=False):
    "$" is also the regex end-of-string anchor, so without the flag its
    meaning would depend on the installed pandas version's default.
    """
    cleaned = text_column
    for literal in (unit, ',', "'"):
        cleaned = cleaned.str.replace(literal, '', regex=False)
    return cleaned.astype(int)


# The new column names record the unit each number is expressed in.
autos = autos.rename(columns={'price': 'price_dollar', 'odometer': 'odometer_km'})
autos['price_dollar'] = _strip_to_int(autos['price_dollar'], '$')
autos['odometer_km'] = _strip_to_int(autos['odometer_km'], 'km')

In [7]:
# Remove cols with no values or little relevance.
# drop() returns a new frame, so the result must be assigned back for the
# columns to actually be removed. errors='ignore' keeps the cell re-runnable
# once they are gone.
autos = autos.drop(columns=['seller', 'offer_type', 'num_of_pictures'], errors='ignore')
autos.head(10)

Unnamed: 0,date_crawled,name,price_dollar,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,5000,control,bus,2004,manuell,158,andere,150000,3,lpg,peugeot,nein,2016-03-26 00:00:00,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,8500,control,limousine,1997,automatik,286,7er,150000,6,benzin,bmw,nein,2016-04-04 00:00:00,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,8990,test,limousine,2009,manuell,102,golf,70000,7,benzin,volkswagen,nein,2016-03-26 00:00:00,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,4350,control,kleinwagen,2007,automatik,71,fortwo,70000,6,benzin,smart,nein,2016-03-12 00:00:00,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,1350,test,kombi,2003,manuell,0,focus,150000,7,benzin,ford,nein,2016-04-01 00:00:00,39218,2016-04-01 14:38:50
5,2016-03-21 13:47:45,Chrysler_Grand_Voyager_2.8_CRD_Aut.Limited_Sto...,7900,test,bus,2006,automatik,150,voyager,150000,4,diesel,chrysler,,2016-03-21 00:00:00,22962,2016-04-06 09:45:21
6,2016-03-20 17:55:21,VW_Golf_III_GT_Special_Electronic_Green_Metall...,300,test,limousine,1995,manuell,90,golf,150000,8,benzin,volkswagen,,2016-03-20 00:00:00,31535,2016-03-23 02:48:59
7,2016-03-16 18:55:19,Golf_IV_1.9_TDI_90PS,1990,control,limousine,1998,manuell,90,golf,150000,12,diesel,volkswagen,nein,2016-03-16 00:00:00,53474,2016-04-07 03:17:32
8,2016-03-22 16:51:34,Seat_Arosa,250,test,,2000,manuell,0,arosa,150000,10,,seat,nein,2016-03-22 00:00:00,7426,2016-03-26 18:18:10
9,2016-03-16 13:47:02,Renault_Megane_Scenic_1.6e_RT_Klimaanlage,590,control,bus,1997,manuell,90,megane,150000,7,benzin,renault,nein,2016-03-16 00:00:00,15749,2016-04-06 10:46:35


In [8]:
# Examine price and odometer more closely for quality: summary statistics
# of the numeric price column.
price_summary = autos['price_dollar'].describe()
price_summary

count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price_dollar, dtype: float64

In [9]:
# Five most frequent price values.
autos['price_dollar'].value_counts().iloc[:5]

0       1421
500      781
1500     734
2500     643
1000     639
Name: price_dollar, dtype: int64

In [10]:
# Summary statistics of the numeric odometer column.
odometer_summary = autos['odometer_km'].describe()
odometer_summary

count     50000.000000
mean     125732.700000
std       40042.211706
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: odometer_km, dtype: float64

In [11]:
# Five most frequent odometer readings.
autos['odometer_km'].value_counts().iloc[:5]

150000    32424
125000     5170
100000     2169
90000      1757
80000      1436
Name: odometer_km, dtype: int64

Examining the date columns

In [12]:
# Preview the first five rows of the three timestamp columns.
date_columns = ['date_crawled', 'ad_created', 'last_seen']
autos[date_columns].head()

Unnamed: 0,date_crawled,ad_created,last_seen
0,2016-03-26 17:47:46,2016-03-26 00:00:00,2016-04-06 06:45:54
1,2016-04-04 13:38:56,2016-04-04 00:00:00,2016-04-06 14:45:08
2,2016-03-26 18:57:24,2016-03-26 00:00:00,2016-04-06 20:15:37
3,2016-03-12 16:58:10,2016-03-12 00:00:00,2016-03-15 03:16:28
4,2016-04-01 14:38:50,2016-04-01 00:00:00,2016-04-01 14:38:50


In [13]:
# date_crawled distribution: share of rows per calendar day.
# The first 10 characters of each timestamp string are its YYYY-MM-DD part.
crawled_day = autos['date_crawled'].str[:10]
crawled_day.value_counts(normalize=True, dropna=False).sort_index()

2016-03-05    0.02538
2016-03-06    0.01394
2016-03-07    0.03596
2016-03-08    0.03330
2016-03-09    0.03322
2016-03-10    0.03212
2016-03-11    0.03248
2016-03-12    0.03678
2016-03-13    0.01556
2016-03-14    0.03662
2016-03-15    0.03398
2016-03-16    0.02950
2016-03-17    0.03152
2016-03-18    0.01306
2016-03-19    0.03490
2016-03-20    0.03782
2016-03-21    0.03752
2016-03-22    0.03294
2016-03-23    0.03238
2016-03-24    0.02910
2016-03-25    0.03174
2016-03-26    0.03248
2016-03-27    0.03104
2016-03-28    0.03484
2016-03-29    0.03418
2016-03-30    0.03362
2016-03-31    0.03192
2016-04-01    0.03380
2016-04-02    0.03540
2016-04-03    0.03868
2016-04-04    0.03652
2016-04-05    0.01310
2016-04-06    0.00318
2016-04-07    0.00142
Name: date_crawled, dtype: float64

In [14]:
# ad_created distribution: share of rows per calendar day.
# The first 10 characters of each timestamp string are its YYYY-MM-DD part.
created_day = autos['ad_created'].str[:10]
created_day.value_counts(normalize=True, dropna=False).sort_index()

2015-06-11    0.00002
2015-08-10    0.00002
2015-09-09    0.00002
2015-11-10    0.00002
2015-12-05    0.00002
2015-12-30    0.00002
2016-01-03    0.00002
2016-01-07    0.00002
2016-01-10    0.00004
2016-01-13    0.00002
2016-01-14    0.00002
2016-01-16    0.00002
2016-01-22    0.00002
2016-01-27    0.00006
2016-01-29    0.00002
2016-02-01    0.00002
2016-02-02    0.00004
2016-02-05    0.00004
2016-02-07    0.00002
2016-02-08    0.00002
2016-02-09    0.00004
2016-02-11    0.00002
2016-02-12    0.00006
2016-02-14    0.00004
2016-02-16    0.00002
2016-02-17    0.00002
2016-02-18    0.00004
2016-02-19    0.00006
2016-02-20    0.00004
2016-02-21    0.00006
               ...   
2016-03-09    0.03324
2016-03-10    0.03186
2016-03-11    0.03278
2016-03-12    0.03662
2016-03-13    0.01692
2016-03-14    0.03522
2016-03-15    0.03374
2016-03-16    0.03000
2016-03-17    0.03120
2016-03-18    0.01372
2016-03-19    0.03384
2016-03-20    0.03786
2016-03-21    0.03772
2016-03-22    0.03280
2016-03-23

In [15]:
# last_seen distribution: share of rows per calendar day.
# The first 10 characters of each timestamp string are its YYYY-MM-DD part.
last_seen_day = autos['last_seen'].str[:10]
last_seen_day.value_counts(normalize=True, dropna=False).sort_index()

2016-03-05    0.00108
2016-03-06    0.00442
2016-03-07    0.00536
2016-03-08    0.00760
2016-03-09    0.00986
2016-03-10    0.01076
2016-03-11    0.01252
2016-03-12    0.02382
2016-03-13    0.00898
2016-03-14    0.01280
2016-03-15    0.01588
2016-03-16    0.01644
2016-03-17    0.02792
2016-03-18    0.00742
2016-03-19    0.01574
2016-03-20    0.02070
2016-03-21    0.02074
2016-03-22    0.02158
2016-03-23    0.01858
2016-03-24    0.01956
2016-03-25    0.01920
2016-03-26    0.01696
2016-03-27    0.01602
2016-03-28    0.02086
2016-03-29    0.02234
2016-03-30    0.02484
2016-03-31    0.02384
2016-04-01    0.02310
2016-04-02    0.02490
2016-04-03    0.02536
2016-04-04    0.02462
2016-04-05    0.12428
2016-04-06    0.22100
2016-04-07    0.13092
Name: last_seen, dtype: float64

In [16]:
# registration_year characteristics: summary statistics to spot
# implausible years.
registration_year_summary = autos['registration_year'].describe()
registration_year_summary

count    50000.000000
mean      2005.073280
std        105.712813
min       1000.000000
25%       1999.000000
50%       2003.000000
75%       2008.000000
max       9999.000000
Name: registration_year, dtype: float64

In [17]:
# Some values in registration_year were greater than 2016 or less than 1900.
# Keep only the rows inside that range (both bounds inclusive), using a single
# vectorised mask instead of dropping rows while looping over every value.
MIN_REGISTRATION_YEAR = 1900
MAX_REGISTRATION_YEAR = 2016

plausible_year = autos['registration_year'].between(MIN_REGISTRATION_YEAR,
                                                    MAX_REGISTRATION_YEAR)
# .copy() gives an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning.
autos = autos.loc[plausible_year].copy()

In [18]:
# Summary statistics of registration_year for the current frame.
autos.loc[:, 'registration_year'].describe()

count    48028.00000
mean      2002.80351
std          7.31085
min       1910.00000
25%       1999.00000
50%       2003.00000
75%       2008.00000
max       2016.00000
Name: registration_year, dtype: float64

In [19]:
# Distinct brand values, as a starting point for aggregating by brand.
unique_brands = autos.loc[:, 'brand'].unique()
unique_brands

array(['peugeot', 'bmw', 'volkswagen', 'smart', 'ford', 'chrysler',
       'seat', 'renault', 'mercedes_benz', 'audi', 'sonstige_autos',
       'opel', 'mazda', 'porsche', 'mini', 'toyota', 'dacia', 'nissan',
       'jeep', 'saab', 'volvo', 'mitsubishi', 'jaguar', 'fiat', 'skoda',
       'subaru', 'kia', 'citroen', 'chevrolet', 'hyundai', 'honda',
       'daewoo', 'suzuki', 'trabant', 'land_rover', 'alfa_romeo', 'lada',
       'rover', 'daihatsu', 'lancia'], dtype=object)

In [40]:
# The 20 brands with the most listings; the average price is computed for these.
brand_counts = autos['brand'].value_counts()
top_20_brands = brand_counts.index[:20]

In [52]:
# Mean listing price per top-20 brand, rounded to whole dollars.
# One groupby pass replaces filtering the whole frame twice for every brand.
top_brand_rows = autos[autos['brand'].isin(top_20_brands)]
mean_price = (top_brand_rows.groupby('brand')['price_dollar']
                            .mean()
                            .reindex(top_20_brands))  # keep the top-20 ordering

avg_price_per_brand = {brand: int(round(price)) for brand, price in mean_price.items()}

avg_price_per_brand

{'audi': 9094,
 'bmw': 8335,
 'citroen': 44535,
 'fiat': 2712,
 'ford': 7263,
 'hyundai': 5309,
 'mazda': 4011,
 'mercedes_benz': 30317,
 'mini': 10460,
 'nissan': 4665,
 'opel': 5253,
 'peugeot': 3039,
 'renault': 2395,
 'seat': 4296,
 'skoda': 6335,
 'smart': 3543,
 'sonstige_autos': 39622,
 'toyota': 5115,
 'volkswagen': 6516,
 'volvo': 4757}

In [53]:
# Average mileage per top-20 brand: mean odometer reading (km) multiplied by a
# fixed conversion factor, rounded to a whole number.
# One groupby pass replaces filtering the whole frame twice for every brand.
KM_TO_MILES = 0.621  # factor applied to the kilometre readings

top_brand_rows = autos[autos['brand'].isin(top_20_brands)]
mean_km = (top_brand_rows.groupby('brand')['odometer_km']
                         .mean()
                         .reindex(top_20_brands))  # keep the top-20 ordering

avg_milage_per_brand = {brand: int(round(KM_TO_MILES * km)) for brand, km in mean_km.items()}

avg_milage_per_brand

{'audi': 80288,
 'bmw': 82242,
 'citroen': 74186,
 'fiat': 72380,
 'ford': 77033,
 'hyundai': 66144,
 'mazda': 77467,
 'mercedes_benz': 81264,
 'mini': 55022,
 'nissan': 73633,
 'opel': 80250,
 'peugeot': 78952,
 'renault': 79602,
 'seat': 75491,
 'skoda': 68903,
 'smart': 61849,
 'sonstige_autos': 54190,
 'toyota': 71856,
 'volkswagen': 79942,
 'volvo': 85919}

In [75]:
# Combine the per-brand price and mileage dictionaries into one table:
# each becomes a Series indexed by brand, and the two are aligned on that
# index as the columns of a single DataFrame.
price_series = pd.Series(avg_price_per_brand)
milage_series = pd.Series(avg_milage_per_brand)

price_milage_df = price_series.to_frame('mean_price').assign(mean_milage=milage_series)
price_milage_df

Unnamed: 0,mean_price,mean_milage
audi,9094,80288
bmw,8335,82242
citroen,44535,74186
fiat,2712,72380
ford,7263,77033
hyundai,5309,66144
mazda,4011,77467
mercedes_benz,30317,81264
mini,10460,55022
nissan,4665,73633
