In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw listings; the file is decoded as Latin-1 rather than pandas' default encoding.
autos = pd.read_csv('autos.csv', encoding='Latin-1')

In [3]:
autos.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [4]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
dateCrawled            50000 non-null object
name                   50000 non-null object
seller                 50000 non-null object
offerType              50000 non-null object
price                  50000 non-null object
abtest                 50000 non-null object
vehicleType            44905 non-null object
yearOfRegistration     50000 non-null int64
gearbox                47320 non-null object
powerPS                50000 non-null int64
model                  47242 non-null object
odometer               50000 non-null object
monthOfRegistration    50000 non-null int64
fuelType               45518 non-null object
brand                  50000 non-null object
notRepairedDamage      40171 non-null object
dateCreated            50000 non-null object
nrOfPictures           50000 non-null int64
postalCode             50000 non-null int64
lastSeen               50000 non-null obj

After quickly looking over the head of the csv file, it appears that we will have to do some cleaning up of the column names, which are not uniform in terms of capitalization. Additionally, the `.info()` method shows us that there are a few columns with null values that we will have to deal with - 'vehicleType', 'gearbox', 'model', 'fuelType', and 'notRepairedDamage'.

Quick inspection of some of the data types reveals columns containing string objects that actually represent numerical values (e.g. price, odometer). We'll have to remove any non-numeric characters and convert these to numerical values. Additionally, some columns contain non-English words, which we may have to clean up to improve readability.

Let's start by renaming the columns using snake case.

In [5]:
autos.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

In [6]:
# Map the original camelCase headers to descriptive snake_case names.
# `rename` returns a new frame, so `autos` is rebound rather than mutated in
# place; columns not listed here keep their current names.
autos = autos.rename(columns={
    'dateCrawled': 'date_crawled',
    'offerType': 'offer_type',
    'abtest': 'ab_test',
    'vehicleType': 'vehicle_type',
    'yearOfRegistration': 'registration_year',
    'gearbox': 'gear_box',
    'powerPS': 'power_ps',
    'monthOfRegistration': 'registration_month',
    'fuelType': 'fuel_type',
    'notRepairedDamage': 'unrepaired_damage',
    'dateCreated': 'ad_created',
    'nrOfPictures': 'num_pics',
    'postalCode': 'postal_code',
    'lastSeen': 'last_seen',
})

In [7]:
autos.head()

Unnamed: 0,date_crawled,name,seller,offer_type,price,ab_test,vehicle_type,registration_year,gear_box,power_ps,model,odometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_pics,postal_code,last_seen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


Now, all columns are in snake-case format and are a bit more descriptive in terms of the data that each represents. Let's continue with the data cleaning process and determine which columns contain numerical data.

In [8]:
autos.describe(include='all')

Unnamed: 0,date_crawled,name,seller,offer_type,price,ab_test,vehicle_type,registration_year,gear_box,power_ps,model,odometer,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_pics,postal_code,last_seen
count,50000,50000,50000,50000,50000,50000,44905,50000.0,47320,50000.0,47242,50000,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,2357,2,8,,2,,245,13,,7,40,2,76,,,39481
top,2016-03-25 19:57:10,Ford_Fiesta,privat,Angebot,$0,test,limousine,,manuell,,golf,"150,000km",,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,1421,25756,12859,,36993,,4024,32424,,30107,10687,35232,1946,,,8
mean,,,,,,,,2005.07328,,116.35592,,,5.72336,,,,,0.0,50813.6273,
std,,,,,,,,105.712813,,209.216627,,,3.711984,,,,,0.0,25779.747957,
min,,,,,,,,1000.0,,0.0,,,0.0,,,,,0.0,1067.0,
25%,,,,,,,,1999.0,,70.0,,,3.0,,,,,0.0,30451.0,
50%,,,,,,,,2003.0,,105.0,,,6.0,,,,,0.0,49577.0,
75%,,,,,,,,2008.0,,150.0,,,9.0,,,,,0.0,71540.0,


The `.describe()` method provides us with some more insight as to which columns contain only 1-2 unique values and which columns contain information that may need to be converted to numerical data.

Possible columns to drop because they only contain 1-2 unique values: 'seller', 'offer_type', 'ab_test', 'gear_box', 'unrepaired_damage'
Columns with information that needs to be converted to numerical data: 'price', 'odometer'

Let's look closer at the possible columns to drop to better understand the information that they may provide for analysis.

In [9]:
autos.seller.value_counts()

privat        49999
gewerblich        1
Name: seller, dtype: int64

The 'seller' column only contains 2 unique values, with all but 1 being private. Since almost all of the car sales have been private, we can drop this column as it doesn't add any interesting information for analysis.

In [10]:
# 'seller' is 'privat' for all but one row, so it carries no signal; drop it.
# Rebind instead of using inplace=True so the cell reads as a plain transformation.
autos = autos.drop(columns='seller')

In [11]:
autos.offer_type.value_counts()

Angebot    49999
Gesuch         1
Name: offer_type, dtype: int64

Similar to the 'seller' column, the 'offer_type' column only contains 2 different values. All but 1 value is 'Angebot', which is German for 'Offer'. We can also drop this column as it does not provide much insight for future analysis.

In [12]:
# 'offer_type' is 'Angebot' for all but one row, so it carries no signal; drop it.
# Rebind instead of using inplace=True so the cell reads as a plain transformation.
autos = autos.drop(columns='offer_type')

In [13]:
autos.ab_test.value_counts()

test       25756
control    24244
Name: ab_test, dtype: int64

The 'ab_test' column has 2 different values, each representing approximately 50% of the data. We will keep this column in our dataset as it may provide a useful insight in the future.

In [14]:
autos.gear_box.value_counts()

manuell      36993
automatik    10327
Name: gear_box, dtype: int64

The 'gear_box' column also contains 2 unique values, 'manual' and 'automatic'. These may be useful for our analysis later, so we will keep this column.

In [15]:
autos.unrepaired_damage.value_counts()

nein    35232
ja       4939
Name: unrepaired_damage, dtype: int64

The 'unrepaired_damage' column contains 2 unique values, 'yes' and 'no'. As this may have an impact on the price of a car, we will keep this in our final dataset.

Next, let's take care of the columns containing string data that need to be converted to numerical data.

In [16]:
autos.price.head()

0    $5,000
1    $8,500
2    $8,990
3    $4,350
4    $1,350
Name: price, dtype: object

In [17]:
# Strip the currency symbol and the thousands separators, then cast to float.
# regex=False forces literal matching: '$' is a regex end-of-string anchor, and
# the default value of `regex` is not the same across pandas versions.
autos['price'] = (
    autos['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [18]:
autos.price.head()

0    5000.0
1    8500.0
2    8990.0
3    4350.0
4    1350.0
Name: price, dtype: float64

In [19]:
autos.odometer.head()

0    150,000km
1    150,000km
2     70,000km
3     70,000km
4    150,000km
Name: odometer, dtype: object

In [20]:
# Remove the 'km' suffix and the thousands separators so the readings parse as integers.
autos['odometer'] = (
    autos['odometer']
    .str.replace('km', '')
    .str.replace(',', '')
    .astype(int)
)

In [21]:
# Record the unit in the column name now that the values are plain integers.
# Rebind instead of using inplace=True so the cell reads as a plain transformation.
autos = autos.rename(columns={'odometer': 'odometer_km'})

In [22]:
autos.describe(include='all')

Unnamed: 0,date_crawled,name,price,ab_test,vehicle_type,registration_year,gear_box,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_pics,postal_code,last_seen
count,50000,50000,50000.0,50000,44905,50000.0,47320,50000.0,47242,50000.0,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,,2,8,,2,,245,,,7,40,2,76,,,39481
top,2016-03-25 19:57:10,Ford_Fiesta,,test,limousine,,manuell,,golf,,,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,,25756,12859,,36993,,4024,,,30107,10687,35232,1946,,,8
mean,,,9840.044,,,2005.07328,,116.35592,,125732.7,5.72336,,,,,0.0,50813.6273,
std,,,481104.4,,,105.712813,,209.216627,,40042.211706,3.711984,,,,,0.0,25779.747957,
min,,,0.0,,,1000.0,,0.0,,5000.0,0.0,,,,,0.0,1067.0,
25%,,,1100.0,,,1999.0,,70.0,,125000.0,3.0,,,,,0.0,30451.0,
50%,,,2950.0,,,2003.0,,105.0,,150000.0,6.0,,,,,0.0,49577.0,
75%,,,7200.0,,,2008.0,,150.0,,150000.0,9.0,,,,,0.0,71540.0,


Alright, the price and odometer_km columns have been converted to numerical data. Now let's explore these columns further.

In [23]:
autos.price.describe()

count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price, dtype: float64

At first glance, there are some irregularities in the price column: the minimum value is 0 and the maximum value is roughly 100,000,000. Those seem like two extremes to me. Let's see how many times they occur in the series.

In [24]:
autos.price.value_counts().sort_index()

0.0           1421
1.0            156
2.0              3
3.0              1
5.0              2
              ... 
10000000.0       1
11111111.0       2
12345678.0       3
27322222.0       1
99999999.0       1
Name: price, Length: 2357, dtype: int64

In [25]:
autos.price.value_counts().sort_index().tail(10)

999990.0      1
999999.0      2
1234566.0     1
1300000.0     1
3890000.0     1
10000000.0    1
11111111.0    2
12345678.0    3
27322222.0    1
99999999.0    1
Name: price, dtype: int64

In [26]:
autos[autos['price'] == 99999999.0]

Unnamed: 0,date_crawled,name,price,ab_test,vehicle_type,registration_year,gear_box,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_pics,postal_code,last_seen
39705,2016-03-22 14:58:27,Tausch_gegen_gleichwertiges,99999999.0,control,limousine,1999,automatik,224,s_klasse,150000,9,benzin,mercedes_benz,,2016-03-22 00:00:00,0,73525,2016-04-06 05:15:30


There are obvious outliers in this dataset. Let's remove any price lower than 100 dollars or greater than 500,000 dollars as a start.

In [27]:
# Keep prices within [100, 500000] (both bounds inclusive); anything outside
# becomes NaN. The rows themselves remain in the frame at this point.
autos['price'] = autos['price'].where(autos['price'].between(100, 500000))

In [28]:
autos.price.describe()

count     48224.000000
mean       5930.371433
std        9078.372762
min         100.000000
25%        1250.000000
50%        3000.000000
75%        7499.000000
max      350000.000000
Name: price, dtype: float64

I think that this is already starting to look a little bit better with the very low outliers and some of the larger outliers removed. Since the third quartile of the data is only 7,499.00 dollars, I think there is still some merit in removing more outliers on the top end of the spectrum.

In [29]:
# Tighten the upper bound: prices outside [100, 200000] (inclusive) become NaN.
# The rows themselves remain in the frame at this point.
autos['price'] = autos['price'].where(autos['price'].between(100, 200000))

In [30]:
autos.price.describe()

count     48216.000000
mean       5884.005973
std        8316.716166
min         100.000000
25%        1250.000000
50%        3000.000000
75%        7499.000000
max      198000.000000
Name: price, dtype: float64

In [31]:
autos.price.value_counts().sort_index().tail(10)

163500.0    1
163991.0    1
169000.0    1
169999.0    1
175000.0    1
180000.0    1
190000.0    1
194000.0    1
197000.0    1
198000.0    1
Name: price, dtype: int64

In [32]:
autos[autos['price'] == 198000.0]

Unnamed: 0,date_crawled,name,price,ab_test,vehicle_type,registration_year,gear_box,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_pics,postal_code,last_seen
40918,2016-03-20 18:40:05,Porsche_911_991_GT3_RS,198000.0,test,coupe,2015,automatik,500,911,5000,6,benzin,porsche,nein,2016-03-20 00:00:00,0,51491,2016-03-21 21:46:36


Our standard deviation is still very high, but I think that can be explained by the nature of car prices. For example, our current highest listed price belongs to a Porsche 911 GT3 RS, which is a very expensive luxury car. It still may be interesting to analyze the information from these high-end listings, so we will keep the price column as it currently is.

Next, let's check out the 'odometer_km' column a bit closer.

In [33]:
autos.odometer_km.describe()

count     50000.000000
mean     125732.700000
std       40042.211706
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: odometer_km, dtype: float64

In [34]:
autos.odometer_km.value_counts().sort_index()

5000        967
10000       264
20000       784
30000       789
40000       819
50000      1027
60000      1164
70000      1230
80000      1436
90000      1757
100000     2169
125000     5170
150000    32424
Name: odometer_km, dtype: int64

All of the information in the 'odometer_km' column looks relatively clean, although it is important to note that there is a significant skew towards 150,000 km. Over half of the entries in the dataset have an odometer reading of 150,000 km.

We'll now move onto cleaning the columns containing date information: 'date_crawled', 'ad_created', 'last_seen', 'registration_year', and 'registration_month'.

In [35]:
autos[['date_crawled', 'ad_created', 'last_seen', 'registration_year', 'registration_month']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
date_crawled          50000 non-null object
ad_created            50000 non-null object
last_seen             50000 non-null object
registration_year     50000 non-null int64
registration_month    50000 non-null int64
dtypes: int64(2), object(3)
memory usage: 1.9+ MB


It looks like the 'registration_year' and 'registration_month' columns are already in numerical format. We will extract date information from the other 3 columns.

In [36]:
autos[['date_crawled', 'ad_created', 'last_seen']].head()

Unnamed: 0,date_crawled,ad_created,last_seen
0,2016-03-26 17:47:46,2016-03-26 00:00:00,2016-04-06 06:45:54
1,2016-04-04 13:38:56,2016-04-04 00:00:00,2016-04-06 14:45:08
2,2016-03-26 18:57:24,2016-03-26 00:00:00,2016-04-06 20:15:37
3,2016-03-12 16:58:10,2016-03-12 00:00:00,2016-03-15 03:16:28
4,2016-04-01 14:38:50,2016-04-01 00:00:00,2016-04-01 14:38:50


In [37]:
autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

2016-03-05    0.02538
2016-03-06    0.01394
2016-03-07    0.03596
2016-03-08    0.03330
2016-03-09    0.03322
2016-03-10    0.03212
2016-03-11    0.03248
2016-03-12    0.03678
2016-03-13    0.01556
2016-03-14    0.03662
2016-03-15    0.03398
2016-03-16    0.02950
2016-03-17    0.03152
2016-03-18    0.01306
2016-03-19    0.03490
2016-03-20    0.03782
2016-03-21    0.03752
2016-03-22    0.03294
2016-03-23    0.03238
2016-03-24    0.02910
2016-03-25    0.03174
2016-03-26    0.03248
2016-03-27    0.03104
2016-03-28    0.03484
2016-03-29    0.03418
2016-03-30    0.03362
2016-03-31    0.03192
2016-04-01    0.03380
2016-04-02    0.03540
2016-04-03    0.03868
2016-04-04    0.03652
2016-04-05    0.01310
2016-04-06    0.00318
2016-04-07    0.00142
Name: date_crawled, dtype: float64

Extracting the date for 'date_crawled' and creating a frequency distribution of the dates reveals that all of the data in our dataset were scraped between the dates of 3/5/2016 and 4/7/2016.

In [38]:
autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

2015-06-11    0.00002
2015-08-10    0.00002
2015-09-09    0.00002
2015-11-10    0.00002
2015-12-05    0.00002
               ...   
2016-04-03    0.03892
2016-04-04    0.03688
2016-04-05    0.01184
2016-04-06    0.00326
2016-04-07    0.00128
Name: ad_created, Length: 76, dtype: float64

The ads in the dataset were created between the dates 6/11/2015 and 4/7/2016, approximately 10 months' worth of ad data.

In [39]:
autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False, sort=True)

2016-04-03    0.03892
2016-03-20    0.03786
2016-03-21    0.03772
2016-04-04    0.03688
2016-03-12    0.03662
               ...   
2016-02-07    0.00002
2016-02-11    0.00002
2016-01-03    0.00002
2016-02-01    0.00002
2015-12-05    0.00002
Name: ad_created, Length: 76, dtype: float64

Sorting the distribution by frequency reveals that the highest percentage of ads were created in mid-to-late March and early April 2016. A very small percentage of the ads in the dataset were created in January/February 2016.

In [40]:
autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

2016-03-05    0.00108
2016-03-06    0.00442
2016-03-07    0.00536
2016-03-08    0.00760
2016-03-09    0.00986
2016-03-10    0.01076
2016-03-11    0.01252
2016-03-12    0.02382
2016-03-13    0.00898
2016-03-14    0.01280
2016-03-15    0.01588
2016-03-16    0.01644
2016-03-17    0.02792
2016-03-18    0.00742
2016-03-19    0.01574
2016-03-20    0.02070
2016-03-21    0.02074
2016-03-22    0.02158
2016-03-23    0.01858
2016-03-24    0.01956
2016-03-25    0.01920
2016-03-26    0.01696
2016-03-27    0.01602
2016-03-28    0.02086
2016-03-29    0.02234
2016-03-30    0.02484
2016-03-31    0.02384
2016-04-01    0.02310
2016-04-02    0.02490
2016-04-03    0.02536
2016-04-04    0.02462
2016-04-05    0.12428
2016-04-06    0.22100
2016-04-07    0.13092
Name: last_seen, dtype: float64

The 'last_seen' column provides us the date that the crawler used to scrape the data last saw the listing online. It appears that these date ranges are the same as in the 'date_crawled' column. This tells us that the crawler was likely only active between 3/5/2016 and 4/7/2016.

In [41]:
autos.registration_year.describe()

count    50000.000000
mean      2005.073280
std        105.712813
min       1000.000000
25%       1999.000000
50%       2003.000000
75%       2008.000000
max       9999.000000
Name: registration_year, dtype: float64

In [42]:
autos[autos['registration_year'] == 9999.0]

Unnamed: 0,date_crawled,name,price,ab_test,vehicle_type,registration_year,gear_box,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_pics,postal_code,last_seen
8012,2016-03-23 16:43:29,Opel_GT_Karosserie_mit_Brief!,700.0,test,,9999,,0,andere,10000,0,,opel,,2016-03-23 00:00:00,0,21769,2016-04-05 20:16:15
14341,2016-03-23 01:36:20,Hole_kostenlos_ab,,test,,9999,,0,,10000,0,,bmw,,2016-03-23 00:00:00,0,32689,2016-03-23 08:47:00
33950,2016-03-23 21:52:25,58er_karmann_ghia_lowlight_Kaefer__zum_restaur...,7999.0,test,,9999,,0,kaefer,10000,0,,volkswagen,,2016-03-23 00:00:00,0,47638,2016-04-06 03:46:40
38076,2016-04-04 22:54:47,Mercedes_Benz_A180,18000.0,test,,9999,,0,a_klasse,10000,0,benzin,mercedes_benz,,2016-04-04 00:00:00,0,51379,2016-04-07 02:44:52


The registration years are looking a little bit suspect. There shouldn't be any registrations later than the date these ads were scraped (2016) and none earlier than 1877 (a very conservative parameter, as this was the year the first automobile was produced). Let's blank out (set to missing) the registration years that fall outside of these boundaries; the rows themselves are kept.

In [43]:
# Blank out implausible years: a value must be greater than 1877 and no later
# than 2016. Out-of-range entries become NaN (so the column turns float);
# no rows are dropped here.
autos['registration_year'] = autos['registration_year'].where(
    autos['registration_year'].gt(1877) & autos['registration_year'].le(2016)
)

In [44]:
autos.registration_year.describe()

count    48028.00000
mean      2002.80351
std          7.31085
min       1910.00000
25%       1999.00000
50%       2003.00000
75%       2008.00000
max       2016.00000
Name: registration_year, dtype: float64

In [46]:
autos.registration_year.value_counts(normalize=True)

2000.0    0.069834
2005.0    0.062776
1999.0    0.062464
2004.0    0.056988
2003.0    0.056779
            ...   
1939.0    0.000021
1938.0    0.000021
1931.0    0.000021
1929.0    0.000021
1927.0    0.000021
Name: registration_year, Length: 78, dtype: float64

While exploring some of the columns that we've previously cleaned, it is evident that there are a significant number of entries with a missing price value. Given that this project revolves around car sale prices, I think it is pertinent to drop the rows with missing price values.

In [48]:
# Discard listings whose price was blanked out as an outlier in the earlier
# price-filtering cells. Rebind instead of using inplace=True so the cell reads
# as a plain transformation.
autos = autos.dropna(subset=['price'])

In [52]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48216 entries, 0 to 49999
Data columns (total 18 columns):
date_crawled          48216 non-null object
name                  48216 non-null object
price                 48216 non-null float64
ab_test               48216 non-null object
vehicle_type          43794 non-null object
registration_year     46344 non-null float64
gear_box              46012 non-null object
power_ps              48216 non-null int64
model                 45822 non-null object
odometer_km           48216 non-null int64
registration_month    48216 non-null int64
fuel_type             44338 non-null object
brand                 48216 non-null object
unrepaired_damage     39333 non-null object
ad_created            48216 non-null object
num_pics              48216 non-null int64
postal_code           48216 non-null int64
last_seen             48216 non-null object
dtypes: float64(2), int64(5), object(11)
memory usage: 7.0+ MB


We can now see that `autos.info()` reveals there are no longer any null values in the 'price' column.

In [53]:
autos

Unnamed: 0,date_crawled,name,price,ab_test,vehicle_type,registration_year,gear_box,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,num_pics,postal_code,last_seen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,5000.0,control,bus,2004.0,manuell,158,andere,150000,3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,8500.0,control,limousine,1997.0,automatik,286,7er,150000,6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,8990.0,test,limousine,2009.0,manuell,102,golf,70000,7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,4350.0,control,kleinwagen,2007.0,automatik,71,fortwo,70000,6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,1350.0,test,kombi,2003.0,manuell,0,focus,150000,7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2016-03-27 14:38:19,Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon,24900.0,control,limousine,2011.0,automatik,239,q5,100000,1,diesel,audi,nein,2016-03-27 00:00:00,0,82131,2016-04-01 13:47:40
49996,2016-03-28 10:50:25,Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+...,1980.0,control,cabrio,1996.0,manuell,75,astra,150000,5,benzin,opel,nein,2016-03-28 00:00:00,0,44807,2016-04-02 14:18:02
49997,2016-04-02 14:44:48,Fiat_500_C_1.2_Dualogic_Lounge,13200.0,test,cabrio,2014.0,automatik,69,500,5000,11,benzin,fiat,nein,2016-04-02 00:00:00,0,73430,2016-04-04 11:47:27
49998,2016-03-08 19:25:42,Audi_A3_2.0_TDI_Sportback_Ambition,22900.0,control,kombi,2013.0,manuell,150,a3,40000,11,diesel,audi,nein,2016-03-08 00:00:00,0,35683,2016-04-05 16:45:07


I think that the dataset is now clean enough to begin some exploratory analysis. We are going to export the clean dataset to a new csv file.

In [55]:
# Write the cleaned frame to a new file; index=False leaves the row index out of the CSV.
autos.to_csv('autos_clean.csv', index=False)