# Data Mining Project: Explorative Analysis of mobile.de car prices

Considering we thought about a project where we had to scrap data in order to create a model, one of our first choices was the creation of a model for second hand car prices. 

Second hand cars webpages contain a lot of different information for each one of the cars, so I would not be difficult to query that sites and extract all the meaninful information that we want. After a first analysis of several webpages was conducted, we stucked with mobile.de.

In [8]:
import sys
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib2
from threading import Thread
from queue import Queue
import math
import requests
import time 
from datetime import timedelta
import pickle
import json

In [9]:
def extractValue(tag):
    if(len(tag) <= 0):
        return np.nan
    return tag[0].stripped_strings.next()

In [10]:
def extractInformation(resultID):
    success = False    
    while not success:
        try:
            url = urllib2.urlopen("http://suchen.mobile.de/fahrzeuge/details.html?id=" + str(resultID))
            url = url.read()
            success = True
        except urllib2.HTTPError as e:
            # car not available any more
            return None
        except Exception as e:
            # retry
            success = False
            
    soup = BeautifulSoup(url, "lxml")
    
    # ID does not exist (any longer)
    if len(soup.find_all("div", class_="cBox-body cBox-body--notification-error")) > 0:
        return None
    
    scripts = soup.find_all("script")
    for script in scripts:
        if script.text.startswith("var partnerUrl"):
            text = script.text
            
            text = text[text.find(", make: \"")+9:len(text)]
            make = text[0:text.find("\"")]
            text = text[text.find(", model: '")+10:len(text)]
            model = text[0:text.find("'")]
    
    mileage = extractValue(soup.select("#rbt-mileage-v"))
    power = extractValue(soup.select("#rbt-power-v"))
    fuel = extractValue(soup.select("#rbt-fuel-v"))
    transmission = extractValue(soup.select("#rbt-transmission-v"))
    firstRegistration = extractValue(soup.select("#rbt-firstRegistration-v"))
    damageCondition = extractValue(soup.select("#rbt-damageCondition-v"))
    numSeats = extractValue(soup.select("#rbt-numSeats-v"))
    doorCount = extractValue(soup.select("#rbt-doorCount-v"))
    climatisation = extractValue(soup.select("#rbt-climatisation-v"))
    airbags = extractValue(soup.select("#rbt-airbag-v"))
    color = extractValue(soup.select("#rbt-color-v"))
    interior = extractValue(soup.select("#rbt-interior-v"))
    parkAssist = extractValue(soup.select("#rbt-parkAssists-v"))
    
    price = extractValue(soup.select("span.h3.rbt-prime-price"))
    
    bulletPoints = []
    for bulletPoint in soup.find_all("div", class_="bullet-list"):
        tag = bulletPoint.select("p")
        if len(tag) > 0:
            bulletPoints.append(extractValue(tag))
    
    return [resultID, 
            make,
            model,
            mileage, 
            power, 
            fuel, 
            transmission, 
            firstRegistration, 
            damageCondition, 
            numSeats, 
            doorCount, 
            climatisation, 
            airbags, 
            color, 
            interior, 
            parkAssist,
            price, 
            bulletPoints]

In [11]:
def buildURL(baseURL):
    url = "?isSearchRequest=true&vc=Car&dam=0&con=USED&ambitCountry=DE"
    # explicitly specify categories to prevent trailers etc. from showing up
    url += "&categories=Cabrio"
    url += "&categories=OffRoad"
    url += "&categories=SmallCar"
    url += "&categories=EstateCar"
    url += "&categories=Limousine"
    url += "&categories=SportsCar"
    url += "&categories=Van"   
    
    return baseURL + url

def buildURLParameters(mileageFrom = -1,
                    mileageTo = -1,
                    firstRegistrationFrom = -1,
                    firstRegistrationTo = -1,
                    priceFrom = -1,
                    priceTo = -1,
                    powerFrom = -1,
                    powerTo = -1):
        
    parameters = ""
    if(mileageFrom >= 0):
        parameters += "&minMileage=" + repr(mileageFrom)
    if(mileageTo >= 0):
        parameters += "&maxMileage=" + repr(mileageTo)
    if(firstRegistrationFrom >= 0):
        parameters += "&minFirstRegistrationDate=" + repr(firstRegistrationFrom) + "-01-01"
    if(firstRegistrationTo >= 0):
        parameters += "&maxFirstRegistrationDate=" + repr(firstRegistrationTo) + "-12-31"
    if(priceFrom >= 0):
        parameters += "&minPrice=" + repr(priceFrom)
    if(priceTo >= 0):
        parameters += "&maxPrice=" + repr(priceTo)
        
    return parameters

In [12]:
def getResultCount(url):
    nResults = -1
    while nResults == -1:
        try:
            response = requests.get(url)
            json = response.json()
            nResults = json['numResultsTotal']
        except ConnectionError as e:
            print "\nConnection Error for query " + url + ", retrying."
            nResults = -1
    return nResults

In [None]:
relevantIDs = set()

def scrapeResultList(baseURL, nResults):
    pageCount = int(math.ceil(nResults / 20.0))
    for i in range(pageCount):
        url = urllib.urlopen(baseURL + "&pageNumber=" + repr(i + 1))
        url = url.read()
        soup = BeautifulSoup(url, "lxml")
        div_results = soup.find_all("div", class_="cBox-body cBox-body--resultitem")

        for div_result in div_results:
            relevantIDs.add(div_result.a["data-ad-id"])

class Worker(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue
    def run(self):
        while True:
            try:
                baseURL, nResults = self.queue.get()
                scrapeResultList(baseURL, nResults)
                sys.stdout.write("\r#IDs: " + repr(len(relevantIDs)) +" / #Queue: " + repr(self.queue.qsize()))
            except Exception as e:
                print(e)
            finally:
                self.queue.task_done()

queue = Queue(10)
for x in range(2):
    worker = Worker(queue)
    worker.daemon = True
    worker.start()

base_url_search = buildURL("http://suchen.mobile.de/fahrzeuge/auto")
base_url_json = buildURL("http://suchen.mobile.de/fahrzeuge/count.json")
    
for mileage in range(1, 1501): # mileage between 0 and 1500000, interval 1000
    parametersMileage = buildURLParameters(mileageFrom=(mileage-1)*1000, mileageTo=mileage*1000)
    nResultsMileage = getResultCount(base_url_json + parametersMileage)
    
    if len(relevantIDs) > 10000:
        break;
    
    if nResultsMileage <= 1000:
        queue.put((base_url_search + parametersMileage, nResultsMileage))
    else:
        for firstRegistration in range(1900, 2017): # first registration between 1900 and 2016, interval 1
            parametersRegistration = buildURLParameters(firstRegistrationFrom=firstRegistration, firstRegistrationTo=firstRegistration)
            nResultsRegistration = getResultCount(base_url_json + parametersMileage + parametersRegistration)

            if len(relevantIDs) > 10000:
                break;
                
            if nResultsRegistration <= 1000:
                queue.put((base_url_search + parametersMileage + parametersRegistration, nResultsRegistration))
            else:
                for price in range(1, 1001): # price between 0 and 100000, interval 100
                    #print "price " + repr((price-1)*100) + " - " + repr(price*100)
                    parametersPrice = buildURLParameters(priceFrom=(price-1)*100, priceTo=price*100)
                    nResultsPrice = getResultCount(base_url_json + parametersMileage + parametersRegistration + parametersPrice)
                    
                    if len(relevantIDs) > 10000:
                        break;
                        
                    if nResultsPrice <= 1000:
                        queue.put((base_url_search + parametersMileage + parametersRegistration + parametersPrice, nResultsPrice))
                    else:
                        for priceFine in range((price-1)*100, price*100, 10):
                            parametersPriceFine = buildURLParameters(priceFrom=priceFine, priceTo=priceFine+10)
                            nResultsPriceFine = getResultCount(base_url_json + parametersMileage + parametersRegistration + parametersPriceFine)
                    
                            if len(relevantIDs) > 10000:
                                break;
                                
                            queue.put((base_url_search + parametersMileage + parametersRegistration + parametersPriceFine, nResultsPriceFine))
                    
                            if nResultsPriceFine > 1000:
                                print "\nDROPPING " + repr(nResultsPrice - 1000) + " ELEMENTS"
        
queue.join()

#IDs: 0 / #Queue: 0global name 'urllib' is not defined
#IDs: 0 / #Queue: 0global name 'urllib' is not defined
#IDs: 0 / #Queue: 0global name 'urllib' is not defined
#IDs: 0 / #Queue: 0global name 'urllib' is not defined
#IDs: 0 / #Queue: 0global name 'urllib' is not defined
#IDs: 0 / #Queue: 0global name 'urllib' is not defined
#IDs: 0 / #Queue: 0global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib' is not defined
#IDs: 0 / #Queue: 0global name 'urllib' is not defined
global name 'urllib' is not defined
global name 'urllib'

In [None]:
print repr(len(relevantIDs))

In [None]:
relevantIDsList = list(relevantIDs) # just for output
relevantIDCount = len(relevantIDsList)
testData = []

class ResultScraper(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue
    def run(self):
        while True:
            try:
                resultID = self.queue.get()
                result = extractInformation(resultID)
                if result is not None:
                    testData.append(result)
            except Exception as e:
                print(e)
            finally:
                self.queue.task_done()
                
taskQueue = Queue(500)
for x in range(2):
    worker = ResultScraper(taskQueue)
    worker.daemon = True
    worker.start()

resultScrapingStartTime = time.time()
    
for i in range(len(relevantIDsList)):
    sys.stdout.write("\rQueueing " + relevantIDsList[i] + 
                     " (" + repr((i+1)) + 
                     " / " + repr(relevantIDCount) + 
                     ", " + repr(((i + 1.0) / relevantIDCount) * 100) + "%)")
    taskQueue.put(relevantIDsList[i])
    
taskQueue.join()
resultScrapingEndTime = time.time()

In [None]:
dataFile = open('testData.pckl', 'wb')
pickle.dump(testData, dataFile)

In [None]:
print testData[1000]

In [None]:
testDataTMP = []
for i in range(len(testData)):
    testDataTMP.append(testData[i][:18])

In [None]:
print testDataTMP[0][17][0]

In [None]:
testDF = pd.DataFrame(data=testDataTMP, columns=["CarID", 
                                              "Brand",
                                              "Model",
                                              "Mileage", 
                                              "Power", 
                                              "Fuel", 
                                              "Transmission", 
                                              "Registration",
                                              "Damage",
                                              "Seats",
                                              "Doors",
                                              "Climatisation",
                                              "Airbags",
                                              "Color",
                                              "Interior",
                                              "ParkAssist",
                                              "Price"])
testDF.head()

In [None]:
testDF.describe()

In [None]:
dfFinal = pd.DataFrame(columns=["CarID", "Mileage", "Power","Brand", "Model", "Fuel", "Transmission", "Registration", "Price"])


In [None]:
dfFinal['Mileage'] = testDF['Mileage'].str.split('km').str.get(0).astype(float)
dfFinal['Price'] = testDF['Price'].str.split().str.get(0).replace('.', '')
dfFinal['Price'] = dfFinal['Price'].str.replace('.', '').astype(float)
dfFinal['Power'] = testDF['Power'].str.split().str.get(0).astype(float)
dfFinal['Fuel'] = testDF['Fuel']
dfFinal['Transmission'] = testDF['Transmission']
dfFinal['Registration'] = testDF['Registration']
dfFinal['CarID'] = testDF['CarID']
dfFinal['Brand'] = testDF['Brand']
dfFinal['Model'] = testDF['Model']

In [None]:
dfFinal.info(verbose=True)

## Data cleaning and transformation

For the purposes of the study this data is not completely accurate and it should be cleaned and treated for the algorithms to be more accurate. For instance, some of the variables have to be converted to numeric, categories have to be properly aggregated and NaNs should be treated.

In [None]:
dfFinal.head()

In [None]:
dfFinal['Year'] = dfFinal['Registration'].str.extract('(\d\d\d\d)', expand=True)

In [None]:
dfFinal.head()

In [None]:
dfFinal.corr()

We can see Power is the column with the highest correlation with Price. As a preliminary analysis let's see the scatterplot of these two variables:

In [None]:
dfFinal.plot(kind='scatter', x='Power', y='Price')

### Boxplots

Since most of the data is qualitative, boxplots showing the distribution of some of the most important categories can be shown to be if there are strong differences between them. (For visualization purposes the outliers were taken away)

In [None]:
import seaborn as sb

In [None]:
dfFinal['Decade'] = dfFinal['Year'].str.extract('(\d\d\d)', expand=True) + "0"

In [None]:
plot = sb.boxplot(x=dfFinal['Decade'], y='Price', data=dfFinal, showfliers=False) 
plot.set_title('Distribution of Prices by decade')

Here we can see the prices decrease in average over time, but the most important feature is that the variability of prices decreases the youngest the cars are.

In [None]:
dfFinal['FuelStd'] = dfFinal['Fuel'].str.split(',').str.get(0)

In [None]:
dfFinal.FuelStd.unique()

In [None]:
plot = sb.boxplot(orient='h', y=dfFinal['FuelStd'], x='Price', data=dfFinal, showfliers=False)
plot.set_title('Distribution of Prices by Fuel')

After aggregating the fuel types into more general categories we can see that, for example, diesel cars are more expensive in average than gasoline cars, and both are cheaper than hybrid cars.

In [None]:
plot = sb.boxplot(x=dfFinal['Transmission'], y='Price', data=dfFinal, showfliers=False) 
plot.set_title('Distribution of Prices by Transmission')

In [None]:
plot = sb.boxplot(orient='h', y=dfFinal['Brand'], x='Price', data=dfFinal, showfliers=False)
plot.set_title('Distribution of Prices by Brand')

In [None]:
X = dfFinal[['Mileage','Power']].copy()
X['Ones'] = np.ones(len(dfFinal))
y = dfFinal.Price

In [1212]:
y = y.values
X = X.values

In [1213]:
# Function definition is here
def adagrad( X, y, iterations='default' ):
    b = np.zeros(3)
    n=len(y)
    if iterations == 'default':
        size = 10*len(y)
    else: 
        size = iterations
    G = 0
    for j in range(size):
        i = random.randint(0,n-1)
        grad = (y[i]-b.dot(X[i]))*-X[i]
        G = G + grad**2
        b = b - (grad/(G**0.5))
        print grad
    #printing the r2 etc
    rss =  sum((y[i]-b.dot(X[i]))**2 for i in range(n))
    tss = sum((y[i]-y.mean())**2 for i in range(n))
    r_squared = 1.-rss/tss
    mse = rss/n
    print mse, r_squared, b
    return b;

In [1214]:
adagrad(X, y,2)

[ -449750. -2122820.   -17990.]
[ -208395. -1125333.   -13893.]
nan nan [ 1.42  1.47  1.61]


array([ 1.42,  1.47,  1.61])

In [1215]:
from sklearn.linear_model import LinearRegression

In [1216]:
Xnew = dfFinal[['Power']].copy()
y = dfFinal.Price
y=y.values
Xnew['Ones1'] = np.ones(len(dfFinal))
Xnew['Ones2'] = np.ones(len(dfFinal))
Xnew=Xnew.values
prediction=adagrad(Xnew, y)
prediction=Xnew*prediction
prediction=np.sum(prediction,axis=1)

[-47362000.    -99500.    -99500.]
[-1938800.   -19388.   -19388.]
[-659661.11  -12934.53  -12934.53]
[-588621.83  -10900.4   -10900.4 ]
[-608484.54  -13829.19  -13829.19]
[-1847561.41   -22809.4    -22809.4 ]
[-938568.8   -13408.13  -13408.13]
[-2315532.74   -23155.33   -23155.33]
[-2148937.27   -20863.47   -20863.47]
[-1012257.01   -15816.52   -15816.52]
[-4220486.67   -28710.79   -28710.79]
[-2585039.94   -25850.4    -25850.4 ]
[-3342017.8    -24755.69   -24755.69]
[-444619.08   -8718.02   -8718.02]
[-460666.44   -9401.36   -9401.36]
[-3588576.     -34840.54   -34840.54]
[-870540.89  -13190.01  -13190.01]
[-1073284.72   -17888.08   -17888.08]
[-2428499.28   -23577.66   -23577.66]
[-7179316.72  -188929.39  -188929.39]
[-1700358.07   -19322.25   -19322.25]
[-14296024.72   -324909.65   -324909.65]
[-1982842.27   -19250.9    -19250.9 ]
[-1215035.84   -14294.54   -14294.54]
[-709156.94  -10744.8   -10744.8 ]
[-1448348.86   -18809.73   -18809.73]
[-801310.3   -13355.17  -13355.17]
[-65248

ValueError: I/O operation on closed file

In [None]:
Xnew = dfFinal[['Power']].copy()
Xnew['Price_Prediction'] = prediction
Xnew['Price'] = dfFinal.Price
Xnew.head()

In [None]:
#--------------------------Boris-Experiment------------------------

In [1217]:
Xnew = dfFinal[['Power']].copy()
y = dfFinal.Price
y=y.values
Xnew['Ones1'] = np.ones(len(dfFinal))
Xnew['Ones2'] = np.ones(len(dfFinal))
Xnew=Xnew.values
#reg = LinearRegression()
#reg.fit(Xnew,y)
#reg.score(Xnew,y)
#prediction=reg.predict(Xnew)





In [1218]:
i = 0;
for j in range(len(Xnew)):
    if(math.isnan(Xnew[j][0])):
        i=i+1
print i

157


In [1219]:
dfCat = dfFinal.copy()
dfCat.head()

Unnamed: 0,CarID,Brand,Model,Mileage,Power,Fuel,Transmission,Registration,Damage,Seats,...,Bluetooth,Schiebedach,Anhängerkupplung,Freisprecheinrichtung,Sitzheizung,Scheckheftgepflegt,Sportsitze,Year,Decade,FuelStd
0,217996155,Peugeot,208,50.0,84.0,Diesel,Schaltgetriebe,07/2014,Unfallfrei,5.0,...,1,0,0,1,1,1,0,2014,2010,Diesel
1,201829863,Ford,B-Max,0.0,70.0,Diesel,Schaltgetriebe,12/2015,,,...,0,0,0,0,1,0,0,2015,2010,Diesel
2,233508386,Kia,Rio,6.0,80.0,"Benzin, E10-geeignet",Schaltgetriebe,06/2015,Unfallfrei,5.0,...,1,0,0,0,1,1,0,2015,2010,Benzin
3,217939783,Fiat,500,400.0,25.0,Benzin,Schaltgetriebe,07/1971,,,...,0,0,0,0,0,0,0,1971,1970,Benzin
4,231619930,Jaguar,E-Type,635.0,200.0,Benzin,Schaltgetriebe,07/1974,,,...,0,0,0,0,0,0,0,1974,1970,Benzin


In [1220]:
dfCat = dfCat.drop('CarID', 1)
dfCat = dfCat.drop('Registration', 1)
#dfCat = dfCat.drop('Year', 1)
dfCat = dfCat.drop('Fuel', 1)
dfCat = dfCat.drop('Decade', 1)

In [1221]:
yearnummer=dfCat['Year']
pd.to_numeric(yearnummer)
#dfCat = dfCat.drop('Year', 1)

0      2014.00
1      2015.00
2      2015.00
3      1971.00
4      1974.00
5      2015.00
6      2014.00
7      2015.00
8      2015.00
9      2015.00
10     2015.00
11     1935.00
12     2015.00
13     2015.00
14     2015.00
15     2015.00
16     2015.00
17     2015.00
18     1966.00
19     2015.00
20     2015.00
21     2015.00
22     2015.00
23     2015.00
24     2015.00
25     2015.00
26     2015.00
27     2015.00
28     2015.00
29     2015.00
         ...  
9769   2014.00
9770   2014.00
9771   2015.00
9772   2013.00
9773   2015.00
9774   2015.00
9775   2014.00
9776   2015.00
9777   2014.00
9778   2010.00
9779   2015.00
9780   1997.00
9781   2014.00
9782   2014.00
9783   2015.00
9784   2015.00
9785   2015.00
9786   2015.00
9787   2015.00
9788   2015.00
9789   2014.00
9790   2015.00
9791   2015.00
9792   2015.00
9793   1991.00
9794   2015.00
9795   2015.00
9796   2015.00
9797   2015.00
9798   2014.00
Name: Year, dtype: float64

### NaN operations

Now that the data is transformed and gathered, we have to deal with NaN values. there are two cases:

-Categorical data, where the NaN are transformed into a String ("missing") so they can be treated as a new category in each column.

-Cuantitative data, where the values will be averaged using the data that we already have.

In [1222]:
dfClean = dfCat.copy()
dfClean['Damage'] = dfClean['Damage'].fillna('missing')
dfClean['ParkAssist'] = dfClean['ParkAssist'].fillna('missing')
dfClean.ix[:,4:12] = dfClean.ix[:,4:12].fillna('missing')
dfClean['FuelStd'] = dfClean['FuelStd'].fillna('missing')
dfClean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9799 entries, 0 to 9798
Data columns (total 64 columns):
Brand                             9799 non-null object
Model                             9799 non-null object
Mileage                           9799 non-null float64
Power                             9642 non-null float64
Transmission                      9799 non-null object
Damage                            9799 non-null object
Seats                             9799 non-null object
Doors                             9799 non-null object
Climatisation                     9799 non-null object
Airbags                           9799 non-null object
Color                             9799 non-null object
Interior                          9799 non-null object
ParkAssist                        9799 non-null object
Price                             9799 non-null float64
Leichtmetallfelgen                9799 non-null int64
Elektr. Sitzeinstellung           9799 non-null int64
Tagfahrlicht

In [1223]:
#dfDrop = dfCat.dropna(axis=0)

#dfCat.Transmission[pd.isnull(dfCat.Transmission)]  = 'NaN'

#from sklearn import preprocessing
#import numpy as np
#le = preprocessing.LabelEncoder()

#le.fit(dfCat.Transmission)

#list(le.classes_)

#dfCat.Transmission = dfCat.Transmission.apply(le.transform)

In [1224]:
from sklearn.decomposition import *
def repair_mad(df, n_comp = 4, n_iter = 5, n_remove = None, n_rep = 1):
    num_feat = df.shape[1]
    size = df.shape[0]
    mads = [0.]*num_feat
    cnt = [0]*num_feat
    np.random.seed(191)
    if not n_remove:
        n_rep = 1
    for p in range(n_rep):
        df_prep = df.copy().astype(float)
        if n_remove:
            removed = []
            for i in range(n_remove):
                i = np.random.randint(0,size)
                j = np.random.randint(0,num_feat)
                val = df.iat[i,j]
                df_prep.iat[i,j] = np.nan
                removed.append([i,j,val])
        df_train = df_prep.fillna(df_prep.mean())
        # run PCA and reconstruct data set
        for i in range(n_iter):
            pca = PCA(n_components = n_comp).fit(df_train)
            df_pred = pca.inverse_transform(pca.transform(df_train))
            df_pred = pd.DataFrame(df_pred,columns=df.columns,index=df.index)
            df_train = df_prep.combine_first(df_pred)
        if n_remove:
            for pos in removed:
                diff = pos[2] - df_train.iat[pos[0],pos[1]]
                if not np.isnan(diff):
                    cnt[pos[1]] += 1
                    mads[pos[1]] += 1./cnt[pos[1]]*(abs(diff)-mads[pos[1]])
    if n_remove:
        print pd.DataFrame([mads],index=['MAD'],columns=df_train.columns)
    return df_train

In [1225]:
from sklearn.tree import *
from sklearn import cross_validation
from sklearn.tree import *
from sklearn.ensemble import *

In [1226]:
processedData=pd.get_dummies(dfClean)
#processedData=pd.get_dummies(dfCat)
processedData['Year']=yearnummer
repaired=repair_mad(processedData,n_iter=20,n_comp = 10,n_rep = 5)

Xnew = repaired.copy()
Xnew = Xnew.drop('Price', 1)
y = repaired.Price
y=y.values

Xnew['Ones1'] = np.ones(len(repaired))
Xnew['Ones2'] = np.ones(len(repaired))
Xnew=Xnew.values

In [1227]:
repaired.head()

Unnamed: 0,Mileage,Power,Price,Leichtmetallfelgen,Elektr. Sitzeinstellung,Tagfahrlicht,Xenonscheinwerfer,Start/Stopp-Automatik,Sportpaket,Behindertengerecht,...,FuelStd_Andere,FuelStd_Autogas (LPG),FuelStd_Benzin,FuelStd_Diesel,FuelStd_Elektro,FuelStd_Erdgas (CNG),FuelStd_Hybrid (Benzin/Elektro),FuelStd_Hybrid (Diesel / Elektro),FuelStd_missing,Year
0,50.0,84.0,14990.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2014.0
1,0.0,70.0,15785.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2015.0
2,6.0,80.0,13990.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
3,400.0,25.0,15900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1971.0
4,635.0,200.0,128500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1974.0


In [1228]:
repaired.corr()

Unnamed: 0,Mileage,Power,Price,Leichtmetallfelgen,Elektr. Sitzeinstellung,Tagfahrlicht,Xenonscheinwerfer,Start/Stopp-Automatik,Sportpaket,Behindertengerecht,...,FuelStd_Andere,FuelStd_Autogas (LPG),FuelStd_Benzin,FuelStd_Diesel,FuelStd_Elektro,FuelStd_Erdgas (CNG),FuelStd_Hybrid (Benzin/Elektro),FuelStd_Hybrid (Diesel / Elektro),FuelStd_missing,Year
Mileage,1.00,0.12,0.07,-0.02,0.03,-0.07,0.01,-0.04,0.06,0.03,...,-0.01,0.02,0.01,-0.03,0.08,0.02,0.00,0.00,0.02,-0.11
Power,0.12,1.00,0.36,0.12,0.25,-0.15,0.26,0.01,0.19,0.00,...,0.08,-0.00,-0.08,0.07,-0.05,-0.02,0.02,0.01,0.05,-0.20
Price,0.07,0.36,1.00,-0.07,0.05,-0.18,0.05,-0.08,0.06,0.00,...,0.01,-0.00,0.02,-0.04,-0.01,-0.01,-0.00,-0.00,0.13,-0.35
Leichtmetallfelgen,-0.02,0.12,-0.07,1.00,0.12,0.25,0.23,0.25,0.14,-0.01,...,-0.02,0.00,-0.11,0.13,-0.01,0.02,0.04,0.01,-0.11,0.31
Elektr. Sitzeinstellung,0.03,0.25,0.05,0.12,1.00,0.05,0.19,0.01,0.04,-0.01,...,-0.00,0.02,-0.09,0.09,-0.01,-0.01,0.06,-0.00,-0.00,0.04
Tagfahrlicht,-0.07,-0.15,-0.18,0.25,0.05,1.00,0.07,0.25,0.03,-0.01,...,-0.03,0.01,-0.03,0.06,-0.05,0.02,-0.01,0.01,-0.14,0.45
Xenonscheinwerfer,0.01,0.26,0.05,0.23,0.19,0.07,1.00,0.09,0.14,0.02,...,-0.01,0.01,-0.14,0.15,-0.03,-0.01,0.01,-0.00,-0.03,0.09
Start/Stopp-Automatik,-0.04,0.01,-0.08,0.25,0.01,0.25,0.09,1.00,0.02,-0.01,...,-0.02,-0.00,-0.15,0.18,-0.05,0.03,-0.02,0.01,-0.07,0.24
Sportpaket,0.06,0.19,0.06,0.14,0.04,0.03,0.14,0.02,1.00,-0.01,...,0.02,-0.00,0.03,-0.03,-0.02,-0.01,-0.02,-0.00,-0.00,-0.01
Behindertengerecht,0.03,0.00,0.00,-0.01,-0.01,-0.01,0.02,-0.01,-0.01,1.00,...,-0.00,-0.00,-0.00,0.00,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00


In [1229]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(Xnew, y, random_state=20)

reg=DecisionTreeRegressor(max_depth=8)
reg.fit(X_train,y_train)
reg.score(X_test,y_test)
#prediction=reg.predict(Xnew)

0.20007311248565862

In [1230]:
reg = RandomForestRegressor(n_estimators=25, oob_score=True)
reg.fit(Xnew,y)
reg.score(Xnew,y),reg.oob_score_

(0.8995975611746051, 0.34765048949132571)

In [1231]:
reg=LinearRegression()
reg.fit(Xnew,y)
reg.score(Xnew,y)

0.76492988571099962

In [None]:
oobScores=[]
for i in range(1,100):
    reg = RandomForestRegressor(n_estimators=i, oob_score=True)
    reg.fit(Xnew,y)
    arrayToAppend=[]
    arrayToAppend.append(reg.oob_score_)
    arrayToAppend.append(i)
    oobScores.append(reg.oob_score_)
    print i
pd.DataFrame([oobScores],index=['original']).transpose().plot(figsize=(16,4))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


NameError: name 'repaired' is not defined