# Import libraries

In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Read the data

In [29]:
df_train = pd.read_csv('data/Train.csv')
df_test = pd.read_csv('data/test.csv')

# Preprocess & Feature Engineer

In [30]:
def preprocess(df):
    """Add the derived ``sale_year`` and ``age`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``saledate`` column that ``pd.to_datetime`` can parse
        and a numeric ``YearMade`` column.

    Returns
    -------
    None
        ``df`` itself is modified; nothing is returned, so calling this as the
        last statement of a cell displays no output.
    """
    # Vectorised year extraction via the .dt accessor instead of a
    # per-element Python lambda.
    df['sale_year'] = pd.to_datetime(df['saledate']).dt.year
    # Age at sale time: year of sale minus year of manufacture.
    df['age'] = df['sale_year'] - df['YearMade']

# preprocess() adds its columns to the frame it is given and returns None,
# so both frames are updated in place and this cell displays no output.
preprocess(df_train)
preprocess(df_test)

In [31]:
def preprocess2(df, categories=('High', 'Low', 'Medium')):
    """Return a new frame with one indicator column per ``UsageBand`` level.

    Plain ``pd.get_dummies`` only creates columns for levels that occur in the
    frame it is given, so a frame lacking one level would come back with fewer
    columns than another. Here every level in ``categories`` always gets a
    column (all zeros when the level is absent), so frames processed
    separately end up with the same indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``UsageBand`` column. It is not modified.
    categories : iterable, optional
        Levels that must always get an indicator column. Any other non-null
        value found in ``UsageBand`` gets a column as well.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the indicator columns, in sorted label order.
        Rows whose ``UsageBand`` is missing are zero in every indicator.

    Notes
    -----
    Columns of ``df`` that already carry one of the indicator labels are
    replaced rather than duplicated, so ``df = preprocess2(df)`` can be
    re-run safely.
    """
    observed = df['UsageBand'].dropna().unique()
    # Sorted union keeps the alphabetical column order get_dummies produces
    # by default while guaranteeing the expected levels are present.
    levels = sorted(set(categories) | set(observed))

    usage = pd.Series(
        pd.Categorical(df['UsageBand'], categories=levels),
        index=df.index,
    )
    # A categorical input makes get_dummies emit a column for every declared
    # category, including ones with no rows.
    dummies = pd.get_dummies(usage)
    # Use plain labels so the concatenated frame has an ordinary column index.
    dummies.columns = list(dummies.columns)

    # Remove indicator columns left by an earlier run of this function.
    stale = [label for label in levels if label in df.columns]
    base = df.drop(columns=stale)
    return pd.concat([base, dummies], axis=1)

# preprocess2 returns a new frame, so rebind both names to the results.
df_train, df_test = map(preprocess2, (df_train, df_test))

# Train a Random Forest on only a few good features

In [42]:
# Columns fed to the model. Also tried and then left out:
# 'MachineHoursCurrentMeter', 'datasource', 'Low', 'Medium', 'High'.
features = ['ModelID', 'YearMade', 'age']

# Missing feature values are replaced with the sentinel -1 before fitting.
X_train, y_train = df_train[features].fillna(-1), df_train['SalePrice']

# A fixed random_state makes the forest, and therefore the predictions,
# reproducible when the notebook is re-run from a fresh kernel.
model = RandomForestRegressor(n_estimators=50, n_jobs=2, random_state=42)
model.fit(X_train, y_train)

# The test features get the same -1 fill as the training features.
y_prediction = model.predict(df_test[features].fillna(-1))

# Output our answers

In [43]:
# One predicted SalePrice per test row, indexed by that row's SalesID.
df_output = pd.Series(
    y_prediction,
    index=df_test['SalesID'],
    name='SalePrice',
).to_frame()
df_output.to_csv('answers.csv')

# Notes


|    features    | root mean squared log error |
|---------|--------|
| 'ModelID'    |      0.320080771128  |
| 'ModelID', 'YearMade'  |     0.313296967127   |
| 'ModelID', 'YearMade', 'age'    |      0.282919500352  |

Adding more features beyond these three seems to just add noise...

In [44]:
# How many training rows each ModelID has, most frequent first.
df_train.loc[:, 'ModelID'].value_counts()

4605     5039
3538     4869
3170     4315
4604     4233
3362     4083
3537     3701
3171     3442
4603     3402
3357     3216
3178     3139
9550     2848
3112     2629
3854     2486
4579     2453
3542     2434
4147     2405
6788     2379
7277     2266
9551     2066
4146     2010
7110     1988
4124     1915
4123     1851
3877     1839
7057     1823
7008     1782
1169     1685
6633     1603
23931    1596
4991     1555
         ... 
12734       1
15547       1
21426       1
14115       1
28750       1
28019       1
16319       1
16561       1
25385       1
25013       1
15225       1
25458       1
22901       1
22830       1
24873       1
10528       1
22391       1
26550       1
23883       1
15994       1
36010       1
15548       1
14114       1
36266       1
3770        1
36460       1
28598       1
15907       1
25264       1
17288       1
Name: ModelID, dtype: int64