In [1]:
import pandas as pd 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
# Load both CSVs from the working directory. Per the column listings printed
# further down, test.csv carries an `Id` column and no `price`, while train.csv
# carries the `price` target.
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
print(train_df.describe())

        dateCrawled          name        seller  offerType         price  \
count  70000.000000  70000.000000  70000.000000    70000.0  70000.000000   
mean   37664.727486  25533.298743      1.999986        1.0   4388.069929   
std    21759.727731  14884.198247      0.003780        0.0   3469.881904   
min        1.000000      2.000000      1.000000        1.0    400.000000   
25%    18771.750000  12543.000000      2.000000        1.0   1500.000000   
50%    37701.500000  25660.500000      2.000000        1.0   3300.000000   
75%    56551.250000  38846.250000      2.000000        1.0   6500.000000   
max    75331.000000  51408.000000      2.000000        1.0  13900.000000   

             abtest   vehicleType  yearOfRegistration       gearbox  \
count  70000.000000  70000.000000        70000.000000  70000.000000   
mean       1.519700      5.409314         2002.662029      1.809514   
std        0.499615      1.674844            5.809177      0.392687   
min        1.000000      1.0000

In [4]:
# Same summary statistics for the test set, to compare its ranges with the training set.
print(test_df.describe())

        dateCrawled          name   seller  offerType       abtest  \
count  10000.000000  10000.000000  10000.0    10000.0  10000.00000   
mean   37766.438200  25532.947500      2.0        1.0      1.51910   
std    21697.675399  14969.911908      0.0        0.0      0.49966   
min        6.000000      1.000000      2.0        1.0      1.00000   
25%    19186.250000  12572.500000      2.0        1.0      1.00000   
50%    37721.000000  25706.500000      2.0        1.0      2.00000   
75%    56275.750000  39171.000000      2.0        1.0      2.00000   
max    75324.000000  51404.000000      2.0        1.0      2.00000   

        vehicleType  yearOfRegistration       gearbox       powerPS  \
count  10000.000000        10000.000000  10000.000000  10000.000000   
mean       5.389200         2002.684800      1.812900    115.489000   
std        1.667154            5.773472      0.390011    101.716352   
min        1.000000         1960.000000      1.000000      0.000000   
25%        5.0

In [5]:
# List the columns of each frame (test first, then train) to see where they differ.
print(test_df.columns)
print(train_df.columns)

Index(['dateCrawled', 'name', 'seller', 'offerType', 'abtest', 'vehicleType',
       'yearOfRegistration', 'gearbox', 'powerPS', 'model', 'kilometer',
       'monthOfRegistration', 'fuelType', 'brand', 'notRepairedDamage',
       'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen', 'Id'],
      dtype='object')
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')


In [6]:
# Features are every training column except the target; the target is `price`.
x = train_df.drop(columns='price')
y = train_df.price 

def mae(max_nodes, Xtrain, Xtest, Ytrain, Ytest):
    """Fit a leaf-limited decision tree and score it on held-out data.

    Args:
        max_nodes: Value passed as ``max_leaf_nodes`` to the tree.
        Xtrain: Features the tree is fitted on.
        Xtest: Features the fitted tree predicts on.
        Ytrain: Targets the tree is fitted on.
        Ytest: True targets compared against the predictions.

    Returns:
        The mean absolute error between ``Ytest`` and the tree's predictions
        for ``Xtest``.
    """
    # Fixed seed so repeated calls with the same arguments build the same tree.
    tree = DecisionTreeRegressor(random_state=1, max_leaf_nodes=max_nodes)
    tree.fit(Xtrain, Ytrain)
    return mean_absolute_error(Ytest, tree.predict(Xtest))


# Split the training data into a fitting part and a validation part, using
# train_test_split's default split size and a fixed seed for reproducibility.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x,y,random_state=1)

In [7]:
# Sweep the tree size and report the validation MAE for each setting.
# Note: the second %d prints only the integer part of the MAE.
leaf_node_options = (25, 50, 100, 500, 1000, 2000, 5000)
report_line = "Max Leaf Nodes: %d  \t\t Mean Absolute Error: %d"
for nodes in leaf_node_options:
    result = mae(nodes, Xtrain, Xtest, Ytrain, Ytest)
    print(report_line % (nodes, result))


Max Leaf Nodes: 25  		 Mean Absolute Error: 1499
Max Leaf Nodes: 50  		 Mean Absolute Error: 1423
Max Leaf Nodes: 100  		 Mean Absolute Error: 1357
Max Leaf Nodes: 500  		 Mean Absolute Error: 1263
Max Leaf Nodes: 1000  		 Mean Absolute Error: 1253
Max Leaf Nodes: 2000  		 Mean Absolute Error: 1265
Max Leaf Nodes: 5000  		 Mean Absolute Error: 1328


In [8]:
# Running the Random Forest model on a train/validation split made with the
# same seed as the decision-tree experiment.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, random_state=1)

# fit() returns the estimator itself, so construction and fitting can be chained.
randomForest = RandomForestRegressor(random_state=1).fit(Xtrain, Ytrain)
prediction = randomForest.predict(Xtest)
randForest_mae = mean_absolute_error(y_true=Ytest, y_pred=prediction)

print("Mean Absolute Error for Random Forest: {}".format(randForest_mae))

Mean Absolute Error for Random Forest: 1058.949572


The Random Forest achieves a lower validation MAE (about 1059) than the best Decision Tree (about 1253, at 1000 leaf nodes), so the final predictions use the Random Forest.

In [9]:
# Refit the Random Forest on the entire training set for the final output.

# Training inputs: every column except the target, and the target itself.
x_final = train_df.drop(columns='price')
y_final = train_df['price']

# Test inputs: keep the ids aside for the output file and drop them from the features.
ids_stored = test_df['Id']
x_test = test_df.drop(columns='Id')

RandomForestModel = RandomForestRegressor(random_state=1)
RandomForestModel.fit(x_final, y_final)

RandomForestRegressor(random_state=1)

In [10]:
# Predict a price for every test row with the model fitted on the full training set.
predicted_output = RandomForestModel.predict(x_test)
predicted_output

array([2039.02, 2501.19, 7796.17, ..., 2540.86, 2687.18, 3396.51])

In [11]:
# Wrap the raw prediction array in a single-column frame named 'Predicted'.
output = pd.DataFrame(predicted_output, columns=['Predicted'])
output

Unnamed: 0,Predicted
0,2039.02
1,2501.19
2,7796.17
3,3592.66
4,964.30
...,...
9995,1649.15
9996,9877.91
9997,2540.86
9998,2687.18


In [12]:
# Cast the stored test-set ids to int64 and show them before and after the cast.
ids_int = ids_stored.astype('int64')
print(ids_stored)
print(ids_int)

# Attach the ids positionally: row i of `output` is the prediction for row i of
# the test set. Assigning the Series itself would go through pandas index
# alignment, which silently produces NaN ids if the two indexes ever differ.
output['Id'] = ids_int.to_numpy()
print(output)

0           1
1           2
2           3
3           4
4           5
        ...  
9995     9996
9996     9997
9997     9998
9998     9999
9999    10000
Name: Id, Length: 10000, dtype: int64
0           1
1           2
2           3
3           4
4           5
        ...  
9995     9996
9996     9997
9997     9998
9998     9999
9999    10000
Name: Id, Length: 10000, dtype: int64
      Predicted     Id
0       2039.02      1
1       2501.19      2
2       7796.17      3
3       3592.66      4
4        964.30      5
...         ...    ...
9995    1649.15   9996
9996    9877.91   9997
9997    2540.86   9998
9998    2687.18   9999
9999    3396.51  10000

[10000 rows x 2 columns]


In [13]:
# Use the test ids as the row index so they become the first column of the CSV.
# NOTE(review): this rebinds `output`; re-running the cell on its own appears to
# fail because 'Id' has already been moved from the columns into the index.
output = output.set_index('Id')
output

Unnamed: 0_level_0,Predicted
Id,Unnamed: 1_level_1
1,2039.02
2,2501.19
3,7796.17
4,3592.66
5,964.30
...,...
9996,1649.15
9997,9877.91
9998,2540.86
9999,2687.18


In [14]:
# Write the Id-indexed predictions of the all-features model to disk.
output.to_csv('intro_predictions.csv')

In [15]:
# Improving our model: inspect which features the validation Random Forest
# relied on most.
feature_weight = randomForest.feature_importances_
# Label the importances with the columns of the frame `randomForest` was
# actually fitted on (Xtrain), so names and weights cannot drift apart if
# another frame's columns change.
cols = Xtrain.columns
weight_df = pd.DataFrame({'features': cols, 'weight': feature_weight})
weight_df

Unnamed: 0,features,weight
0,dateCrawled,0.02007973
1,name,0.04939517
2,seller,1.178925e-07
3,offerType,0.0
4,abtest,0.003181222
5,vehicleType,0.03084394
6,yearOfRegistration,0.4951389
7,gearbox,0.004180452
8,powerPS,0.1914273
9,model,0.03173313


In [16]:
# Order the features from most to least important.
weight_df = weight_df.sort_values(by = 'weight', ascending = False)
weight_df

Unnamed: 0,features,weight
6,yearOfRegistration,0.4951389
8,powerPS,0.1914273
1,name,0.04939517
18,lastSeen,0.03333535
10,kilometer,0.03201339
9,model,0.03173313
13,brand,0.03085211
5,vehicleType,0.03084394
17,postalCode,0.0258417
0,dateCrawled,0.02007973


In [17]:
# Drop the 2 least important features, which have little impact on the
# prediction. Relies on `weight_df` already being sorted by descending weight.
# Slicing to a fixed row count, rather than dropping the last two rows of
# whatever `weight_df` currently holds, keeps this cell idempotent: re-running
# it no longer removes two more features each time.
n_features_kept = len(cols) - 2
weight_df = weight_df.iloc[:n_features_kept]
weight_df

Unnamed: 0,features,weight
6,yearOfRegistration,0.4951389
8,powerPS,0.1914273
1,name,0.04939517
18,lastSeen,0.03333535
10,kilometer,0.03201339
9,model,0.03173313
13,brand,0.03085211
5,vehicleType,0.03084394
17,postalCode,0.0258417
0,dateCrawled,0.02007973


In [18]:
# Names of the retained features, in the current row order of `weight_df`.
imp_features = weight_df['features'].tolist()
imp_features

['yearOfRegistration',
 'powerPS',
 'name',
 'lastSeen',
 'kilometer',
 'model',
 'brand',
 'vehicleType',
 'postalCode',
 'dateCrawled',
 'monthOfRegistration',
 'notRepairedDamage',
 'dateCreated',
 'fuelType',
 'gearbox',
 'abtest',
 'seller']

In [19]:
# Show the full training column list for comparison with the retained features.
train_df.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

In [None]:
# Refit the Random Forest on the full training set, restricted to the selected
# features, with the split criterion set to absolute error.
# NOTE(review): this rebinds `RandomForestModel`, so the earlier all-features
# model is no longer reachable under that name once this cell has run.

x_final_new = x_final[imp_features]
RandomForestModel = RandomForestRegressor(criterion = 'absolute_error', random_state=1)
RandomForestModel.fit(x_final_new, y_final)

In [None]:
# Predict on the test set using only the selected features, in the same column
# order the model was fitted with.
x_test_new = x_test[imp_features]
predicted_output = RandomForestModel.predict(x_test_new)

# Build the output frame in one step, indexed by test id. The ids are attached
# positionally (row i of the predictions belongs to row i of the test set)
# instead of through pandas index alignment, which would silently yield NaN ids
# if the two indexes ever differed.
output = pd.DataFrame(
    {'Predicted': predicted_output},
    index=pd.Index(ids_int.to_numpy(), name='Id'),
)
output

In [None]:
# Write the Id-indexed predictions of the reduced-feature model to a separate file.
output.to_csv('intro_predictions_new.csv')