In [24]:
# Supress unnecessary warnings so that presentation looks clean
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

print ("Reading dataset...")
#Read the train dataset
dataset = pd.read_csv('../input/trainv4.csv')
#Read test dataset
dataset_test = pd.read_csv('../input/testv4.csv')

print ("Length of dataset = " + str(len(dataset)))

## cat117 is cont 2
## cat 118 is High vs. Low
print ("Factorizing categorical variables...")
features = dataset.columns
cats = [feat for feat in features if 'cat' in feat]
## print cats
for feat in cats:
    dataset[feat] = pd.factorize(dataset[feat], sort=True)[0]
    dataset_test[feat] = pd.factorize(dataset_test[feat], sort=True)[0]
print ("Finished loading and factorized data.")

Reading dataset...
Length of dataset = 188318
Factorizing categorical variables...
Finished loading and factorized data.


In [25]:
print ("Preprocessing the Data...")
## Response
shift = 200
response = np.log(dataset['loss'].values + shift)

## Drop the response from our dataset
dataset = dataset.drop(['loss', 'id'], 1)

print "Responses:"
print response

Preprocessing the Data...
Responses:
[ 7.78870066  7.30222685  8.07249545 ...,  8.69326862  7.47469844
  8.50749027]


In [16]:
print("-")*50
print "Random Forest Algo with KFold"
print("-")*50
#Evaluation of various combinations of RandomForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import cross_validation
from sklearn.cross_validation import KFold

print ("Creating Training and Validation sets...")
#get the number of rows and columns

seed = 0

## CODE FOR 1-FOLD VALIDATION
## FOR REFERENCE:
##X_train is 90% training features
##Y_train is 90% training responses

##X_val is the 10% validation features
##Y_val is the 10% validation responses

#X_train, X_val, Y_train, Y_val = cross_validation.train_test_split(dataset, response, test_size=.1, random_state=seed)
#print "Length of Validation Set:" + str(len(Y_val))
#print "Length of Train Set:" + str(len(Y_train))
#print "Ratio: " + str(len(Y_val) / float(len(Y_train)))


## K-Fold Validation
print ("Creating K-fold validation dataset indices")
n_folds = 10
kf = KFold(dataset.shape[0], n_folds=n_folds, shuffle = True)
pred_test = 0
temp_cv_score = []
cv_loss = pd.DataFrame(columns=["id","loss"])

for i, (train_index, test_index) in enumerate(kf):
    print "-" * 80
    print('\n Fold %d' % (i + 1))

    X_train, X_val = dataset.iloc[train_index], dataset.iloc[test_index]
    Y_train, Y_val = pd.DataFrame(response).iloc[train_index], pd.DataFrame(response).iloc[test_index]
    print "Training size: " + str(len(X_train))
    print "Validation size: " + str(len(X_val))
    print "Total size: " + str(len(X_train) + len(X_val))
    
    seed = 0
    print "Training Random Forest Model..."
    ## n_jobs=-1: Use all cores
    ## n_estimators: Create random forest of 50 trees
    model = RandomForestRegressor(n_jobs=-1,n_estimators=50,random_state=seed)
    model.fit(X_train, Y_train)
    
    results = np.exp(model.predict(X_val)) - 200
    print pd.DataFrame(results)
    pd.DataFrame(results).to_csv("results.csv", index=False)
    
    result = mean_absolute_error(np.exp(Y_val), np.exp(model.predict(X_val)))
    print result
    
    #pred_cv = np.exp(model.predict(xgtrain_2, ntree_limit=model.best_ntree_limit)) - shift
    #pred_test += np.exp(model.predict(xgtest, ntree_limit=model.best_ntree_limit)) - shift
    
    #cv_loss = pd.concat([cv_loss, pd.DataFrame({"id": ids_train[test_index], "loss": pred_cv})])
    #print ('\n Fold %d' % (i + 1) + ' score: ' + str(temp_cv_score[i]))




--------------------------------------------------
Random Forest Algo
--------------------------------------------------
Creating Training and Validation sets...
Creating K-fold validation dataset indices
--------------------------------------------------------------------------------

 Fold 1
Training size: 169486
Validation size: 18832
Total size: 188318
Training Random Forest Model...
                  0
0       2099.085924
1       5773.024756
2       5827.145526
3       5122.080744
4       1232.075923
5       1174.225752
6       1553.517917
7       1572.171319
8      11029.385823
9       1419.701414
10      2943.128782
11      3019.107012
12      1576.751746
13      1531.697919
14      1042.390909
15      2343.188155
16      1164.227849
17      2582.758419
18      4534.713080
19      4178.795570
20       867.635856
21      2838.431772
22      1621.035076
23      1671.608249
24      2279.997800
25      3190.071510
26      1744.246875
27      2087.870691
28      4210.665939
29      1

In [44]:
print("-")*50
print "Random Forest Algo with 1 Fold Validation"
print("-")*50
#Evaluation of various combinations of RandomForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import cross_validation
from sklearn.cross_validation import KFold

print ("Creating Training and Validation sets...")
#get the number of rows and columns

seed = 0

## CODE FOR 1-FOLD VALIDATION
## FOR REFERENCE:
#X_train is 90% training features
#Y_train is 90% training responses

#X_val is the 10% validation features
#Y_val is the 10% validation responses

X_train, X_val, Y_train, Y_val = cross_validation.train_test_split(dataset, response, test_size=.1, random_state=seed)
print "Length of Validation Set:" + str(len(Y_val))
print "Length of Train Set:" + str(len(Y_train))
print "Ratio: " + str(len(Y_val) / float(len(Y_train)))

# X_train, X_val = dataset.iloc[train_index], dataset.iloc[test_index]
# Y_train, Y_val = pd.DataFrame(response).iloc[train_index], pd.DataFrame(response).iloc[test_index]
# print "Training size: " + str(len(X_train))
# print "Validation size: " + str(len(X_val))
# print "Total size: " + str(len(X_train) + len(X_val))
    
seed = 0
print "Training Random Forest Model..."
## n_jobs=-1: Use all cores
## n_estimators: Create random forest of 50 trees
model = RandomForestRegressor(n_jobs=-1,n_estimators=50,random_state=seed)
model.fit(X_train, Y_train)
    
results = np.exp(model.predict(X_val)) - 200
print pd.DataFrame(results)
# pd.DataFrame(results).to_csv("results.csv", index=False)
    
result = mean_absolute_error(np.exp(Y_val), np.exp(model.predict(X_val)))
print result



--------------------------------------------------
Random Forest Algo with 1 Fold Validation
--------------------------------------------------
Creating Training and Validation sets...
Length of Validation Set:18832
Length of Train Set:169486
Ratio: 0.111112422265
Training Random Forest Model...
                 0
0      2263.837190
1      1587.689686
2      2266.394814
3       802.177775
4      1359.087911
5      7156.459036
6      2236.207757
7       396.107984
8      1860.375611
9      1676.215522
10     1512.157742
11      755.584314
12     2949.846929
13     6384.913769
14     4679.451722
15     1735.092396
16     5300.755750
17     3294.023014
18     1260.103092
19     3700.469004
20     4257.938740
21     2925.710972
22     4192.299493
23     2998.320709
24     1686.947203
25     2550.887791
26      665.775373
27     5539.480664
28     1772.818627
29     1850.492140
...            ...
18802  1655.082751
18803  1076.742632
18804  1858.913327
18805  1306.666989
18806  4119.335515


In [43]:


model.predict(X_val)

result = mean_absolute_error(np.exp(Y_val), np.exp(model.predict(X_val)))
print result


1199.52252619


In [39]:
print(np.max(np.exp(Y_val)))
print(np.max(np.exp(model.predict(X_val))))

print(np.min(np.exp(Y_val)))
print(np.min(np.exp(model.predict(X_val))))

79823.52
74284.6499438
208.4
212.468673402


In [45]:
predictions = np.exp(model.predict(dataset_test.drop(['id'],1))) - shift
pd.DataFrame(test_results).to_csv("test_results.csv", index=False)

In [46]:
final = pd.DataFrame({"id": dataset_test['id'], "loss": predictions})
print final
final.to_csv("predictions.csv", index=False)

            id          loss
0            4   1564.566853
1            6   1867.538184
2            9  17609.892638
3           12   8705.106719
4           15    667.158580
5           17   4467.462705
6           21   1587.517670
7           28   1082.680701
8           32   3196.317715
9           43   8423.991231
10          46   3785.479402
11          50   1257.128050
12          54   1015.295290
13          62   1819.752481
14          70   2082.138470
15          71   9916.939520
16          75   3882.482637
17          77   2714.188691
18          81   5479.230856
19          83   2339.194765
20          87   2300.101534
21          97   1769.455781
22         103   1392.603501
23         119   1318.940507
24         120   1837.763600
25         127   1138.647078
26         138   3378.242777
27         141   2632.653634
28         148    952.278967
29         150   3253.852208
...        ...           ...
125516  587482   1396.866419
125517  587484   4719.304256
125518  587489