In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
port_train_df = pd.read_csv('train.csv')
port_test_df = pd.read_csv('test.csv')

In [37]:

all_data = pd.concat((port_train_df.loc[:,'portfolio_id':'status'],
                      port_test_df.loc[:,'portfolio_id':'status']))

In [38]:
from datetime import datetime

all_data['start_date']=all_data['start_date'].apply(lambda x: "{0:-08d}".format(x))

all_data['start_date']=all_data['start_date'].apply(lambda x: datetime.strptime(x,'%Y%m%d'))

In [39]:
all_data['sell_date']=all_data['sell_date'].apply(lambda x: "{0:-08d}".format(x))
all_data['sell_date']=all_data['sell_date'].apply(lambda x: datetime.strptime(x,'%Y%m%d'))

In [40]:

all_data['creation_date']=all_data['creation_date'].apply(lambda x: "{0:-08d}".format(x))
all_data['creation_date']=all_data['creation_date'].apply(lambda x: datetime.strptime(x,'%Y%m%d'))

In [41]:
from scipy.stats import skew
#get all the columns with numeric data types
numeric_feats =all_data.dtypes[(all_data.dtypes!='object') & (all_data.dtypes!='datetime64[ns]')].index

print numeric_feats

Index([u'sold', u'euribor_rate', u'libor_rate', u'bought'], dtype='object')


In [42]:
port_train_df["sold"] = np.log1p(port_train_df["sold"])
port_train_df["bought"]=np.log1p(port_train_df["bought"])
skewed_feats = port_train_df[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness

In [43]:
skewed_feats

sold           -4.774069
euribor_rate    1.626369
libor_rate      1.695345
bought         -4.778083
dtype: float64

In [44]:
all_data[numeric_feats] = np.log1p(all_data[numeric_feats])

In [45]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [46]:
objcolname=['pf_category','currency','type','country_code']
for col in objcolname:
    all_data[col]=le.fit_transform(all_data[col])

In [47]:
all_data["libor_rate"]=all_data["libor_rate"].fillna(all_data["libor_rate"].median())

#fill status missing value
all_data["status"]=all_data["status"].fillna(all_data["status"].median())

#fill missing value for indicator code
all_data["indicator_code"]=all_data["indicator_code"].fillna(all_data["indicator_code"].median())

#fill missing value for bought
all_data["bought"]=all_data["bought"].fillna(all_data["bought"].median())

all_data["hedge_value"]=all_data["hedge_value"].fillna(all_data["hedge_value"].median())

all_data["sold"].fillna(all_data["sold"].median(), inplace=True)


In [48]:
all_data['hedge_value']=all_data.hedge_value.astype(int)
all_data['indicator_code']=all_data.indicator_code.astype(int)
all_data['status']=all_data.status.astype(int)

In [49]:

all_data.dtypes

portfolio_id              object
desk_id                   object
office_id                 object
pf_category                int64
start_date        datetime64[ns]
sold                     float64
country_code               int64
euribor_rate             float64
currency                   int64
libor_rate               float64
bought                   float64
creation_date     datetime64[ns]
indicator_code             int32
sell_date         datetime64[ns]
type                       int64
hedge_value                int32
status                     int32
dtype: object

In [58]:
#all the data has been converted into datetime now we can calculate the duration

all_data['duration']=(all_data['sell_date']-all_data['start_date']).apply(lambda x:x.days)

In [59]:
all_data['duration'].head()

0    23
1    34
2    25
3    34
4    34
Name: duration, dtype: int64

In [56]:
data = all_data.ix[all_data['duration'] < 0]

In [57]:
data.shape

(154, 18)

In [60]:
all_data['duration']

0       23
1       34
2       25
3       34
4       34
5       34
6       34
7       34
8       92
9       90
10      35
11      34
12      34
13      31
14      93
15      34
16      34
17      34
18      38
19      34
20      34
21      35
22      32
23      34
24      96
25      34
26      34
27      34
28      34
29      32
        ..
4771    30
4772     7
4773    90
4774    30
4775    30
4776    33
4777    31
4778     7
4779     7
4780     7
4781     7
4782    63
4783     7
4784     3
4785    90
4786    33
4787    32
4788     7
4789    91
4790    34
4791    91
4792     7
4793    91
4794     7
4795    32
4796     7
4797     7
4798     7
4799     7
4800     7
Name: duration, Length: 14167, dtype: int64

In [61]:
all_data.dtypes

portfolio_id              object
desk_id                   object
office_id                 object
pf_category                int64
start_date        datetime64[ns]
sold                     float64
country_code               int64
euribor_rate             float64
currency                   int64
libor_rate               float64
bought                   float64
creation_date     datetime64[ns]
indicator_code             int32
sell_date         datetime64[ns]
type                       int64
hedge_value                int32
status                     int32
duration                   int64
dtype: object

In [62]:
all_new_data=all_data.drop(['start_date', 'creation_date', 'sell_date', 'desk_id','portfolio_id','office_id'],axis=1)

In [63]:
all_new_data.head(10)

Unnamed: 0,pf_category,sold,country_code,euribor_rate,currency,libor_rate,bought,indicator_code,type,hedge_value,status,duration
0,1,2.971234,2,0.020528,4,1.203638,2.971145,1,1,0,1,23
1,0,2.995222,1,0.020528,2,1.835715,2.995034,1,2,0,1,34
2,0,2.936475,2,0.020528,4,1.203638,2.936386,1,0,0,1,25
3,0,2.99174,2,0.020528,4,1.203638,2.991648,1,0,0,1,34
4,1,2.983804,2,0.020528,4,1.203638,2.98371,1,1,0,1,34
5,0,2.928632,2,0.020528,4,1.203638,2.928534,1,0,0,1,34
6,0,2.940204,2,0.020528,4,1.203638,2.940106,1,0,0,1,34
7,1,2.981372,2,0.020528,4,1.203638,2.981278,1,1,0,1,34
8,0,2.956653,2,0.020528,4,1.203638,2.956387,1,0,0,1,92
9,0,3.004411,2,0.020528,4,1.203638,3.004163,1,0,0,1,90


In [64]:
X_train = all_new_data[:port_train_df.shape[0]]

In [65]:
X_train.shape

(9366, 12)

In [66]:
y=port_train_df[['return']]

In [67]:
X_test=all_new_data[:port_test_df.shape[0]]

In [68]:
X_test.shape

(4801, 12)

In [69]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(random_state=7)
scores = cross_val_score(forest_reg, X_train, y, scoring='r2', cv=5)
print(scores)
print('mean r2:',np.mean(scores))

  estimator.fit(X_train, y_train, **fit_params)


[-0.85285656  0.9227486   0.87477596  0.40113382  0.8357438 ]
('mean r2:', 0.43630912485291962)


In [70]:
from IPython.display import FileLink

forest_reg = RandomForestRegressor(random_state=7)
forest_reg.fit(X_train, y)


  after removing the cwd from sys.path.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=7,
           verbose=0, warm_start=False)

In [71]:
preds = forest_reg.predict(X_test)

sub = pd.DataFrame({'portfolio_id': port_test_df["portfolio_id"], 'return': preds})
filename = 'sub_returns.csv'
sub.to_csv(filename, index=False)
FileLink(filename)  # lb 0.94277

In [72]:
#apply decision tree with gridsearch

In [73]:
from sklearn.grid_search import GridSearchCV
from operator import itemgetter #get the item from tuple record 



In [90]:
# set of parameters to test
param_grid = {"min_samples_split": [2, 10, 20],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [2, 5, 7],
              "max_leaf_nodes": [None, 5, 10, 20],
              }

In [91]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor(max_depth = 5,min_samples_leaf=5)


In [92]:
from sklearn import model_selection
from sklearn.ensemble import BaggingRegressor
kfold = model_selection.KFold(n_splits=10, random_state=7)

bs_dt_model = BaggingRegressor(base_estimator=dt_model, n_estimators=100, random_state=7)
results = model_selection.cross_val_score(bs_dt_model, X_train, y, cv=kfold)
print(results.mean())

0.58814641616


In [94]:
bs_dt_model.fit(X_train,y)
preds = bs_dt_model.predict(X_test)

sub = pd.DataFrame({'portfolio_id': port_test_df["portfolio_id"], 'return': preds})
filename = 'sub_returns.csv'
sub.to_csv(filename, index=False)
FileLink(filename)  # lb 0.94277

In [95]:
dt_grid = DecisionTreeRegressor()

In [96]:
 gs_dt = GridSearchCV(dt_grid,
                               param_grid=param_grid,
                               cv=10)

In [97]:
gs_dt.fit(X_train, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 10, 20], 'max_leaf_nodes': [None, 5, 10, 20], 'max_depth': [None, 2, 5, 10], 'min_samples_leaf': [2, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [99]:
#print the max scores from the gird search

#sort the score based on the performance of the model
max_scores = sorted(gs_dt.grid_scores_,
                        key=itemgetter(1),
                        reverse=True)[:3]

#print the parameter of respective performance
for i, score in enumerate(max_scores):
        print("Model Rank: {0}".format(i + 1))
        print(("Mean validation score: "
               "{0:.3f} (std: {1:.3f})").format(
               score.mean_validation_score,
               np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

Model Rank: 1
Mean validation score: 0.595 (std: 0.549)
Parameters: {'min_samples_split': 20, 'max_leaf_nodes': None, 'max_depth': None, 'min_samples_leaf': 5}

Model Rank: 2
Mean validation score: 0.588 (std: 0.545)
Parameters: {'min_samples_split': 20, 'max_leaf_nodes': None, 'max_depth': 10, 'min_samples_leaf': 5}

Model Rank: 3
Mean validation score: 0.582 (std: 0.547)
Parameters: {'min_samples_split': 2, 'max_leaf_nodes': None, 'max_depth': None, 'min_samples_leaf': 7}



In [100]:
#based on gridsearch 
dt_model = DecisionTreeRegressor(min_samples_split = 5,min_samples_leaf=5)


In [101]:
dt_model.fit(X_train,y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=5, min_samples_split=5,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [102]:
dt_model.fit(X_train,y)
preds = bs_dt_model.predict(X_test)

sub = pd.DataFrame({'portfolio_id': port_test_df["portfolio_id"], 'return': preds})
filename = 'sub_returns.csv'
sub.to_csv(filename, index=False)
FileLink(filename)  # lb 0.94277