In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor

import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
pwd

In [None]:
# Load the two input tables from CSV files under ./AnnualReturns.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ('./AnnualReturns/train.csv', './AnnualReturns/test.csv')
]

In [None]:
# Remember how many rows came from train.csv so the combined frame can be split
# back by position later, then stack the test rows underneath the train rows so
# both receive identical preprocessing.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# index labels of `train` and `test` would both be kept and could collide.
# NOTE: this cell rebinds `train`, so run it only once per kernel session.
train_objs_num = len(train)
train = pd.concat(objs=[train, test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of `train`, which holds the train.csv rows followed by the
# test.csv rows once the concat cell has run.
train.shape

In [None]:
# Peek at the first rows of the combined frame.
train.head()

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
train.info()

In [None]:
# Summary statistics of the 'return' column.
train['return'].describe()

In [None]:
# Histogram (100 bins, raw counts) of 'return' expressed in percent.
# Rows without a 'return' value (presumably the appended test rows) are dropped
# explicitly before binning. Plotted with matplotlib directly because
# seaborn's distplot is deprecated.
fig, ax = plt.subplots()
ax.hist((train['return'] * 100).dropna(), bins=100)
ax.set(xlabel='return (%)', ylabel='count', title='Distribution of return');

In [None]:
#sns.jointplot(x='sell_date',y='return',data=train)

In [None]:
# Number of rows per country_code value.
train['country_code'].value_counts()

In [None]:
#sns.jointplot(x='office_id',y='return',data=train)

In [None]:
# List the column labels of the combined frame.
train.columns

In [None]:
# Missing-data overview: per column, the number of nulls ('Total') and the
# fraction of rows that are null ('Percent', a 0-1 ratio), largest first.
null_mask = train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
#train['libor_rate'].value_counts()

In [None]:
# Fill missing values column by column.
# Each column is reassigned instead of calling fillna(..., inplace=True) on
# train[col]: the in-place form is chained assignment, which emits a warning on
# pandas 2.x and does not update the frame under Copy-on-Write.
# NOTE(review): the libor_rate mean is taken over whatever rows `train` holds at
# this point (train and test rows combined when cells run in order).
fill_values = {
    'hedge_value': False,
    'indicator_code': False,
    'status': False,
    'libor_rate': train['libor_rate'].mean(),
    'sold': 0,
    'bought': 0,
}
for column, value in fill_values.items():
    train[column] = train[column].fillna(value)

In [None]:
# Parse the three YYYYMMDD-formatted date columns into datetimes.
for date_col in ('start_date', 'creation_date', 'sell_date'):
    train[date_col] = pd.to_datetime(train[date_col], format='%Y%m%d')

In [None]:
# Re-inspect the first rows after the date parsing above.
train.head()

In [None]:
#((train['sell_date'] - train['start_date']) / np.timedelta64(1, 'D')).astype(int)

In [None]:
# Holding period in days: sell_date minus start_date, converted from a
# timedelta to an integer day count.
holding_period = train['sell_date'] - train['start_date']
train['days_invested'] = (holding_period / np.timedelta64(1, 'D')).astype(int)

In [None]:
# Amount earned per row: 'sold' minus 'bought'.
train['amount_earned'] = train['sold'].sub(train['bought'])

In [None]:
# Column labels after adding days_invested and amount_earned.
train.columns

In [None]:
# Remove desk_id and creation_date, plus the raw date/amount columns whose
# information now lives in days_invested and amount_earned.
columns_to_drop = ['desk_id', 'start_date', 'sold', 'bought', 'creation_date', 'sell_date']
train = train.drop(columns_to_drop, axis=1)

In [None]:
# First rows after dropping the raw columns.
train.head()

In [None]:
# Dtypes and non-null counts after feature engineering; the object-dtype columns
# listed here are the ones the label-encoding loop below acts on.
train.info()

In [None]:
# Joint distribution of amount_earned against return.
sns.jointplot(data=train, x='amount_earned', y='return')

In [None]:
# Columns eligible for label encoding: everything except the identifier and the target.
# 'portfolio_id' and 'return' match the exclusion list used when the final feature
# list is built further down; previously the identifier was not excluded here and
# therefore got label-encoded. 'transaction_id' and 'target' are kept from the
# original list for backward compatibility (no other cell in this notebook
# references them).
non_feature_columns = ['transaction_id', 'target', 'portfolio_id', 'return']
train_features = [x for x in train.columns if x not in non_feature_columns]

In [None]:
# Label-encode every object-dtype feature column, replacing its values with
# integer codes.
# The dtype test uses pandas' is_object_dtype: the previous comparison against
# type(object) compared with the metaclass `type` rather than `object`, and
# appears to have matched only because NumPy coerces arbitrary classes to the
# object dtype.
for column in train_features:
    if pd.api.types.is_object_dtype(train[column]):
        le = LabelEncoder()
        train[column] = le.fit_transform(train[column])

In [None]:
# Columns that the get_dummies call in the next cell one-hot encodes.
cat_cols = ['office_id', 'pf_category', 'country_code', 'currency', 'type']

In [None]:
# One-hot encode the categorical columns listed in cat_cols.
# dtype=np.uint8 pins the indicator columns to small integers: pandas >= 2.0
# defaults to bool, and bool columns mixed with numeric ones turn `.values`
# into an object-dtype array. uint8 is also what older pandas produced by default.
dataset_preprocessed = pd.get_dummies(train, columns=cat_cols, dtype=np.uint8)

In [None]:
# Final model inputs: every column of the encoded frame except the identifier
# ('portfolio_id') and the target ('return').
non_feature_cols = ['portfolio_id', 'return']
train_features = [col for col in dataset_preprocessed.columns if col not in non_feature_cols]

In [None]:
# First rows of the one-hot encoded frame.
dataset_preprocessed.head()

In [None]:
# Split the combined frame back by position: the first train_objs_num rows came
# from train.csv, the remainder from test.csv.
train_preprocessed = dataset_preprocessed.iloc[:train_objs_num]
test_preprocessed = dataset_preprocessed.iloc[train_objs_num:]

In [None]:
# NumPy arrays for modelling: X = training features, y = training target,
# T = test features (same columns, same order as X).
X = train_preprocessed.loc[:, train_features].values
y = train_preprocessed['return'].values
T = test_preprocessed.loc[:, train_features].values

In [None]:
# Display the target array.
y

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. fit_transform returns a new array rather than
# modifying X, so the result is kept in X_scaled (previously it was discarded).
# The scaler is fitted on the training rows only and the same fitted transform is
# applied to the test rows. X and T themselves are left untouched; the random
# forest cross-validated later in this notebook still uses the unscaled X.
normalizer = StandardScaler()
X_scaled = normalizer.fit_transform(X)
T_scaled = normalizer.transform(T)
X_scaled[:5]

In [None]:
# Display the target array again; the scaling cell above does not modify y.
y

In [None]:
# 5-fold cross-validated R^2 of a 150-tree random forest regressor on (X, y).
# Prints the per-fold scores followed by their mean.
stacker = RandomForestRegressor(random_state=0, n_estimators=150)
results = cross_val_score(estimator=stacker, X=X, y=y, scoring='r2', cv=5)
print(results)
mean_r2 = results.mean()
print("Stacker score: {} ".format(mean_r2))