In [1]:
import pandas as pd
import numpy as np
# Load the CSV into a DataFrame and preview its first five rows.
dados = pd.read_csv(filepath_or_buffer='wealthfront.csv')
dados.head(n=5)

Unnamed: 0,DividendStocks,EmergingMarkets,ForeignStocks,MunicipalBonds,NaturalResources,RiskTolerance,USStocks,q1,q2,q3,q4,q5,q6,q7,q8
0,0.05,0.11,0.14,0.33,0.05,0.045,0.32,1,4,95,17940000.0,2,67714000.0,2,4
1,0.05,0.28,0.22,0.05,0.05,0.1,0.35,4,2,33,68527000.0,4,50352000.0,1,4
2,0.07,0.09,0.13,0.35,0.06,0.04,0.3,2,1,71,93291000.0,4,34119000.0,2,3
3,0.05,0.11,0.14,0.33,0.05,0.045,0.32,1,3,89,83735000.0,5,93350000.0,2,4
4,0.05,0.28,0.22,0.05,0.05,0.1,0.35,1,4,52,19398000.0,1,75739000.0,1,4


In [2]:
# Target: RiskTolerance expressed in thousandths and stored as integer class
# labels (e.g. 0.045 -> 45).
# The scaled floats are rounded to the nearest integer *before* the cast:
# astype("int") on its own truncates toward zero, so a product that lands just
# below a whole number because of binary floating-point error would silently
# be assigned to the neighbouring, lower class.
y = np.rint(dados["RiskTolerance"].values * 1000).astype("int")

In [3]:
# Feature matrix: every column whose name starts with 'q'
# (q1..q8 in the preview of the frame), in their original column order.
x = list(filter(lambda name: name.startswith('q'), dados.columns))
X = dados.loc[:, x].values

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Four base models; the seedable ones share random_state=42.
# NOTE(review): in the data preview q4 and q6 are on a ~1e7 scale while the
# other inputs are small integers; KNN, logistic regression and SVC are
# sensitive to feature scale, so standardising the inputs may be worth trying.
svm_clf = SVC(random_state=42)
tree_clf = DecisionTreeClassifier(random_state=42, max_depth=10)
log_clf = LogisticRegression(random_state=42)
knn_clf = KNeighborsClassifier()

# Hard voting: the ensemble predicts the class chosen by most base models.
voting_clf = VotingClassifier(
    voting='hard',
    estimators=list(zip(
        ('knn', 'lr', 'tree', 'svc'),
        (knn_clf, log_clf, tree_clf, svm_clf),
    )),
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1,...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [6]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy (mean and standard deviation over the folds)
# for each base model and for the voting ensemble, computed on the full X, y.
for label, clf in (
    ('KNN', knn_clf),
    ('Reg Logistica', log_clf),
    ('tree', tree_clf),
    ('SVM', svm_clf),
    ('Ensemble', voting_clf),
):
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.17 (+/- 0.01) [KNN]
Accuracy: 0.24 (+/- 0.00) [Reg Logistica]
Accuracy: 0.96 (+/- 0.00) [tree]
Accuracy: 0.24 (+/- 0.00) [SVM]
Accuracy: 0.26 (+/- 0.00) [Ensemble]


In [7]:
# Soft voting combines the base models' predicted class probabilities, so the
# SVC is re-created with probability=True; the other base models are reused.
svm_clf = SVC(probability=True, random_state=42)

voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('knn', knn_clf),
        ('lr', log_clf),
        ('tree', tree_clf),
        ('svc', svm_clf),
    ],
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1,...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [None]:
from sklearn.model_selection import cross_val_score

# Repeat the 5-fold accuracy comparison, now that svm_clf and voting_clf refer
# to the probability-enabled SVC and the soft-voting ensemble.
for label, clf in zip(
    ('KNN', 'Reg Logistica', 'tree', 'SVM', 'Ensemble'),
    (knn_clf, log_clf, tree_clf, svm_clf, voting_clf),
):
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (np.mean(scores), np.std(scores), label))

Accuracy: 0.17 (+/- 0.01) [KNN]
Accuracy: 0.24 (+/- 0.00) [Reg Logistica]
Accuracy: 0.96 (+/- 0.00) [tree]
Accuracy: 0.24 (+/- 0.00) [SVM]
Accuracy: 0.94 (+/- 0.00) [Ensemble]


In [None]:
from sklearn.metrics import classification_report
# Refit every model on the training split and print its per-class
# precision / recall / f1 report for the held-out test split.
for clf in [knn_clf, log_clf, tree_clf, svm_clf, voting_clf]:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, '\n\n', classification_report(y_test, y_pred))

KNeighborsClassifier 

              precision    recall  f1-score   support

          5       0.21      0.50      0.29        10
         10       0.00      0.00      0.00         2
         15       0.00      0.00      0.00         2
         20       0.00      0.00      0.00         1
         25       0.00      0.00      0.00         5
         30       0.04      0.10      0.06        10
         35       0.08      0.15      0.10        89
         40       0.21      0.36      0.27       234
         45       0.13      0.11      0.12       152
         50       0.00      0.00      0.00         2
         55       0.00      0.00      0.00         2
         60       0.00      0.00      0.00         2
         65       0.00      0.00      0.00         5
         70       0.03      0.01      0.02        76
         75       0.00      0.00      0.00         6
         80       0.17      0.12      0.14       147
         85       0.17      0.05      0.08        75
         90       0.0

  'precision', 'predicted', average, warn_for)


LogisticRegression 

              precision    recall  f1-score   support

          5       0.43      0.30      0.35        10
         10       0.00      0.00      0.00         2
         15       0.00      0.00      0.00         2
         20       0.00      0.00      0.00         1
         25       0.00      0.00      0.00         5
         30       0.00      0.00      0.00        10
         35       0.00      0.00      0.00        89
         40       0.24      1.00      0.39       234
         45       0.00      0.00      0.00       152
         50       0.00      0.00      0.00         2
         55       0.00      0.00      0.00         2
         60       0.00      0.00      0.00         2
         65       0.00      0.00      0.00         5
         70       0.00      0.00      0.00        76
         75       0.00      0.00      0.00         6
         80       0.00      0.00      0.00       147
         85       0.00      0.00      0.00        75
         90       0.00 

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Small, shallow forest (10 trees, depth <= 3).
# The seed is pinned to 42, matching the train/test split and the other
# seeded estimators in this notebook, so the fitted trees are the same on
# every run instead of changing with each kernel restart.
rf_small = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
rf_small.fit(X_train, y_train)

In [None]:
# Keep a handle on one individual tree of the small forest: the estimator at
# index 1 (i.e. the second one) of rf_small.estimators_.
tree_small = rf_small.estimators_[1]

In [32]:
from sklearn.model_selection import RandomizedSearchCV
# Search space for the random-forest hyper-parameter search.

# Number of trees in the forest: 20, 40, ..., 200
n_estimators = list(range(20, 201, 20))
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for RandomForestClassifier it was only an
# alias of 'sqrt' (so it duplicated that candidate) and newer scikit-learn
# releases no longer accept it. 'log2' gives a genuinely different option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10..20, plus None for "no limit"
max_depth = list(range(10, 21)) + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or on all rows (False)
bootstrap = [True, False]

# Parameter name -> list of candidate values to sample from
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [None]:
# Use the random grid to search for the best hyper-parameters.
# Base model to tune. It is seeded as well: random_state on the search only
# fixes which parameter combinations are sampled, so without a seed on the
# forest itself the cross-validation scores (and therefore the selected best
# estimator) could differ from run to run.
rf = RandomForestClassifier(random_state=42)
# Randomised search: 100 sampled combinations, 3-fold cross-validation,
# verbose progress output, all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model on the training split
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.6min


In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based score for ``model`` on a test set.

    The score is ``100 - MAPE``, where MAPE is the mean absolute percentage
    error between ``model.predict(test_features)`` and ``test_labels``.

    Parameters
    ----------
    model : object exposing ``predict(test_features)``.
    test_features : passed unchanged to ``model.predict``.
    test_labels : array-like of numeric reference values.

    Returns
    -------
    float
        ``100 - MAPE``. Rows whose label is 0 are left out of the percentage
        error (it is undefined there) instead of turning the whole score into
        nan/inf; they still count towards the printed average error. The
        result is nan when no row has a non-zero label.
    """
    labels = np.asarray(test_labels, dtype=float)
    predictions = np.asarray(model.predict(test_features), dtype=float)
    errors = np.abs(predictions - labels)

    # Percentage error is only defined where the reference value is non-zero.
    nonzero = labels != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / np.abs(labels[nonzero]))
    else:
        mape = float('nan')
    accuracy = 100 - mape

    # Guard the empty case so np.mean does not warn on an empty array.
    mean_error = np.mean(errors) if errors.size else float('nan')

    print('Model Performance')
    # The error is expressed in the units of the labels.
    print('Average Error: {:0.4f} (in label units).'.format(mean_error))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy
# Baseline: an untuned 10-tree forest. It is a RandomForestClassifier, the
# same estimator type that the randomised search tunes, and it is trained and
# scored on the notebook's own split (X_train / y_train, X_test / y_test).
# The previous version referenced RandomForestRegressor and
# train_features / train_labels / test_features / test_labels, none of which
# are imported or defined in this notebook.
base_model = RandomForestClassifier(n_estimators=10, random_state=42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

# Best model found by the randomised search, scored on the same test split.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

# Relative change of the tuned score with respect to the baseline score.
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))