### Load Probas

In [1]:
# Load probas:
import pickle
import numpy as np
from os import listdir
from os.path import isfile, join

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

path = 'data/proba/'

# Feature eng:
xgb_fe_val_proba = pickle.load(open(path + 'xgb_fe_val_proba.pkl', 'rb'))    # dimmention verified
xgb_fe_train_proba = pickle.load(open(path + 'xgb_fe_train_proba.pkl', 'rb')) # dimmention verified
xgb_fe_test_proba = pickle.load(open(path + 'xgb_fe_test_proba.pkl', 'rb')) # dimmention verified

svm_fe_train_proba = pickle.load(open(path + 'svm_fe_train_proba.pkl', 'rb')) # dimmention verified
svm_fe_val_proba =  pickle.load(open(path + 'svm_fe_val_proba.pkl', 'rb'))    # dimmention verified
svm_fe_test_proba =  pickle.load(open(path +  'svm_fe_test_proba.pkl', 'rb')) # dimmention verified

rfc_fe_val_proba = pickle.load(open(path + 'rfc_fe_val_proba.pkl', 'rb'))     # dimmention verified
rfc_fe_train_proba = pickle.load(open(path + 'rfc_fe_train_proba.pkl', 'rb')) # dimmention verified
rfc_fe_test_proba = pickle.load(open(path + 'rfc_fe_test_proba.pkl', 'rb'))   # dimmention verified

# Bag of words:
rfc_bow_val_proba = pickle.load(open(path + 'rfc_bow_val_proba.pkl', 'rb'))      # dimmention verified
rfc_bow_test_proba = pickle.load(open(path + 'rfc_bow_test_proba.pkl', 'rb'))    # dimmention verified
rfc_bow_train_proba = pickle.load(open(path +  'rfc_bow_train_proba.pkl', 'rb')) # dimmention verified

xgb_bow_train_proba = pickle.load(open(path + 'xgb_bow_train_proba.pkl', 'rb')) #  bad dim (50,12)
xgb_bow_test_proba = pickle.load(open(path +  'xgb_bow_test_proba.pkl', 'rb'))
xgb_bow_val_proba =  pickle.load(open(path + 'xgb_bow_val_proba.pkl', 'rb'))

# tfidf
rfc_tfidf_train_proba = pickle.load(open(path +  'rfc_tfidf_train_proba.pkl', 'rb')) # dimmention verified
rfc_tfidf_test_proba =  pickle.load(open(path +  'rfc_tfidf_test_proba.pkl', 'rb'))  # dimmention verified
rfc_tfidf_val_proba =  pickle.load(open(path +  'rfc_tfidf_val_proba.pkl', 'rb'))    # dimmention verified

xgb_tfidf_test_proba = pickle.load(open(path + 'xgb_tfidf_test_proba.pkl', 'rb')) # bad dim (50,12)
xgb_tfidf_val_proba = pickle.load(open(path + 'xgb_tfidf_val_proba.pkl', 'rb'))
xgb_tfidf_train_proba = pickle.load(open(path + 'xgb_tfidf_train_proba.pkl', 'rb'))

y_test = np.load('data/y_test.npy')
y_train = np.load('data/y_train.npy')
y_val = np.load('data/y_val.npy')

#### Save predictions (for ranking)

In [18]:
def retrieve_prediction(proba, y):
    """
    Gets a list of prediction probabilities matrices and calcuate their confidance values
    
    Args:
        proba(np.array): the predictions probability matrices (np.array)
         y(np.array): labels (groud truth)
    Returns:
        (np.array). a list of confidance values (accuracy score)
    """
       
    prediction = proba.argmax(axis=1)
    
    return prediction

In [2]:
train_probas =[xgb_fe_train_proba, svm_fe_train_proba, rfc_fe_train_proba, 
               rfc_bow_train_proba, xgb_bow_train_proba, rfc_tfidf_train_proba, 
               xgb_tfidf_train_proba]

val_probas =[xgb_fe_val_proba, svm_fe_val_proba, rfc_fe_val_proba, 
               rfc_bow_val_proba, xgb_bow_val_proba, rfc_tfidf_val_proba, 
               xgb_tfidf_val_proba]

test_probas =[xgb_fe_test_proba, svm_fe_test_proba, rfc_fe_test_proba, 
               rfc_bow_test_proba, xgb_bow_test_proba, rfc_tfidf_test_proba, 
               xgb_tfidf_test_proba]


In [19]:
xgb_fe_val_pred = retrieve_prediction(xgb_fe_val_proba, y_val)
svm_fe_val_pred = retrieve_prediction(svm_fe_val_proba, y_val)
rfc_fe_val_pred = retrieve_prediction(rfc_fe_val_proba, y_val)
rfc_bow_val_pred = retrieve_prediction(rfc_bow_val_proba, y_val)
xgb_bow_val_pred = retrieve_prediction(xgb_bow_val_proba, y_val)
rfc_tfidf_val_pred = retrieve_prediction(rfc_tfidf_val_proba, y_val)
xgb_tfidf_val_pred = retrieve_prediction(xgb_tfidf_val_proba, y_val)

np.save('xgb_fe_val_pred.npy', xgb_fe_val_pred)
np.save('svm_fe_val_pred.npy', svm_fe_val_pred)
np.save('rfc_fe_val_pred.npy', rfc_fe_val_pred)
np.save('xgb_fe_val_pred.npy', xgb_fe_val_pred)
np.save('rfc_bow_val_pred.npy', rfc_bow_val_pred)
np.save('xgb_bow_val_pred.npy', xgb_bow_val_pred)
np.save('rfc_tfidf_val_pred.npy', rfc_tfidf_val_pred)
np.save('xgb_tfidf_val_pred.npy', xgb_tfidf_val_pred)

#### Ensenble methods:

In [3]:
def equally_ebsemble_results(predictions_probas):
    """
    Ensbale prediction result with equal weight
    
    Args:
        predictions_probas(list): A list of the predictions probability matrices (np.array)
    
    Returns:
        np.array. The predictions for the equally ensambled models
    """
    sum_predictions_prob = np.zeros(predictions_probas[0].shape)
    
    for curr_pred in predictions_probas:
        sum_predictions_prob += curr_pred
    
    ensemble_prob = np.divide(sum_predictions_prob, len(predictions_probas))
        
    return np.argmax(ensemble_prob, axis=1)


def calculate_confidance_val(predictions_probas, y):
    """
    Gets a list of prediction probabilities matrices and calcuate their confidance values
    
    Args:
         predictions_probas(list): A list of the predictions probability matrices (np.array)
         y(np.array): labels (groud truth)
    Returns:
        (list). a list of confidance values (accuracy score)
    """
    confidance_values = []
    
    for proba in predictions_probas:
        prediction = proba.argmax(axis=1)
        confidance_values.append(accuracy_score(y, prediction))
    
    return confidance_values
    
    
def weighted_ebsemble_results(predictions_probas, confidance_values):
    """
    Ensbale prediction result with weight according to each model confidance value
    
    Args:
        predictions_probas(list): A list of the predictions probability matrices (np.array)
    
    Returns:
        np.array. The predictions for the equally ensambled models
    """
    sum_predictions_prob = np.zeros(predictions_probas[0].shape)
    
    for curr_pred, w in zip(predictions_probas, confidance_values):
        sum_predictions_prob += w * curr_pred
    
    ensemble_prob = np.divide(sum_predictions_prob, sum(confidance_values))
        
    return np.argmax(ensemble_prob, axis=1)


#### Stacking methods:

In [4]:
from sklearn.linear_model import LogisticRegression

def stack_probas(predict_probas):
    """
    """
    for ind, curr_predic_proba in enumerate(predict_probas):
        if ind == 0:
            stacked_proba = curr_predic_proba
        else:
            stacked_proba = np.concatenate((stacked_proba, curr_predic_proba), axis=1)
    return stacked_proba

def stack_models(train_probas, val_probas, y_train, y_val, estimator, test_probas=None, y_test=None):
    """
    """
    
    stacked_proba_train = stack_probas(train_probas)
    stacked_proba_val = stack_probas(val_probas)
   
    
    model = estimator
    
    model.fit(stacked_proba_train, y_train)
    
    stacked_train_pred = model.predict(stacked_proba_train)
    stacked_val_pred = model.predict(stacked_proba_val)
    
    train_accuracy = accuracy_score(stacked_train_pred, y_train)
    val_accuracy = accuracy_score(stacked_val_pred, y_val)
    
    if y_test.any():
        stacked_proba_test = stack_probas(test_probas)
        stacked_test_pred = model.predict(stacked_proba_test)
        test_accuracy = accuracy_score(y_test,stacked_test_pred)
    else:
        test_accuracy = None
        
    return (train_accuracy, val_accuracy, test_accuracy), stacked_train_pred, stacked_val_pred, stacked_test_pred

### Ensembale

In [23]:
# Equaly wiegthed ensamble:
train_eq_ensemble_pred = equally_ebsemble_results(train_probas)
val_eq_ensemble_pred = equally_ebsemble_results(val_probas)
test_eq_ensemble_pred = equally_ebsemble_results(test_probas)

print("Equally weighted ensemble:")
print("--------------------------")
print("Train accuracy: ", accuracy_score(y_train, train_eq_ensemble_pred))
print("Val accuracy: ", accuracy_score(y_val, val_eq_ensemble_pred))
print("Test accuracy: ", accuracy_score(y_test, test_eq_ensemble_pred))

np.save('train_eq_ensemble_pred.npy',train_eq_ensemble_pred)
np.save('val_eq_ensemble_pred.npy',val_eq_ensemble_pred)
np.save('test_eq_ensemble_pred.npy',test_eq_ensemble_pred)

Equally weighted ensemble:
--------------------------
Train accuracy:  0.9975317875841436
Val accuracy:  0.6653311201419495
Test accuracy:  0.6696364736292141


In [22]:
# Equaly wiegthed ensamble:
cofidance_train = calculate_confidance_val(train_probas, y_train)
cofidance_val = calculate_confidance_val(val_probas, y_val)
cofidance_test = calculate_confidance_val(test_probas, y_test)

train_w_ensemble_pred = weighted_ebsemble_results(train_probas, cofidance_train )
val_w_ensemble_pred = weighted_ebsemble_results(val_probas,cofidance_val )
test_w_ensemble_pred = weighted_ebsemble_results(test_probas, cofidance_test)

print("Weighted ensemble:")
print("--------------------------")
print("Train accuracy: ", accuracy_score(y_train, train_eq_ensemble_pred))
print("Val accuracy: ", accuracy_score(y_val, val_eq_ensemble_pred))
print("Test accuracy: ", accuracy_score(y_test, test_eq_ensemble_pred))

np.save('train_w_ensemble_pred.npy',train_w_ensemble_pred)
np.save('val_w_ensemble_pred.npy',val_w_ensemble_pred)
np.save('test_w_ensemble_pred.npy',test_w_ensemble_pred)

Weighted ensemble:
--------------------------
Train accuracy:  0.9975317875841436
Val accuracy:  0.6653311201419495
Test accuracy:  0.6696364736292141


### Stacking

In [15]:
logreg = LogisticRegression()
rfc = RandomForestClassifier(n_estimators=100, max_depth=50)

stacking_accuracy_logreg, train_stack_logreg_pred, val_stack_logreg_pred, test_stack_logreg_pred =  \
                stack_models(train_probas, val_probas, y_train, y_val,logreg, test_probas, y_test)
    
stacking_accuracy_rfc, train_stack_rfc_pred, val_stack_rfc_pred, test_stack_rfc_pred =  \
                stack_models(train_probas, val_probas, y_train, y_val, rfc, test_probas, y_test)
    
np.save('train_stack_logreg_pred.npy',train_stack_logreg_pred)
np.save('val_stack_logreg_pred.npy',val_stack_logreg_pred)
np.save('test_stack_logreg_pred.npy',test_stack_logreg_pred)

np.save('train_stack_rfc_pred.npy',train_stack_logreg_pred)
np.save('val_stack_rfc_pred.npy',val_stack_logreg_pred)
np.save('test_stack_rfc_pred.npy',test_stack_logreg_pred)

In [6]:
stacking_accuracy_logreg, stacking_accuracy_rfc

((0.9979618548990277, 0.6687653826340794, 0.6727712698674586),
 (0.9979618548990277, 0.6582336443248812, 0.661826981246219))

In [14]:
logreg = LogisticRegression()
rfc = RandomForestClassifier(n_estimators=200, max_depth=20)

# stacking_accuracy_logreg, train_stack_pred, val_stack_pred, test_stack_pred =  \
#                 stack_models(train_probas, val_probas, y_train, y_val,logreg, test_probas, y_test)
    
stacking_accuracy_rfc, train_stack_pred, val_stack_pred, test_stack_pred =  \
                stack_models(train_probas, val_probas, y_train, y_val, rfc, test_probas, y_test)

In [13]:
stacking_accuracy_rfc

(0.9979992520568437, 0.6666475874305993, 0.6690865093768905)

In [69]:
logreg.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)>