<a href="https://www.kaggle.com/code/thiagosalesfreireluz/gradientboosting?scriptVersionId=224344997" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn import metrics
from numpy import mean
from numpy import std
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
#import geneticalgorithmrf  #genetic algorithum module
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Load cumulative.csv from the Kepler exoplanet search results dataset
# (path is relative to the notebook's working directory).
data = pd.read_csv('../input/kepler-exoplanet-search-results/cumulative.csv')

In [None]:
# Start from the raw table and discard the columns that are not used.
df = data.drop(
    columns=['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_pdisposition', 'koi_score']
)

# Keep only CANDIDATE and CONFIRMED targets by removing FALSE POSITIVE rows.
false_positive_rows = df.loc[df['koi_disposition'] == 'FALSE POSITIVE'].index
df = df.drop(index=false_positive_rows).reset_index(drop=True)

# Encode the target column as binary: CANDIDATE -> 1, CONFIRMED -> 0.
target_codes = {"CANDIDATE":1,"CONFIRMED":0}
df['koi_disposition'] = df['koi_disposition'].map(target_codes)

# Drop the two error columns (noted by the author as entirely missing).
df = df.drop(columns=['koi_teq_err1', 'koi_teq_err2'])

# Impute missing values: the most frequent value for koi_tce_delivname,
# then the column mean for every other column that still has gaps.
most_frequent_delivname = df['koi_tce_delivname'].mode()[0]
df['koi_tce_delivname'] = df['koi_tce_delivname'].fillna(most_frequent_delivname)
for column in df.columns[df.isna().any()]:
    df[column] = df[column].fillna(df[column].mean())

# Replace koi_tce_delivname with its one-hot encoding, appended as the
# last columns of the frame.
delivname_dummies = pd.get_dummies(df['koi_tce_delivname'], prefix='delivname')
df = pd.concat([df.drop(columns='koi_tce_delivname'), delivname_dummies], axis=1)

# Separate the target (y) from the feature matrix (X).
y = df['koi_disposition']
X = df.drop(columns='koi_disposition')

# 70/30 shuffled train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, shuffle=True, random_state=1
)

# Standardise features. The scaler is fitted on the training split only and
# then applied to both splits, keeping the original index and column labels.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Display the four resulting objects as the cell output.
X_train, X_test, y_train, y_test


In [None]:
# Gradient boosting classifier with every hyper-parameter left at its default.
clf = GradientBoostingClassifier()

In [None]:
# Train the classifier on the (scaled) training split.
clf.fit(X_train,y_train)

In [None]:
def get_classifications(y_test, y_pred, positive_label=1):
    """Count the four binary confusion-matrix outcomes.

    Args:
        y_test: Iterable of true labels.
        y_pred: Iterable of predicted labels, paired positionally with
            ``y_test`` (pairing stops at the shorter iterable).
        positive_label: Label treated as the positive class; any other
            label counts as negative.

    Returns:
        Tuple ``(tp, fn, fp, tn)``.
    """
    # Keyed by (actual is positive, predicted is positive).
    tally = {
        (True, True): 0,
        (True, False): 0,
        (False, True): 0,
        (False, False): 0,
    }

    for actual, predicted in zip(y_test, y_pred):
        outcome = (
            bool(actual == positive_label),
            bool(predicted == positive_label),
        )
        tally[outcome] += 1

    return (
        tally[(True, True)],
        tally[(True, False)],
        tally[(False, True)],
        tally[(False, False)],
    )

def get_accuracy(tp, fn, fp, tn):
    """Return accuracy: correct predictions divided by all predictions.

    Raises:
        ZeroDivisionError: If all four counts are zero.
    """
    correct = tp + tn
    total = tp + fn + fp + tn
    return correct / total

def get_precision(tp, fn, fp, tn):
    """Return precision: true positives over all predicted positives.

    ``fn`` and ``tn`` are accepted only so that every metric helper shares
    the same signature; they are not used.

    Raises:
        ZeroDivisionError: If ``tp + fp`` is zero.
    """
    predicted_positive = tp + fp
    return tp / predicted_positive

def get_recall(tp, fn, fp, tn):
    """Return recall: true positives over all actual positives.

    ``fp`` and ``tn`` are accepted only so that every metric helper shares
    the same signature; they are not used.

    Raises:
        ZeroDivisionError: If ``tp + fn`` is zero.
    """
    actual_positive = tp + fn
    return tp / actual_positive

def get_f1_score(tp, fn, fp, tn):
    """Return the F1 score, the harmonic mean of precision and recall.

    The score is computed directly from the counts as
    ``2*tp / (2*tp + fp + fn)``. This is algebraically equal to
    ``2*P*R / (P + R)`` but remains defined when precision and recall are
    both zero (no true positives), where the two-step formula divides by
    zero.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.
        fp: Number of false positives.
        tn: Number of true negatives (unused; kept so that every metric
            helper shares the same signature).

    Returns:
        The F1 score as a float in ``[0, 1]``. When there are no true
        positives, false positives or false negatives at all, the score is
        undefined and ``0.0`` is returned.
    """
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        # No positive labels and no positive predictions: nothing to score.
        return 0.0
    return 2 * tp / denominator

def get_sensitivity(tp, fn, fp, tn):
    """Return sensitivity (true positive rate): ``tp / (tp + fn)``.

    ``fp`` and ``tn`` are accepted only so that every metric helper shares
    the same signature; they are not used.

    Raises:
        ZeroDivisionError: If ``tp + fn`` is zero.
    """
    actual_positive = tp + fn
    return tp / actual_positive

def get_specificity(tp, fn, fp, tn):
    """Return specificity (true negative rate): ``tn / (tn + fp)``.

    ``tp`` and ``fn`` are accepted only so that every metric helper shares
    the same signature; they are not used.

    Raises:
        ZeroDivisionError: If ``tn + fp`` is zero.
    """
    actual_negative = tn + fp
    return tn / actual_negative

In [None]:
# Predict on the held-out split and report accuracy as a percentage.
y_pred = clf.predict(X_test)
tp, fn, fp, tn = get_classifications(y_test, y_pred)
print(" Accuracy: {:.3f}%".format(get_accuracy(tp, fn, fp, tn) * 100))

In [None]:
# Predict on the held-out split and report sensitivity as a percentage.
y_pred = clf.predict(X_test)
tp, fn, fp, tn = get_classifications(y_test, y_pred)
print(" Sensitivity: {:.3f}%".format(get_sensitivity(tp, fn, fp, tn) * 100))

In [None]:
# Predict on the held-out split and report specificity as a percentage.
y_pred = clf.predict(X_test)
tp, fn, fp, tn = get_classifications(y_test, y_pred)
print(" Specificity: {:.3f}%".format(get_specificity(tp, fn, fp, tn) * 100))

In [None]:
# Predict on the held-out split and report precision as a percentage.
y_pred = clf.predict(X_test)
tp, fn, fp, tn = get_classifications(y_test, y_pred)
print(" Precision: {:.3f}%".format(get_precision(tp, fn, fp, tn) * 100))

In [None]:
# Predict on the held-out split and report the F1 score (as a fraction, not a percentage).
y_pred = clf.predict(X_test)
tp, fn, fp, tn = get_classifications(y_test, y_pred)
print(" F1 Score: {:.5f}".format(get_f1_score(tp, fn, fp, tn)))

In [None]:
# Confusion matrix of the held-out predictions.
y_pred = clf.predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)

# Annotation pieces for each cell, in the order of the flattened matrix.
group_names = ['True Neg','False Pos','False Neg','True Pos']
flat_counts = cf_matrix.flatten()
group_counts = ["{0:0.0f}".format(value) for value in flat_counts]
group_percentages = ["{0:.2%}".format(value) for value in flat_counts/np.sum(cf_matrix)]

# One three-line label (name, count, share of total) per cell, as a 2x2 grid.
labels = np.asarray(
    [
        f"{v1}\n{v2}\n{v3}"
        for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
    ]
).reshape(2,2)

# fmt='' because the annotations are already formatted strings.
ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')

#ax.set_title('Confusion Matrix with labels\n\n');
ax.set_xlabel('Valores preditos pelo modelo')
ax.set_ylabel('Valores reais ')

## Tick labels - list must be in alphabetical order
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['False','True'])

## Save the figure to disk, then display it.
plt.savefig('my_plot.JPEG')
plt.show()


In [None]:
"""numberOfParents = 8 #number of parents to start
numberOfParentsMating = 4 #number of parents that will mate
numberOfParameters = 2 #number of parameters that will be optimized
numberOfGenerations = 4 #number of genration that will be created
#define the population size


geneticalgorithmrf.printhello("hello world")

populationSize = (numberOfParents, numberOfParameters)
#initialize the population with randomly generated parameters
population = geneticalgorithmrf.initilialize_population(numberOfParents)
#define an array to store the fitness  history
fitnessHistory = np.empty([numberOfGenerations+1, numberOfParents])
#define an array to store the value of each parameter for each parent and generation
populationHistory = np.empty([(numberOfGenerations+1)*numberOfParents, numberOfParameters])

#insert the value of initial parameters in history
populationHistory[0:numberOfParents, :] = population
for generation in range(numberOfGenerations):
    print("This is number %s generation" % (generation))
    
    #train the dataset and obtain fitness
    fitnessValue = geneticalgorithmrf.train_population(population, X_train,  y_train, X_test, y_test)
    geneticalgorithmrf.printhello("hello world")
    fitnessHistory[generation, :] = fitnessValue
    
    #best score in the current iteration"""

In [None]:
#Best solution from the final iteration
# NOTE: everything below is one triple-quoted string, not live code. It is
# disabled genetic-algorithm reporting that relies on the commented-out
# `geneticalgorithmrf` import, so it never executes.
# NOTE(review): inside the disabled code, nEstimatorHistory and
# maxdepthHistory are both sliced from populationHistory[:, 1]; one of them
# presumably should read column 0 -- confirm before re-enabling. The printed
# parameter names (learning_rate / max_depth) also differ from the plot
# titles (n_estimator / maximum depth).

"""fitness = geneticalgorithmrf.train_population(population, X_train,  y_train, X_test, y_test)
fitnessHistory[generation+1, :] = fitness

#index of the best solution
bestFitnessIndex = np.where(fitness == np.max(fitness))[0][0]

#Best fitness
print("Best fitness is =", fitness[bestFitnessIndex])

#Best parameters
print("Best parameters are:")
print('learning_rate', population[bestFitnessIndex][0])
print('max_depth', population[bestFitnessIndex][1])
 



#visualize the change in fitness of the various generations and parents


geneticalgorithmrf.plot_parameters(numberOfGenerations, numberOfParents, fitnessHistory, "fitness (F1-score)")

#Look at individual parameters change with generation
#Create array for each parameter history (Genration x Parents)


nEstimatorHistory = populationHistory[:, 1].reshape([numberOfGenerations+1, numberOfParents])
maxdepthHistory = populationHistory[:, 1].reshape([numberOfGenerations+1, numberOfParents])



#generate heatmap for each parameter


geneticalgorithmrf.plot_parameters(numberOfGenerations, numberOfParents, nEstimatorHistory, "n_estimator")
geneticalgorithmrf.plot_parameters(numberOfGenerations, numberOfParents, maxdepthHistory, "maximum depth")"""

In [None]:
import pandas as pd
# Label each importance value of the fitted classifier with its feature
# name, then order from most to least important.
feature_imp = pd.Series(clf.feature_importances_, index=X_train.columns)
feature_imp = feature_imp.sort_values(ascending=False)
feature_imp

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

In [None]:
import numpy as np

In [None]:
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
#print(n_estimators)

# Candidate numbers of boosting stages. The list previously held 100 twice,
# which made the search evaluate identical settings more than once.
# NOTE(review): [100, 400, 100] may have been meant as (start, stop, step),
# i.e. range(100, 400, 100) -- confirm the intended candidates.
n_estimators = [100, 400]

# Candidate learning rates.
learning_rate = [0.1, 1, 5]

#max_features = ['auto', 'sqrt', 'log2']

# Maximum number of levels in tree
#max_depth = [4,6,8]
#max_depth.append(None)
#print (max_depth)

#criterion = ['friedman_mse', 'squared_error', 'mse']


In [None]:
# Example of the parameters I want to test.
parametros = dict(n_estimators=n_estimators, learning_rate=learning_rate)

In [None]:
from sklearn.model_selection import  RandomizedSearchCV

In [None]:
# Randomized hyper-parameter search around `clf`: candidates are drawn from
# `parametros`, scored by accuracy with 5-fold cross-validation, for at most
# 20 sampled settings.
# NOTE(review): the name `random` would shadow the standard-library module
# of the same name if that module is imported later in this notebook.
random = RandomizedSearchCV(
    estimator=clf,
    param_distributions=parametros,
    scoring='accuracy',
    cv=5,
    n_iter=20,
)

In [None]:
import time
# Wall-clock timestamp taken before the search is fitted; a later cell
# subtracts it from the current time to report the elapsed seconds.
start_time = time.time()

In [None]:
# Run the hyper-parameter search on the training split.
random.fit(X_train, y_train)

In [None]:
# Report the seconds elapsed since start_time was recorded. In a notebook
# this also counts any idle time between running the cells.
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Show the cross-validation results of the search as a table.
pd.DataFrame(random.cv_results_)

In [None]:
# Show the parameters that produced `.best_score_`.
random.best_params_

In [None]:
# Show the best score found by the search.
random.best_score_