# Machine learning pipeline

### 0: Get the imports & import the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score


In [None]:
# Load the training data; the CSV file uses ';' as its field separator.
nepal = pd.read_csv("nepal_train.csv",sep=";")

### 1: Explore the data

We starten met een aantal zaken te bekijken over de data, om zo te weten te komen waarmee we te maken hebben.

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts.
nepal.info()

In [None]:
# Peek at the first five rows to get a feel for the raw values.
nepal.head(n=5)

In [None]:
# Frequency of every category in the foundation_type column.
foundation_types = nepal['foundation_type']
foundation_types.value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
nepal.describe()

### 2: Clean the data & Feature engineering

In [None]:
# Number of missing values per column.
# `isna()` yields a boolean frame; `.sum()` adds up the True flags.
# (`.count()` on that frame would just return the row count for every column,
# because a boolean mask never contains NaN itself.)
nepal.isna().sum()

We beginnen met het weggooien van het building_id

In [None]:
# The building identifier carries no predictive information, so it is removed.
nepal = nepal.drop(columns=['building_id'])

Het viel mij op dat een aantal binaire kolommen heel weinig 'true'-waarden (1) bevat. Daarom heb ik besloten om binaire kolommen met minder dan 500 enen weg te laten.

In [None]:
# Show the value distribution of every column, separated by blank lines.
for _column_name, column_values in nepal.items():
    distribution = column_values.value_counts()
    print(distribution)
    print("\n")

In [None]:
# Collect every 0/1 column that has fewer than 500 ones, then drop them all at once.
# A column counts as binary when its set of unique values is a subset of {0, 1}.
rare_binary_columns = [
    column_name
    for column_name, column_values in nepal.items()
    if set(column_values.unique()) <= {0, 1} and (column_values == 1).sum() < 500
]
nepal.drop(columns=rare_binary_columns, inplace=True)

We maken nu het probleem binair (damage grade 2 wordt samengevoegd met damage grade 1) en passen daarna one-hot-encoding toe.

In [None]:
# Merge damage grade 2 into grade 1 so that only two classes (1 and 3) remain.
merged_grades = nepal['damage_grade'].replace({2: 1})
nepal['damage_grade'] = merged_grades

In [None]:
# One-hot encode the categorical columns; the indicator columns are stored as integers.
nepal = pd.get_dummies(data=nepal, dtype=int)

### 3: Training model

We zonderen eerst ons target af om daarna een train/test split te maken met een testset van 10%.

In [None]:
# Remove the target column from the feature frame and recode it to 0/1:
# grade 1 (which already includes the former grade 2) -> 0, grade 3 -> 1.
y = nepal.pop('damage_grade').replace({1: 0, 3: 1})

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    nepal,
    y,
    test_size=0.1,
    random_state=101,
)

We maken een randomforest aan en trainen dit met de data.

In [None]:
# Fit a random forest with default hyper-parameters on the training split.
# The seed is fixed so the forest, and every metric derived from it, is reproducible
# after Restart & Run All; 101 matches the seed used for the train/test split.
RF = RandomForestClassifier(random_state=101)
RF.fit(X_train, y_train)

In [None]:
# Class predictions of the fitted forest for the held-out split.
RF_preds = RF.predict(X_test)

In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# Confusion matrix of the hold-out predictions, labelled with the classes the forest learned.
cm = confusion_matrix(y_test, RF_preds)
CM_plot = ConfusionMatrixDisplay(cm, display_labels=RF.classes_)
CM_plot.plot()

In [None]:
# Fraction of hold-out samples that were classified correctly.
ac = accuracy_score(y_true=y_test, y_pred=RF_preds)
ac

We zien dat ons model nu een accuracy haalt van +/- 77%.

In [None]:
# 5-fold cross-validated ROC-AUC on the training split; report the mean over the folds.
cv_scores = cross_val_score(
    estimator=RF,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='roc_auc',
)

print(cv_scores.mean())


We zien hier een gemiddelde ROC-AUC score van +/- 0.78, wat niet per se goed of heel slecht hoeft te zijn.

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
import matplotlib.pyplot as plt

# Predicted probability of the positive class (column 1) for the hold-out split.
y_prob = RF.predict_proba(X_test)[:, 1]

# Compute the ROC curve and the area under it.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

# Plot the ROC curve; the dashed diagonal is the reference line from (0, 0) to (1, 1).
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('FP rate')
ax.set_ylabel('TP rate')  # was 'TPrate'; now matches the label used in the tuning section
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()



In [None]:
# Precision/recall pairs over all probability thresholds for the hold-out split.
precision, recall, _ = precision_recall_curve(y_test, y_prob)

# Draw the precision-recall curve on its own axes.
fig, ax = plt.subplots()
ax.plot(recall, precision, label='PR curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc="lower left")
plt.show()


### 4: Parameter tuning

We passen parameter tuning toe om zo het best mogelijke model te vinden en dit te kunnen gebruiken.

In [None]:
# The grid search is expensive, so it is disabled by default.
# Set the flag to True to re-run the tuning.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score

    # Tune on a 10% subsample (test_size=0.90) to keep the search tractable.
    # Dedicated names are used so the train/test split made earlier in the
    # notebook is not silently overwritten by this cell.
    X_tune, X_holdout, y_tune, y_holdout = train_test_split(
        nepal, y, test_size=0.90, random_state=101
    )

    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # Search with a seeded forest so the selected parameters are reproducible
    # and consistent with the seeded `best_RF` below.
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=101),
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        n_jobs=10,
        verbose=2,
    )
    grid_search.fit(X_tune, y_tune)

    # Re-fit a forest with the winning parameters and evaluate it on the remaining 90%.
    best_RF = RandomForestClassifier(**grid_search.best_params_, random_state=101)
    best_RF.fit(X_tune, y_tune)

    best_RF_preds = best_RF.predict(X_holdout)

    best_accuracy = accuracy_score(y_holdout, best_RF_preds)
    print("Accuracy:", best_accuracy)

    cv_scores_best = cross_val_score(best_RF, X_tune, y_tune, cv=5, scoring='roc_auc')
    print("Gem ROC-AUC ", np.mean(cv_scores_best))

    # The curves must describe the tuned model, so the probabilities come from
    # `best_RF` (the original used the untuned `RF` here).
    y_prob = best_RF.predict_proba(X_holdout)[:, 1]

    fpr, tpr, _ = roc_curve(y_holdout, y_prob)
    roc_auc = roc_auc_score(y_holdout, y_prob)

    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('FP rate')
    plt.ylabel('TP rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    precision, recall, _ = precision_recall_curve(y_holdout, y_prob)

    plt.plot(recall, precision, label='PR curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.show()


Ook hier zien we dat het model een accuracy haalt van +/- 78%.
Ook bij de gemiddelde ROC-AUC score zien we geen grote verschillen.

### 5: Use test data

In [None]:
# Load the separate test file. It is used for evaluation only; the forest is NOT
# re-fitted on it, otherwise the scores below would not say anything about how the
# trained model generalises.
nepalTest = pd.read_csv("nepal_test.csv", sep=";")

# Build the binary target exactly as for the training data:
# damage grades 1 and 2 -> 0, damage grade 3 -> 1.
y_test = nepalTest['damage_grade'].replace({1: 0, 2: 0, 3: 1})

# One-hot encode the features and align them with the columns the forest was trained on.
# Columns unknown to the model (e.g. building_id, or rare binary columns that were dropped
# from the training data) are discarded; dummy columns missing in the test file are
# filled with 0.
X_test = (
    pd.get_dummies(nepalTest.drop(columns='damage_grade'), dtype=int)
    .reindex(columns=X_train.columns, fill_value=0)
)

# Predict with the forest that was fitted on the training data.
RF_preds = RF.predict(X_test)

In [None]:
# Share of correctly classified samples for the current predictions.
ac = accuracy_score(y_true=y_test, y_pred=RF_preds)
ac

In [None]:
# Mean ROC-AUC over five cross-validation folds of the current X_train / y_train.
cv_scores = cross_val_score(
    estimator=RF,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='roc_auc',
)

print(cv_scores.mean())


We zien dat ons model zo goed als dezelfde score haalt.

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
import matplotlib.pyplot as plt

# predict_proba returns one column per class; column 1 is the probability of class 1
# (the positive class).
y_prob = RF.predict_proba(X_test)[:, 1]

# Compute the ROC curve and the area under it.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

# Plot the ROC curve on explicit axes.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line for random predictions
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Precision-Recall curve berekenen
# Compute the precision-recall curve from the predicted probabilities.
precision, recall, _ = precision_recall_curve(y_test, y_prob)

# Plot the precision-recall curve on explicit axes.
fig, ax = plt.subplots()
ax.plot(recall, precision, label='PR curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc="lower left")
plt.show()


In de curves zijn ook geen grote of opvallende verschillen te vinden.