 # Boost of Power

 Add some power to your fraud detection algorithm by using a GradientBoostingClassifier to leverage ensemble learning.

In [4]:
!pip install pydotplus

You should consider upgrading via the 'c:\users\will\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [5]:
# The target vector (y) is the "c" column;
# it is dropped from the feature set (X) below.


# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

 ### Preprocessing

In [7]:
# Read the combined CSV into a DataFrame and preview its first five rows
file_path = "data/pfizer_combined.csv"
df_loans = pd.read_csv(filepath_or_buffer=file_path)
df_loans.head(5)

Unnamed: 0,compound,positive,negative,neutral,c,h,l,o,v
0,-0.02382,0.0496,0.0564,0.894,0.0,39.24,38.67,38.72,11193319.0
1,-0.070842,0.035917,0.0455,0.918583,0.0,39.0,38.7,38.82,10206988.0
2,0.08678,0.0387,0.0094,0.9519,0.0,39.13,38.68,39.12,14328671.0
3,0.135414,0.052714,0.011,0.936286,1.0,39.22,38.75,38.76,12580358.0
4,-0.108008,0.025917,0.057417,0.916917,0.0,39.27,38.79,39.27,15754368.0


In [8]:
# Define features set: every column except the target column "c".
# `drop` returns a new DataFrame, so `df_loans` is left untouched and this cell
# can be re-run safely without an explicit copy or in-place mutation.
X = df_loans.drop(columns="c")
X.head()

Unnamed: 0,compound,positive,negative,neutral,h,l,o,v
0,-0.02382,0.0496,0.0564,0.894,39.24,38.67,38.72,11193319.0
1,-0.070842,0.035917,0.0455,0.918583,39.0,38.7,38.82,10206988.0
2,0.08678,0.0387,0.0094,0.9519,39.13,38.68,39.12,14328671.0
3,0.135414,0.052714,0.011,0.936286,39.22,38.75,38.76,12580358.0
4,-0.108008,0.025917,0.057417,0.916917,39.27,38.79,39.27,15754368.0


In [9]:
# Target vector: the "c" column extracted as a NumPy array; preview the first five labels
y = df_loans["c"].to_numpy()
y[:5]

array([[0.],
       [0.],
       [0.],
       [1.],
       [0.]])

In [5]:
# Partition features and target into train and test subsets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [6]:
# Instantiate the StandardScaler (centering and unit-variance scaling both enabled, as by default)
scaler = StandardScaler(with_mean=True, with_std=True)

In [7]:
# Learn the scaling statistics from the training features only
X_scaler = scaler.fit(X=X_train)

In [8]:
# Apply the fitted scaler to both the training and the test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

 ### Choose Optimal Learning Rate

In [9]:
# Choose learning rate
# Fit one GradientBoostingClassifier per candidate rate and print its accuracy
# on the scaled training split and on the scaled test split.
# NOTE(review): the "validation" figure is computed on X_test_scaled / y_test,
# so picking a rate from these numbers and then reporting accuracy on that same
# split gives an optimistic estimate -- consider a separate validation split
# or cross-validation.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=3,
        random_state=0,
    )
    model.fit(X_train_scaled, y_train.ravel())
    print("Learning rate: ", learning_rate)

    # Score the model on each split, then report
    train_accuracy = model.score(X_train_scaled, y_train.ravel())
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))

    test_accuracy = model.score(X_test_scaled, y_test.ravel())
    print("Accuracy score (validation): {0:.3f}".format(test_accuracy))

    # Blank line between candidates
    print()

Learning rate:  0.05
Accuracy score (training): 0.683
Accuracy score (validation): 0.657

Learning rate:  0.1
Accuracy score (training): 0.716
Accuracy score (validation): 0.670

Learning rate:  0.25
Accuracy score (training): 0.856
Accuracy score (validation): 0.764

Learning rate:  0.5
Accuracy score (training): 0.926
Accuracy score (validation): 0.821

Learning rate:  0.75
Accuracy score (training): 0.928
Accuracy score (validation): 0.819

Learning rate:  1
Accuracy score (training): 0.927
Accuracy score (validation): 0.844



 ### Build Model with Optimal Learning Rate

In [10]:
# Build the final GradientBoostingClassifier.
# NOTE(review): n_estimators (500) and max_features (5) differ from the values
# used in the learning-rate search cell (100 and 2), so the rate chosen there
# was not tuned for this configuration -- confirm this is intentional.
# NOTE(review): the recorded output shows training accuracy 1.000 vs 0.848 on
# the test split, which suggests overfitting.
model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=1,
    max_features=5,
    max_depth=3,
    random_state=0,
)

# Train on the scaled training features
model.fit(X_train_scaled, y_train.ravel())

# Report accuracy on the training split and on the test split
train_accuracy = model.score(X_train_scaled, y_train)
print("Accuracy score (training): {0:.3f}".format(train_accuracy))

test_accuracy = model.score(X_test_scaled, y_test)
print("Accuracy score (validation): {0:.3f}".format(test_accuracy))

Accuracy score (training): 1.000
Accuracy score (validation): 0.848


In [11]:
# Predict labels for the scaled test features
predictions = model.predict(X_test_scaled)

# Fraction of test labels predicted correctly (shown as the cell's output)
accuracy_score(y_true=y_test, y_pred=predictions)

0.8476190476190476

 ### Model Evaluation

In [12]:
# Generate the confusion matrix and wrap it in a labelled DataFrame
# (rows are the actual classes, columns are the predicted classes)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,307,37
Actual 1,43,138


In [13]:
# Build the per-class precision / recall / f1 text report, then print it
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.89      0.88       344
           1       0.79      0.76      0.78       181

    accuracy                           0.85       525
   macro avg       0.83      0.83      0.83       525
weighted avg       0.85      0.85      0.85       525

