In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')
data = pd.concat([numerical, categorical, targets], axis = 1)
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [2]:
data.isna().sum().sum()


0

In [3]:
data.shape


(95412, 339)

In [4]:
data = data.drop(['TARGET_D'],axis=1)


In [5]:
# X-y

y = data['TARGET_B']
X = data.drop(['TARGET_B'],axis=1)

numericalX = X.select_dtypes(np.number).reset_index(drop=True)

categoricalX = X.select_dtypes(object).reset_index(drop=True)

In [6]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

In [7]:
X = pd.concat([numericalX, encoded_categorical], axis = 1)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
traindata = pd.concat([X_train,y_train], axis=1 ).reset_index(drop=True)

no_donate = traindata[traindata['TARGET_B']==0]
yes_donate = traindata[traindata['TARGET_B']==1]

from sklearn.utils import resample
yes_donate_upsampled = resample(yes_donate, 
                                    replace=True,
                                    n_samples = len(no_donate),
                                    random_state=42)


display(no_donate.shape)
display(yes_donate_upsampled.shape)

(72486, 355)

(72486, 355)

In [10]:
upsampled = pd.concat([no_donate,yes_donate_upsampled], axis=0)

upsampled = upsampled.reset_index(drop=True)

In [11]:
y_train = upsampled['TARGET_B']
X_train = upsampled.drop(['TARGET_B'],axis=1)

In [12]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.6245274949645448
0.6025258083110622


In [13]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.6254863008029137
0.6055127600482104


In [15]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
pred = clf.predict(X_test)

print("accuracy: ",accuracy_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

accuracy:  0.6055127600482104
recall:  0.555
f1:  0.1285019680481593


In [16]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,pred)

array([[11000,  7083],
       [  445,   555]])

In [17]:
#  548 we predicted that they are the donors and indeed they are.. 
#   452 we predicted that they are not donors but actually they are.. 
#    7175 we predicted that they are donors but actally they are not..

# so we need to progress the model in order to predict donors. Indeed they are donors and in our model also they are donors.

In [22]:
donations_positive = 548 * 15.62
donations_got_wasted = 452 * 15.62
spent_on_marketing = (7175+548)*0.68
wasted_on_marketing = 7175*0.68

print('Amount that donation gained: ', round(donations_positive,2))
print('Amount that our prediction was wrong: ', round(donations_got_wasted,2))
print('Amount that donation spend on marketing: ', round(spent_on_marketing,2))
print('Amount that donation spend on marketing but we predict wrong: ', round(wasted_on_marketing,2))

Amount that donation gained:  8559.76
Amount that our prediction was wrong:  7060.24
Amount that donation spend on marketing:  5251.64
Amount that donation spend on marketing but we predict wrong:  4879.0
