In [None]:
import pandas as pd        
import numpy as np


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


#scores = cross_val_score(model, X, y, cv=5) # will give a negative score in some folds, because the rows are ordered
#scores

In [None]:
# Load the three input tables (file names suggest numeric features, categorical
# features and targets) in one pass.
file1, file2, file3 = (
    pd.read_csv(csv_name)
    for csv_name in ('numerical.csv', 'categorical.csv', 'target.csv')
)

In [None]:
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Put the three tables side by side (column-wise), aligned on their row index.
file1_file3 = pd.concat([file1, file2, file3], axis='columns')

In [None]:
# cost_data = file1_file3[file1_file3['TARGET_B']==1]
# X = donations_data.drop(columns=['TARGET_B','TARGET_D'])
# y = donations_data['TARGET_D']

# Splitting the Data

In [None]:
# Both target columns are kept together in y; every other column is a feature.
target_cols = ['TARGET_B', 'TARGET_D']
y = file1_file3.loc[:, target_cols]

# NOTE: despite its name, X_num holds every non-target column of the combined frame;
# the numeric/object split happens after the train/test split.
X_num = file1_file3.drop(target_cols, axis=1)
X_cat = file2

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_num,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Separate each split by dtype: numeric columns get scaled, object columns get encoded.
X_train_num, X_test_num = (
    split.select_dtypes(include=np.number) for split in (X_train, X_test)
)
X_train_cat, X_test_cat = (
    split.select_dtypes(include=object) for split in (X_train, X_test)
)

# Scaling the Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training numerics only, so that no statistics from the
# test split influence the transformation.
transformer = StandardScaler().fit(X_train_num)

# transform() returns a bare ndarray; keep it under its own name instead of
# rebinding X_train_num_scaled from an array to a DataFrame.
X_train_scaled_arr = transformer.transform(X_train_num)
print(X_train_scaled_arr.shape)

# Re-attach the column names. The resulting frame has a fresh 0..n-1 index,
# not the original row labels of X_train.
X_train_num_scaled = pd.DataFrame(X_train_scaled_arr, columns=X_train_num.columns)

# Summary statistics of the scaled columns (only the last expression of a cell is
# displayed, so the former mid-cell .head() call was a no-op and is removed).
X_train_num_scaled.describe().T

In [None]:
# Scale the held-out numerics with the scaler that was fitted on the training
# split; the scaler is not re-fitted here.
X_test_scaled_arr = transformer.transform(X_test_num)

# Wrap the array back into a DataFrame carrying the original column names.
X_test_num_scaled = pd.DataFrame(
    data=X_test_scaled_arr,
    columns=X_test_num.columns,
)
X_test_num_scaled

# Encoding the Data

In [None]:
# One-hot encode the categorical columns, learning the category levels from the
# training split only.
from sklearn.preprocessing import OneHotEncoder

# drop='first' removes one level per feature to avoid perfectly collinear dummies.
# handle_unknown='ignore' keeps transform() from raising when another split contains
# a level that never occurred in training; such a value is encoded as all zeros for
# that feature (scikit-learn emits a warning when this happens).
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(X_train_cat)

# transform() yields a sparse matrix; densify it so it can be wrapped in a DataFrame.
X_train_encoded_cat = encoder.transform(X_train_cat).toarray()
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)

X_train_onehot_encoded = pd.DataFrame(X_train_encoded_cat, columns=cols)
X_train_onehot_encoded.head()

In [None]:
# Encode the held-out categoricals with the encoder fitted on the training split,
# reusing the training feature names so both frames share the same columns.
X_test_encoded_cat = encoder.transform(X_test_cat).toarray()
X_test_onehot_encoded_cat = pd.DataFrame(
    data=X_test_encoded_cat,
    columns=cols,
)
X_test_onehot_encoded_cat.head()

# Concatenating Scaled and Encoded Data

In [None]:
# Combine the scaled numeric and one-hot-encoded training features column-wise
# into the final training design matrix.
X_train_treated = pd.concat(
    [X_train_num_scaled, X_train_onehot_encoded],
    axis='columns',
)
X_train_treated

In [None]:
# Combine the scaled numeric and one-hot-encoded test features column-wise
# into the final test design matrix.
X_test_treated = pd.concat(
    [X_test_num_scaled, X_test_onehot_encoded_cat],
    axis='columns',
)
X_test_treated

# Running the Regressor

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: predict the binary TARGET_B column from the treated features.
LogReg = LogisticRegression(solver='lbfgs', random_state=0)
LogReg.fit(X_train_treated, y_train['TARGET_B'])

# Mean accuracy on the held-out split (shown as the cell output).
LogReg.score(X_test_treated, y_test['TARGET_B'])

In [None]:
# Accuracy alone can look acceptable while hiding how each class is treated,
# so precision, recall and F1 are reported as well.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

pred = LogReg.predict(X_test_treated)
y_test_b = y_test['TARGET_B']

# NOTE(review): pos_label=0 scores class 0. The class that is oversampled later in
# the notebook is class 1 - confirm which class these metrics are meant to describe.
print("accuracy:", accuracy_score(y_test_b, pred))
print("precision: ", precision_score(y_test_b, pred, pos_label=0))
print("recall: ", recall_score(y_test_b, pred, pos_label=0))
print("f1: ", f1_score(y_test_b, pred, pos_label=0))


In [None]:
# Confusion matrix of the baseline predictions on the test split
# (rows: actual TARGET_B, columns: predicted TARGET_B).
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test['TARGET_B'], y_pred=pred)

# Handling Imbalance in the Data

In [None]:
# Count the rows per TARGET_B class in the training split and plot them as bars
# to make the class imbalance visible.
count_classes = y_train['TARGET_B'].value_counts()
count_classes.plot.bar()

In [None]:
pd.set_option('display.max_columns', None)

# X_train_treated was rebuilt from arrays and therefore carries a fresh 0..n-1
# index, while y_train still has the shuffled row labels from the split. Re-index
# y_train so the column-wise concat pairs each feature row with its own targets.
# A new object is bound instead of mutating in place, so other references to the
# original frame are not silently changed.
y_train = y_train.reset_index(drop=True)

# Features and both targets in one frame, used for class-wise resampling.
trainset = pd.concat([X_train_treated, y_train], axis=1)
trainset

In [None]:
# y_train['TARGET_B'].value_counts()

In [None]:
# data_B = trainset.drop(['TARGET_D'], axis=1).copy()
# data_B.head()

In [None]:
from sklearn.utils import resample

# Partition the training frame by the binary target so the two classes can be
# resampled independently.
is_class_0 = trainset['TARGET_B'].eq(0)
is_class_1 = trainset['TARGET_B'].eq(1)
category_0 = trainset.loc[is_class_0]
category_1 = trainset.loc[is_class_1]

In [None]:
# Oversample class 1 with replacement until it has as many rows as class 0.
# random_state pins the draw so the balanced training set - and every metric
# computed from it - is identical on each Restart & Run All.
category_1_oversampled = resample(
    category_1,
    replace=True,
    n_samples=len(category_0),
    random_state=42,
)

In [None]:
# Both frames should now report the same number of rows.
print(category_0.shape, category_1_oversampled.shape, sep="\n")

In [None]:
# Stack the class-0 rows on top of the oversampled class-1 rows to obtain the
# balanced training frame (row labels are kept as they are, so they may repeat).
data_upsampled = pd.concat((category_0, category_1_oversampled), axis='index')

In [None]:
# Display the balanced frame: the class-0 rows followed by the oversampled class-1 rows.
data_upsampled 

In [None]:
# Features of the balanced training set: everything except the two target columns.
X_train_1 = data_upsampled.drop(columns=['TARGET_B', 'TARGET_D'])

In [None]:
# Binary target of the balanced training set.
y_train_1 = data_upsampled.loc[:, 'TARGET_B']

# Run the Regressor Again

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Refit the same classifier on the class-balanced (oversampled) training data and
# evaluate it on the untouched test split.
LogReg = LogisticRegression(random_state=0, solver='lbfgs')
LogReg.fit(X_train_1, y_train_1)

# The cell used to fit a second model, LR_over, with identical hyper-parameters on
# identical data, and then overwrote its predictions with LogReg's. The name is kept
# as an alias of the single fitted model so any later reference still resolves.
LR_over = LogReg

pred = LogReg.predict(X_test_treated)

# NOTE(review): pos_label=0 scores class 0, while the class that was oversampled is
# class 1 - confirm which class these metrics are meant to describe.
print("accuracy:"   ,accuracy_score(y_test['TARGET_B'], pred))
print("precision: ",precision_score(y_test['TARGET_B'],pred, pos_label= 0))
print("recall: ",recall_score(y_test['TARGET_B'],pred, pos_label= 0))
print("f1: ",f1_score(y_test['TARGET_B'],pred, pos_label= 0))

# Re-running Logistic Regression on SMOTE-resampled Data

In [None]:
# NOTE(review): this cell does not run as written against the cells visible in this notebook:
#   * X_train_SMOTE, y_train_SMOTE and X_test_scaled are not defined in any visible cell
#     (no SMOTE resampling step is shown); the test features built earlier are named
#     X_test_treated.
#   * y_test holds both TARGET_B and TARGET_D, yet it is passed whole to the metric calls.
#   * pos_label='Yes' does not match TARGET_B, which other cells compare against 0 and 1.
# TODO: add the missing SMOTE step and align the names, or remove this cell.
#
# Original author's note (figures cannot be reproduced from the visible code): applying
# SMOTE improved the model a bit, with true positives rising from 257 to 431 (+174).
from sklearn.linear_model import LogisticRegression

LogReg = LogisticRegression(random_state=0, solver='lbfgs')
LogReg.fit(X_train_SMOTE, y_train_SMOTE)
pred = LogReg.predict(X_test_scaled)
print("accuracy:",accuracy_score(y_test, pred))
print("precision: ",precision_score(y_test,pred, pos_label='Yes'))
print("recall: ",recall_score(y_test,pred, pos_label='Yes'))
print("f1: ",f1_score(y_test,pred, pos_label='Yes'))

# Applying the Model (Decision Tree Regression) to Predict Donations

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Predict the donation amount (TARGET_D) with a shallow decision tree.
#
# The model is fitted on the *training* features against the TARGET_D column.
# The previous version fitted on the test features and passed the whole upsampled
# frame (features and both targets, with a different number of rows) as the target,
# then scored against the two-column y_test.
#
# X_train_treated carries a fresh 0..n-1 index but keeps the row order of y_train,
# and scikit-learn pairs X and y by position, so the rows line up.
regr = DecisionTreeRegressor(max_depth=5, random_state=0)
model = regr.fit(X_train_treated, y_train['TARGET_D'])

# R2 on unseen data first, then on the data the tree was fitted on, to expose over-fitting.
print("test data R2 score was: ",regr.score(X_test_treated, y_test['TARGET_D']))
print("train data R2 score was: ",regr.score(X_train_treated, y_train['TARGET_D']))