In [1]:
# Import the pandas library
import pandas as pd

# Import the numpy library
import numpy as np

# Set the option to display all columns in pandas
pd.set_option('display.max_columns', None)

# Import the warnings library
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')


# Read in the numerical, categorical, and target data from their respective CSV files
numerical_data = pd.read_csv('numerical.csv')
categorical_data = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')

# Combine the numerical and categorical data into one dataframe
data = pd.concat([numerical_data, categorical_data, targets], axis=1)

# Get the value counts for the 'TARGET_B' column
value_counts = data['TARGET_B'].value_counts()

# Print the value counts
print(value_counts)

0    90569
1     4843
Name: TARGET_B, dtype: int64


In [2]:
# Check for missing values in the data
missing_values = data.isna().sum().sum()
print("Number of missing values: ", missing_values)

Number of missing values:  0


In [3]:
# Get the shape of the data
shape = data.shape
print("Shape of the data: ", shape)

Shape of the data:  (95412, 339)


In [4]:
# Drop the 'TARGET_D' column from the data
data = data.drop(['TARGET_D'], axis=1)

In [5]:
# Split the data into the target ('y') and features ('X')
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)

# Select the numerical features
numerical_features = X.select_dtypes(np.number).reset_index(drop=True)

# Select the categorical features
categorical_features = X.select_dtypes(object).reset_index(drop=True)

In [6]:
# One-hot encode the categorical features
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categorical_features)
encoded_categorical = encoder.transform(categorical_features).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

In [7]:
# Concatenate the numerical and one-hot encoded categorical features
X = pd.concat([numerical_features, encoded_categorical], axis=1)

In [8]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
# Concatenate the features and target in the training set
train_data = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)

# Separate the 'no donation' and 'yes donation' cases in the training set
no_donation = train_data[train_data['TARGET_B'] == 0]
yes_donation = train_data[train_data['TARGET_B'] == 1]

# Upsample the 'yes donation' cases to match the number of 'no donation' cases
from sklearn.utils import resample
yes_donation_upsampled = resample(yes_donation, 
                                    replace=True,
                                    n_samples = len(no_donation),
                                    random_state=42)

In [10]:
# Concatenate the upsampled 'yes donation' cases and the 'no donation' cases
upsampled = pd.concat([no_donation, yes_donation_upsampled], axis=0)
upsampled = upsampled.reset_index(drop=True)

In [11]:
# Update the training set with the upsampled data
y_train = upsampled['TARGET_B']
X_train = upsampled.drop(['TARGET_B'], axis=1)

In [12]:
# Train a random forest classifier on the upsampled training set
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)

# Evaluate the classifier on the training and testing sets
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("Training set accuracy: ", train_score)
print("Testing set accuracy: ", test_score)

Training set accuracy:  0.6220856441243826
Testing set accuracy:  0.6046219147932715


In [13]:
# Importing Required Libraries
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Predict the target variable using the test data
pred = clf.predict(X_test)

# Calculate the accuracy score
print("Accuracy: ", accuracy_score(y_test, pred))

# Calculate the recall score
print("Recall: ", recall_score(y_test, pred))

# Calculate the f1 score
print("F1: ", f1_score(y_test, pred))

Accuracy:  0.6046219147932715
Recall:  0.548
F1:  0.12683717162365468


In [14]:
# Import the confusion matrix function
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
print(confusion_matrix(y_test, pred))

[[10990  7093]
 [  452   548]]


In [15]:
# Calculate the amount gained through successful donations
donations_positive = 548 * 15.62

# Calculate the amount wasted due to incorrect predictions
donations_got_wasted = 452 * 15.62

# Calculate the amount spent on marketing, including both successful and unsuccessful campaigns
spent_on_marketing = (7175 + 548) * 0.68

# Calculate the amount spent on marketing for unsuccessful campaigns
wasted_on_marketing = 7175 * 0.68

# Print the results of the financial calculations
print('Amount gained through successful donations: ', round(donations_positive, 2))
print('Amount wasted due to incorrect predictions: ', round(donations_got_wasted, 2))
print('Amount spent on marketing, including both successful and unsuccessful campaigns: ', round(spent_on_marketing, 2))
print('Amount spent on marketing for unsuccessful campaigns: ', round(wasted_on_marketing, 2))

Amount gained through successful donations:  8559.76
Amount wasted due to incorrect predictions:  7060.24
Amount spent on marketing, including both successful and unsuccessful campaigns:  5251.64
Amount spent on marketing for unsuccessful campaigns:  4879.0


# regression 

In [16]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Set display options to display all columns
pd.set_option('display.max_columns', None)

# Supress warnings
import warnings
warnings.filterwarnings('ignore')

# Load numerical and categorical data as well as target data into separate dataframes
numerical_data = pd.read_csv('numerical.csv')
categorical_data = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

# Concatenate the three dataframes along the columns axis to form a complete dataframe
data = pd.concat([numerical_data, categorical_data, target], axis=1)

# Get the value counts for the "TARGET_B" column
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [17]:
# Get only the rows where TARGET_B == 1
target_d = data[data["TARGET_B"] == 1]

In [18]:
# Split target column and feature data into separate variables
y = target_d['TARGET_D']
X = target_d.drop(['TARGET_B', 'TARGET_D'], axis=1)

# Split the feature data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
# Separate numerical and categorical data in the training set
X_train_num = X_train.select_dtypes(include=np.number)
X_train_cat = X_train.select_dtypes(object)

# Separate numerical and categorical data in the testing set
X_test_num = X_test.select_dtypes(include=np.number)
X_test_cat = X_test.select_dtypes(object)

In [20]:
# Scale the numerical data in the training and testing sets
from sklearn.preprocessing import MinMaxScaler

# Fit MinMaxScaler to training data and transform the training data
scaler = MinMaxScaler().fit(X_train_num)
X_scaled_train = pd.DataFrame(scaler.transform(X_train_num), columns=X_train_num.columns)

# Fit MinMaxScaler to testing data and transform the testing data
scaler2 = MinMaxScaler().fit(X_test_num)
X_scaled_test = pd.DataFrame(scaler2.transform(X_test_num), columns=X_test_num.columns)

X_scaled_test.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0.0,0.659574,0.5,1.0,0.0,0.02439,0.328571,0.363636,0.333333,0.058824,0.191489,0.135135,1.0,0.107244,0.105146,0.098077,1.0,0.0,0.0,0.484848,0.8,0.383838,0.010101,0.0,0.666667,0.032967,0.0,0.421053,0.166667,0.172414,0.173913,0.3,0.142857,0.011364,0.027027,0.0,0.042553,0.493333,0.533333,0.56,0.540541,0.594595,0.635135,0.169014,0.444444,0.353535,0.222222,0.133333,0.262626,0.232323,0.411765,0.423077,0.211538,0.103448,0.191919,0.62963,0.5,0.32,0.181818,0.221053,0.16,0.204301,0.443182,0.56338,0.212121,0.090909,0.030303,0.010101,0.434343,0.171717,0.171429,0.404762,0.195385,0.278571,0.060606,0.040404,0.036364,0.959184,0.938776,0.90625,0.0,0.0,0.0,0.512207,0.551997,0.461538,0.5,0.434343,0.575758,0.939394,0.071429,0.090909,0.161616,0.484848,0.353535,0.10101,0.646465,0.363636,0.162162,0.1,0.114286,0.595745,0.574074,0.317073,0.028169,0.272727,0.09375,0.0,0.015873,0.0,0.787879,0.939394,0.979798,0.979798,1.0,0.515152,0.447368,0.040404,0.373494,0.369048,0.180328,0.5,0.192308,0.822917,0.0,0.030303,0.117647,0.489583,0.0,0.717172,0.868687,0.939394,0.969697,0.358145,0.0,0.844495,0.224667,0.286,0.327843,0.394696,0.221909,0.170732,0.320755,0.477273,0.434783,0.34,0.166667,0.136364,0.095238,0.019231,0.136364,0.121212,0.4,0.262295,0.5,0.263158,0.166667,0.136364,0.017544,0.217391,0.090909,0.556701,0.101449,0.626263,0.426966,0.280488,0.59596,0.253731,0.309524,0.315789,0.0,0.125,0.153846,0.236842,0.032258,0.0,0.011364,0.188889,0.377358,0.04,0.606061,0.776596,0.767677,0.806818,0.767677,0.804598,0.808081,0.676768,0.959596,1.0,0.010101,0.425926,0.32,0.307692,0.179104,0.25,0.0,0.058824,0.25,0.022222,0.171429,0.058824,0.06,0.142857,0.020833,0.0,0.065217,0.095238,0.133333,0.095238,0.090909,0.301887,0.272727,0.333333,0.25,0.057143,0.261905,0.2,0.235294,0.216216,0.055556,0.191489,0.135135,0.075472,0.707071,0.225,0.071429,0.823529,0.132075,0.113636,0.313433,0.522727,0.258065,0.458333,0.241379,0.189189,0.231884,0.111111,0.216216,0.185714,0.0,0.012048,0.0,0.333333,0.298701,0.125,0.363636,0.272727,0.333333,0.121212,0.0,0.057143,0.0,0.065574,0.0,0.142857,0.083333,0.043478,0.0,0.0,0.055556,0.0,0.166667,0.0625,0.0,0.278481,0.427083,0.717172,0.010526,0.378378,0.0,0.888889,0.244898,0.079365,0.166667,0.45098,0.012658,0.020202,0.050505,0.434343,0.868687,0.141414,0.48,0.807692,0.030303,0.013158,0.414141,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.571429,0.4,0.411765,0.08,0.061224,0.047337,0.1875,0.109091,0.002288,0.0,0.02439,0.377358,0.109489,0.2,0.116883,0.22687,0.911215,0.0,0.0,0.42623,0.0,0.5,0.0,0.909091,0.0,0.344086,0.0,0.909091,0.0,0.928571,0.0,0.5,0.0,1.0,0.0
1,0.0,0.61289,0.166667,1.0,0.0,0.0,0.642857,0.262626,0.525253,0.323529,0.085106,0.054054,1.0,0.143973,0.167439,0.112278,0.79798,0.0,0.212121,0.494949,0.784615,0.979798,0.0,0.011111,0.0,0.054945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.021277,0.573333,0.706667,0.746667,0.581081,0.702703,0.743243,0.28169,0.414141,0.40404,0.191919,0.106667,0.161616,0.141414,0.352941,0.615385,0.384615,0.241379,0.171717,0.62963,0.55,0.42,0.121212,0.431579,0.3,0.419355,0.318182,0.619718,0.282828,0.151515,0.070707,0.020202,0.646465,0.131313,0.285714,0.154762,0.230769,0.322857,0.212121,0.20202,0.018182,0.071429,0.05102,0.041667,0.010101,0.0,0.010101,0.102508,0.121524,0.230769,0.25,0.79798,0.212121,0.656566,0.357143,0.646465,0.232323,0.676768,0.575758,0.171717,0.79798,0.212121,0.162162,0.2,0.142857,0.361702,0.37037,0.121951,0.239437,0.474747,0.34375,0.0,0.0,0.0,0.010101,0.040404,0.181818,0.353535,0.666667,0.0,0.236842,0.141414,0.457831,0.464286,0.196721,0.333333,0.051282,0.03125,0.75,0.040404,0.039216,0.041667,0.238095,0.060606,0.171717,0.484848,0.737374,0.668824,0.426357,0.854711,0.122,0.157333,0.177255,0.211388,0.102253,0.512195,0.396226,0.409091,0.282609,0.1,0.027778,0.0,0.0,0.0,0.439394,0.232323,0.42,0.278689,0.16,0.026316,0.041667,0.0,0.0,0.478261,0.136364,0.309278,0.275362,0.575758,0.483146,0.268293,0.757576,0.283582,0.0,0.0,0.0,0.0,0.153846,0.105263,0.032258,0.0,0.772727,0.322222,0.622642,0.26,0.616162,0.553191,0.585859,0.522727,0.565657,0.482759,0.636364,0.474747,0.808081,1.0,0.050505,0.12963,0.1,0.230769,0.149254,0.205882,0.0,0.294118,0.326923,0.044444,0.542857,0.147059,0.12,0.357143,0.041667,0.030303,0.23913,0.206349,0.116667,0.047619,0.121212,0.377358,0.151515,0.333333,0.15625,0.085714,0.214286,0.1,0.117647,0.189189,0.138889,0.085106,0.081081,0.09434,0.818182,0.05,0.0,0.705882,0.245283,0.477273,0.537313,0.5,0.129032,0.041667,0.034483,0.027027,0.26087,0.111111,0.378378,0.057143,0.0,0.0,0.0,0.666667,0.61039,0.0625,0.292929,0.30303,0.383838,0.090909,0.033333,0.142857,0.153846,0.213115,0.0,0.0,0.25,0.086957,0.051282,0.045455,0.0,0.0,0.166667,0.0625,0.0,0.037975,0.21875,0.959596,0.031579,0.0,0.025,0.909091,0.510204,0.301587,0.166667,0.254902,0.012658,0.222222,0.434343,0.79798,0.949495,0.060606,0.0,0.038462,0.252525,0.289474,0.505051,0.0,0.142857,0.013889,0.888889,0.090909,0.131313,1.0,0.89899,0.333333,0.4,0.529412,0.18,0.591837,0.426036,0.25,0.109091,0.058581,0.215909,0.268293,0.056604,0.072993,0.15,0.064935,0.058959,0.711968,0.0,0.0,0.885246,0.865385,0.5,0.333333,0.0,0.0,0.0,0.090909,0.090909,0.454545,0.857143,0.272727,0.0,0.272727,0.52381,0.909091
2,0.000998,0.489362,0.5,0.111111,0.025,0.0,0.8,0.292929,0.535354,0.558824,0.042553,0.0,1.0,0.01796,0.020351,0.012907,0.0,0.757576,0.252525,0.474747,0.815385,1.0,0.0,0.0,0.0,0.010989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011364,0.0,0.0,0.0,0.493333,0.613333,0.653333,0.527027,0.662162,0.702703,0.380282,0.333333,0.434343,0.252525,0.106667,0.171717,0.222222,0.352941,0.538462,0.326923,0.155172,0.121212,0.518519,0.533333,0.54,0.151515,0.368421,0.3,0.354839,0.318182,0.422535,0.424242,0.242424,0.090909,0.020202,0.656566,0.111111,0.257143,0.178571,0.264615,0.354286,0.888889,0.888889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039632,0.046014,0.153846,0.083333,0.808081,0.20202,0.787879,0.22449,0.0,0.343434,0.707071,0.616162,0.282828,0.868687,0.141414,0.162162,0.2,0.114286,0.234043,0.425926,0.02439,0.309859,0.535354,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.010101,0.111111,0.0,0.013158,0.393939,0.60241,0.630952,0.229508,0.333333,0.0,0.0,0.114583,0.171717,0.0,0.0,0.071429,0.0,0.010101,0.040404,0.272727,0.0,0.665116,0.684449,0.129333,0.148,0.17098,0.203588,0.096877,0.5,0.603774,0.272727,0.217391,0.06,0.055556,0.0,0.0,0.0,0.439394,0.363636,0.32,0.180328,0.1,0.078947,0.0,0.0,0.0,0.5,0.022727,0.350515,0.101449,0.323232,0.764045,0.085366,0.737374,0.134328,0.0,0.0,0.0,0.0,0.0,0.263158,0.225806,0.0,0.318182,0.088889,0.245283,0.0,0.363636,0.595745,0.676768,0.534091,0.636364,0.528736,0.535354,0.444444,0.252525,1.0,0.111111,0.240741,0.14,0.0,0.119403,0.279412,0.0,0.0,0.211538,0.044444,0.228571,0.676471,0.06,0.428571,0.041667,0.060606,0.065217,0.68254,0.0,0.0,0.060606,0.207547,0.151515,0.111111,0.0,0.0,0.190476,0.26,0.205882,0.027027,0.527778,0.042553,0.0,0.245283,0.616162,0.15,0.0,0.705882,0.150943,0.409091,0.567164,0.386364,0.193548,0.145833,0.103448,0.027027,0.304348,0.111111,0.405405,0.1,0.0,0.0,0.0,0.722222,0.727273,0.0,0.292929,0.232323,0.535354,0.111111,0.0,0.285714,0.076923,0.163934,0.0,0.0,0.125,0.0,0.0,0.022727,0.0,0.0,0.166667,0.0625,0.0,0.0,0.833333,0.989899,0.010526,0.0,0.025,0.979798,0.673469,0.238095,0.458333,0.784314,0.0,0.0,0.050505,0.232323,0.333333,0.676768,0.0,0.0,0.878788,0.026316,0.030303,0.0,0.0,0.097222,0.979798,0.020202,0.79798,1.0,0.949495,0.238095,0.4,0.411765,0.12,0.122449,0.094675,0.1875,0.127273,0.004577,0.011364,0.0,0.09434,0.109489,0.2,0.155844,0.128868,0.550699,1.0,0.0,0.95082,0.846154,1.0,0.333333,0.818182,0.0,0.516129,0.0,0.727273,1.0,0.857143,1.0,0.0,1.0,0.904762,1.0
3,0.027944,0.840426,0.333333,0.888889,0.054167,0.0,0.857143,0.070707,0.757576,0.294118,0.06383,0.081081,0.777778,0.051955,0.083802,0.046413,0.171717,0.0,0.838384,0.484848,0.8,1.0,0.0,0.0,0.0,0.010989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.853333,0.866667,0.866667,0.797297,0.837838,0.851351,0.084507,0.444444,0.353535,0.212121,0.026667,0.050505,0.070707,0.235294,0.884615,0.807692,0.241379,0.191919,0.703704,0.383333,0.46,0.161616,0.684211,0.22,0.666667,0.204545,1.0,0.111111,0.050505,0.010101,0.0,0.818182,0.050505,0.228571,0.071429,0.224615,0.285714,0.707071,0.474747,0.036364,0.234694,0.214286,0.1875,0.0,0.0,0.0,0.205017,0.221642,0.461538,0.5,0.909091,0.10101,0.636364,0.377551,0.757576,0.080808,0.818182,0.767677,0.060606,0.89899,0.111111,0.054054,0.0,0.057143,0.170213,0.259259,0.04878,0.070423,0.272727,0.697917,0.0,0.0,0.0,0.10101,0.242424,0.717172,0.848485,0.989899,0.020202,0.026316,0.30303,0.566265,0.607143,0.196721,0.166667,0.128205,0.135417,0.072917,0.050505,0.019608,0.020833,0.02381,0.69697,0.838384,0.848485,0.888889,0.81014,1.0,0.611805,0.236,0.277333,0.321569,0.361934,0.226653,0.182927,0.339623,0.363636,0.413043,0.44,0.138889,0.136364,0.047619,0.019231,0.106061,0.161616,0.32,0.377049,0.56,0.157895,0.166667,0.045455,0.017544,0.793478,0.045455,0.773196,0.028986,0.686869,0.359551,0.463415,0.868687,0.104478,0.0,0.0,0.0,0.0,0.0,0.078947,0.129032,0.067568,0.113636,0.211111,0.396226,0.02,0.757576,0.308511,0.323232,0.295455,0.292929,0.287356,0.515152,0.484848,0.0,0.535354,0.090909,0.203704,0.4,0.615385,0.328358,0.147059,0.0,0.294118,0.25,0.022222,0.057143,0.088235,0.02,0.214286,0.020833,0.0,0.043478,0.047619,0.066667,0.190476,0.030303,0.528302,0.424242,0.222222,0.1875,0.114286,0.309524,0.22,0.058824,0.081081,0.277778,0.06383,0.081081,0.056604,0.808081,0.05,0.0,0.729412,0.037736,0.227273,0.522388,0.568182,0.193548,0.25,0.172414,0.054054,0.043478,0.111111,0.081081,0.028571,0.0,0.0,0.0,0.805556,0.779221,0.1875,0.070707,0.242424,0.757576,0.0,0.033333,0.2,0.076923,0.213115,0.04,0.142857,0.25,0.217391,0.0,0.068182,0.0,0.055556,0.333333,0.0625,0.2,0.050633,0.041667,0.979798,0.0,0.0,0.075,0.989899,0.387755,0.079365,0.125,0.117647,0.177215,0.535354,0.757576,0.959596,1.0,0.010101,0.213333,0.115385,0.0,0.065789,0.929293,0.011364,0.0,0.0,0.959596,0.040404,0.949495,1.0,1.0,0.380952,0.6,0.529412,0.34,0.55102,0.337278,0.25,0.109091,0.029748,0.193182,0.317073,0.037736,0.007299,0.05,0.337662,0.023606,0.231497,0.0,0.666667,0.196721,0.538462,1.0,0.333333,0.0,0.0,0.16129,0.090909,0.181818,1.0,0.642857,0.909091,0.5,0.181818,0.52381,0.818182
4,1.0,0.553191,0.666667,1.0,0.0,0.0,0.442857,0.282828,0.323232,0.117647,0.085106,0.027027,1.0,0.119798,0.135672,0.080658,0.0,0.0,1.0,0.484848,0.8,0.989899,0.010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.466667,0.573333,0.613333,0.5,0.621622,0.675676,0.352113,0.353535,0.414141,0.252525,0.16,0.20202,0.20202,0.470588,0.5,0.211538,0.155172,0.131313,0.518519,0.55,0.42,0.181818,0.284211,0.26,0.268817,0.261364,0.450704,0.454545,0.262626,0.090909,0.020202,0.626263,0.090909,0.285714,0.22619,0.284615,0.371429,0.757576,0.747475,0.036364,0.091837,0.071429,0.041667,0.020202,0.046512,0.0,0.074247,0.082589,0.153846,0.166667,0.787879,0.222222,0.919192,0.091837,0.030303,0.373737,0.757576,0.646465,0.30303,0.878788,0.131313,0.189189,0.1,0.171429,0.234043,0.333333,0.04878,0.28169,0.585859,0.197917,0.0,0.015873,0.0,0.0,0.010101,0.050505,0.141414,0.424242,0.0,0.052632,0.434343,0.626506,0.630952,0.229508,0.333333,0.076923,0.03125,0.15625,0.111111,0.078431,0.041667,0.071429,0.0,0.020202,0.080808,0.393939,0.0,0.32093,0.736663,0.176667,0.216,0.238431,0.275351,0.12419,0.353659,0.358491,0.363636,0.456522,0.26,0.027778,0.045455,0.0,0.0,0.257576,0.191919,0.36,0.42623,0.32,0.052632,0.041667,0.0,0.0,0.347826,0.159091,0.381443,0.15942,0.353535,0.730337,0.073171,0.767677,0.19403,0.0,0.0,0.0,0.0,0.0,0.157895,0.16129,0.202703,0.318182,0.2,0.45283,0.14,0.545455,0.62766,0.757576,0.522727,0.666667,0.482759,0.575758,0.464646,0.727273,1.0,0.060606,0.166667,0.12,0.230769,0.119403,0.161765,0.066667,0.117647,0.25,0.133333,0.485714,0.382353,0.12,0.357143,0.145833,0.030303,0.130435,0.396825,0.083333,0.333333,0.090909,0.301887,0.151515,0.111111,0.09375,0.028571,0.190476,0.1,0.117647,0.081081,0.111111,0.085106,0.054054,0.188679,0.747475,0.15,0.0,0.705882,0.207547,0.363636,0.641791,0.340909,0.16129,0.125,0.051724,0.054054,0.304348,0.111111,0.513514,0.042857,0.0,0.0,0.0,0.444444,0.415584,0.0625,0.272727,0.232323,0.30303,0.121212,0.0,0.171429,0.076923,0.393443,0.0,0.0,0.208333,0.043478,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.583333,0.989899,0.0,0.0,0.025,0.919192,0.663265,0.380952,0.375,0.509804,0.012658,0.111111,0.191919,0.454545,0.555556,0.454545,0.0,0.0,0.565657,0.210526,0.161616,0.045455,0.0,0.111111,0.747475,0.252525,0.545455,1.0,0.929293,0.428571,0.4,0.352941,0.14,0.163265,0.118343,0.3125,0.163636,0.005492,0.056818,0.097561,0.056604,0.007299,0.04,0.064935,0.024332,0.334131,0.0,1.0,0.704918,0.846154,1.0,0.333333,0.818182,0.0,0.451613,0.0,0.818182,0.909091,0.857143,0.818182,0.5,0.181818,0.952381,0.0


In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Initialize OneHotEncoder with 'drop' parameter set to 'first' to avoid multicollinearity
encoder = OneHotEncoder(drop='first').fit(X_train_cat)

# Encode train data using OneHotEncoder
encoded_train = encoder.transform(X_train_cat).toarray()
encoded_train = pd.DataFrame(encoded_train)

# Encode test data using OneHotEncoder
encoded_test = encoder.transform(X_test_cat).toarray()
encoded_test = pd.DataFrame(encoded_test)

# Scale train data
train_scaled = pd.concat([encoded_train,X_scaled_train],axis=1)

# Scale test data
test_scaled = pd.concat([encoded_test,X_scaled_test],axis=1)

# Initialize DecisionTreeRegressor, LinearRegression and RandomForestRegressor models
DT = DecisionTreeRegressor()
LR = LinearRegression()
RF = RandomForestRegressor()

# Create a list of models
model_pipeline = [DT, LR, RF]

# Create a list of model names
model_names = ['Regression Tree', 'Linear Regression','Random Forest']

# Create a dictionary to store cross-validation mean scores of each model
scores = {}

# Iterate through models in the pipeline and store their cross-validation mean scores in the 'scores' dictionary
for i, model in enumerate(model_pipeline):
    mean_score = np.mean(cross_val_score(model, train_scaled, y_train, cv=10))
    scores[model_names[i]] = mean_score

# Print the cross-validation mean scores for each model
print("Cross-validation mean scores: ", scores)

# Create a dictionary to store validation scores of each model
val_scores = {}

# Iterate through models in the pipeline and store their validation scores in the 'val_scores' dictionary
for i, model in enumerate(model_pipeline):
    model.fit(train_scaled, y_train)
    val_scores[model_names[i]] = model.score(test_scaled,y_test)

# Print the validation scores for each model
print("Validation scores: ", val_scores)

# Fit the RandomForestRegressor model to the training data
LR =RandomForestRegressor().fit(train_scaled, y_train)

# Make predictions using the fitted model on the test data
pred = LR.predict(test_scaled)

# Calculate performance metrics for the predictions
r2 = r2_score(y_test,pred)
mse = mean_squared_error(y_test,pred)
mae = mean_absolute_error(y_test,pred)
rmse = np.sqrt(mse)

# Print the results
print('R2 Score:', r2)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('Root Mean Squared Error:', rmse)

# Calculate the average donation prediction
average_prediction = np.round(np.mean(pred), 2)
print('The average donation prediction is:', average_prediction)


Cross-validation mean scores:  {'Regression Tree': -0.08326034192472656, 'Linear Regression': 0.4117895634825753, 'Random Forest': 0.5183023510847171}
Validation scores:  {'Regression Tree': -18.115963297091824, 'Linear Regression': -1.8122276315051424, 'Random Forest': -9.58656427797558}
R2 Score: -10.040340870205988
Mean Squared Error: 2103.898954503117
Mean Absolute Error: 42.77806831785345
Root Mean Squared Error: 45.86827830323607
The average donation prediction is: 58.85
