# **Import Data**

## Set the input file path below

In [1]:
# Path of the input CSV; edit this value to point the notebook at another file
data_name = 'data_new.csv'

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the CSV located at `data_name` into a DataFrame
data_new = pd.read_csv(data_name)

In [4]:
# Map a numeric total onto one of three tiers
def categorize(total):
    """Return the tier label ('High', 'Mid' or 'Low') for a numeric total.

    Thresholds:
      * total > 9999          -> 'High'
      * 1000 < total <= 9999  -> 'Mid'
      * anything else         -> 'Low' (this includes exactly 1000 and NaN,
        because every comparison against NaN is False)
    """
    if total > 9999:
        return 'High'
    if total > 1000:
        return 'Mid'
    return 'Low'

# Label every row with the tier of its total_2022 value (adds the 'category' column in place)
data_new['category'] = data_new['total_2022'].map(categorize)

data_new

Unnamed: 0,ID,Constituency Code,location,total_gift,Total Gift Times,total_event,types_of_event,Avg_Events_Per_Year,Num_Different_Zips,Max Gift Amount Since 2019,Min Gift Amount Since 2019,Max Sub Event Attended,Max Major Event Attended,total_2022,Gift Amount_y_avg_10_years,total_event_10,County,category
0,1026027,Individual,Not Chicago,410.0,19,17,9,1.545455,1,410.0,410.0,event_general operating budget,direct_mail,0.0,0.0,0.0,Other US County,Low
1,1041282,Individual,Not Chicago,30.0,3,3,2,1.000000,1,30.0,30.0,event_direct mail,direct_mail,0.0,0.0,0.0,Other US County,Low
2,1041328,Individual,Not Chicago,106.0,14,11,6,1.375000,1,106.0,106.0,event_direct mail,direct_mail,0.0,0.0,0.0,Other US County,Low
3,1111161,Individual,Not Chicago,80.0,16,15,2,1.875000,1,80.0,80.0,event_direct mail,direct_mail,0.0,0.0,0.0,Other US County,Low
4,1113516,Individual,Not Chicago,151.0,35,35,6,2.333333,1,151.0,151.0,event_direct mail,direct_mail,0.0,3.0,1.0,Other US County,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40456,1274319,Individual,Not Chicago,660.0,1,0,0,0.000000,1,660.0,660.0,event_annual giving,annual_giving,0.0,0.0,0.0,Dupage County,Low
40457,1262892,Parent/Family/Guardian,Not Chicago,54.0,1,1,1,1.000000,1,54.0,54.0,event_newsletter,direct_mail,0.0,0.0,0.0,Cook County,Low
40458,1274321,Individual,Chicago,150.0,1,1,1,1.000000,1,150.0,150.0,event_annual giving,annual_giving,0.0,0.0,0.0,Cook County,Low
40459,1274320,Individual,Not Chicago,54.0,1,0,0,0.000000,0,54.0,54.0,event_annual giving,annual_giving,0.0,0.0,0.0,Other US County,Low


# Logistic Regression (balanced data+regularization)

In [5]:
# Inspect the dtype of every column; object columns are the non-numeric ones
data_new.dtypes

ID                              int64
Constituency Code              object
location                       object
total_gift                    float64
Total Gift Times                int64
total_event                     int64
types_of_event                  int64
Avg_Events_Per_Year           float64
Num_Different_Zips              int64
Max Gift Amount Since 2019    float64
Min Gift Amount Since 2019    float64
Max Sub Event Attended         object
Max Major Event Attended       object
total_2022                    float64
Gift Amount_y_avg_10_years    float64
total_event_10                float64
County                         object
category                       object
dtype: object

In [6]:
# Class imbalance: undersample the dominant "Low" class and keep every non-"Low" row.
# NOTE(review): resampling happens before the train/test split, so the held-out set
# has the resampled class mix rather than the original one — confirm this is intended.
LOW_SAMPLE_SIZE = 300  # number of "Low" rows to keep

low = data_new[data_new['category'] == "Low"]
not_low = data_new[data_new['category'] != "Low"]

# DataFrame.sample raises ValueError when n exceeds the number of rows (replace=False),
# so cap the request at the rows available. random_state keeps the draw reproducible.
sampled_data_new = low.sample(n=min(LOW_SAMPLE_SIZE, len(low)), random_state=42)

combined_data_new = pd.concat([sampled_data_new, not_low], ignore_index=True)
combined_data_new

Unnamed: 0,ID,Constituency Code,location,total_gift,Total Gift Times,total_event,types_of_event,Avg_Events_Per_Year,Num_Different_Zips,Max Gift Amount Since 2019,Min Gift Amount Since 2019,Max Sub Event Attended,Max Major Event Attended,total_2022,Gift Amount_y_avg_10_years,total_event_10,County,category
0,1123516,Individual,Not Chicago,20.00,2,2,2,2.000,1,20.00,20.00,event_newsletter,direct_mail,0.0,0.0,0.0,Other US County,Low
1,1240369,Trusts/Estates,Chicago,500.00,1,1,1,1.000,1,500.00,500.00,event_renewal,direct_mail,0.0,0.0,0.0,Other US County,Low
2,1084767,Individual,Not Chicago,2824.95,54,47,9,5.875,1,2824.95,2824.95,event_direct mail,direct_mail,0.0,0.0,0.0,Other US County,Low
3,1263681,Parent/Family/Guardian,Not Chicago,125.00,2,2,2,1.000,1,125.00,125.00,event_circle of love,direct_mail,0.0,62.5,2.0,Lake County,Low
4,1073149,Business/Corporation,Chicago,30.00,6,6,2,1.500,1,30.00,30.00,event_direct mail,direct_mail,0.0,0.0,0.0,Other US County,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691,1273610,Individual,Not Chicago,3000.00,1,1,1,1.000,1,3000.00,3000.00,event_circle of love,direct_mail,3000.0,0.0,0.0,Other US County,Mid
692,1273612,Individual,Not Chicago,1074.30,1,1,1,1.000,1,1074.30,1074.30,event_annual giving,annual_giving,1074.3,0.0,0.0,Cook County,Mid
693,1273616,Individual,Not Chicago,1000.00,1,1,1,1.000,1,1000.00,1000.00,event_annual giving,annual_giving,2000.0,0.0,0.0,Cook County,Mid
694,1273616,Individual,Not Chicago,1000.00,1,1,1,1.000,1,1000.00,1000.00,event_annual giving,annual_giving,2000.0,0.0,0.0,Lake County,Mid


In [7]:
# One-hot encode the categorical predictors listed below
categorical_columns = [
    'Constituency Code',
    'location',
    'County',
]

# Only the listed columns are expanded into indicator columns
data_encoded_new = pd.get_dummies(data=combined_data_new, columns=categorical_columns)

In [8]:
from sklearn.preprocessing import StandardScaler
# Numeric columns to standardize
numeric_columns = [
    'total_gift',
    'Total Gift Times',
    'total_event',
    'types_of_event',
    'Avg_Events_Per_Year',
    'Num_Different_Zips',
    'Max Gift Amount Since 2019',
    'Min Gift Amount Since 2019',
    'total_2022',
    'Gift Amount_y_avg_10_years',
    'total_event_10',
]

# Selection of the numeric columns from the encoded frame, taken before scaling
X_numeric = data_encoded_new[numeric_columns]

# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split made
# later in the notebook, so test rows contribute to the scaling statistics — consider
# fitting on the training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_encoded_new[numeric_columns])

# Overwrite the numeric columns with their standardized values
data_encoded_new[numeric_columns] = scaled_values


# Model 1: Baseline (scikit-learn default settings: L2 penalty, C=1)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

# Target: the tier label stored in 'category'
y = data_encoded_new['category']

# Predictors: numeric features plus the one-hot indicator columns.
# NOTE(review): total_gift may already include the 2022 gifts that define the target
# (possible target leakage) — confirm how that column was built.
feature_columns = [
    'total_gift', 'Total Gift Times', 'total_event', 'types_of_event',
    'Avg_Events_Per_Year', 'Num_Different_Zips',
    'Gift Amount_y_avg_10_years', 'total_event_10',
    'Constituency Code_Board of Directors',
    'Constituency Code_Business/Corporation',
    'Constituency Code_Foundation',
    'Constituency Code_Government',
    'Constituency Code_Individual',
    'Constituency Code_Organization/Institution',
    'Constituency Code_Parent/Family/Guardian',
    'Constituency Code_Staff',
    'Constituency Code_Trusts/Estates',
    'Constituency Code_Vendor',
    'Constituency Code_Volunteer',
    'location_Not Chicago',
    'location_Chicago',
    'County_Cook County',
    'County_Dupage County',
    'County_Kane County',
    'County_Lake County',
    'County_Mchenry County',
    'County_Other Illinois County',
    'County_Other US County',
]
X = data_encoded_new[feature_columns]

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: LogisticRegression() with default arguments applies an L2 penalty (C=1.0),
# so this baseline is itself regularized.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)

0.8142857142857143

# Model 2: LASSO

In [10]:
# L1-penalized (LASSO) logistic regression at a fixed strength, C=0.1
# NOTE(review): recent scikit-learn releases warn about using liblinear with a
# multiclass target — confirm against the installed version.
model1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1).fit(X_train, y_train)
y_pred = model1.predict(X_test)

accuracy_score(y_test, y_pred)

0.7571428571428571

In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate inverse-regularization strengths for the L1 model
param_grid = dict(C=[0.01, 0.1, 1, 10, 20, 50, 100])

# 5-fold cross-validated search over C, scored by accuracy, on the training rows only
grid_search_l1 = GridSearchCV(estimator=model1, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search_l1.fit(X_train, y_train)



# Model 3: Ridge

In [12]:
# L2-penalized (ridge) logistic regression at a fixed strength, C=0.1
model2 = LogisticRegression(penalty='l2', solver='liblinear', C=0.1).fit(X_train, y_train)
y_pred = model2.predict(X_test)

accuracy_score(y_test, y_pred)

0.7928571428571428

In [13]:
# Same candidate C values as used for the L1 search
param_grid = dict(C=[0.01, 0.1, 1, 10, 20, 50, 100])

# 5-fold cross-validated search over C, scored by accuracy, on the training rows only
grid_search_l2 = GridSearchCV(estimator=model2, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search_l2.fit(X_train, y_train)

# Fine-tuned models:

In [14]:
# Best C found by each cross-validated search.
# NOTE(review): the recorded run picked C=100 for both models, which is the largest
# value in the grid — consider widening the grid.
best_C_l1 = grid_search_l1.best_params_['C']
best_C_l2 = grid_search_l2.best_params_['C']

# Refit each model on the training rows with its selected C
final_l1_model = LogisticRegression(penalty='l1', solver='liblinear', C=best_C_l1).fit(X_train, y_train)
final_l2_model = LogisticRegression(penalty='l2', solver='liblinear', C=best_C_l2).fit(X_train, y_train)

# Mean accuracy on the held-out rows
l1_accuracy, l2_accuracy = final_l1_model.score(X_test, y_test), final_l2_model.score(X_test, y_test)

print("Accuracy with L1 regularization (best C={}): {:.4f}".format(best_C_l1, l1_accuracy))
print("Accuracy with L2 regularization (best C={}): {:.4f}".format(best_C_l2, l2_accuracy))

Accuracy with L1 regularization (best C=100): 0.8071
Accuracy with L2 regularization (best C=100): 0.8286




In [15]:
from sklearn.metrics import f1_score

# Held-out predictions from both tuned models
l1_pred, l2_pred = final_l1_model.predict(X_test), final_l2_model.predict(X_test)

# Mean accuracy on the held-out rows
l1_accuracy, l2_accuracy = final_l1_model.score(X_test, y_test), final_l2_model.score(X_test, y_test)

# Weighted F1: per-class F1 averaged with class-support weights
l1_f1_score = f1_score(y_test, l1_pred, average='weighted')
l2_f1_score = f1_score(y_test, l2_pred, average='weighted')

# Report both metrics for each model
print("Accuracy with L1 regularization (best C={}): {:.4f}".format(best_C_l1, l1_accuracy))
print("F1 Score with L1 regularization (best C={}): {:.4f}".format(best_C_l1, l1_f1_score))
print("Accuracy with L2 regularization (best C={}): {:.4f}".format(best_C_l2, l2_accuracy))
print("F1 Score with L2 regularization (best C={}): {:.4f}".format(best_C_l2, l2_f1_score))

Accuracy with L1 regularization (best C=100): 0.8071
F1 Score with L1 regularization (best C=100): 0.7995
Accuracy with L2 regularization (best C=100): 0.8286
F1 Score with L2 regularization (best C=100): 0.8199


The L2 model has higher accuracy and a higher F1 score than the L1 model on the test set.

# Model Interpretation

In [16]:
# Fitted parameters of the tuned L2 model: coefficient matrix and intercept vector
coefficients, intercept = final_l2_model.coef_, final_l2_model.intercept_

In [17]:
# For level High
# The rows of coef_ follow final_l2_model.classes_, which scikit-learn stores in sorted
# label order ('High', 'Low', 'Mid'). Row 0 is therefore the "High" level, not "Low".
# Look the row up by label rather than relying on a hard-coded position.
class_label = 'High'
class_row = list(final_l2_model.classes_).index(class_label)

feature_names = X.columns
feature_importance = pd.DataFrame(coefficients[class_row], index=feature_names, columns=['Coefficient'])
# Largest positive coefficients first
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False)
print(feature_importance)

                                            Coefficient
total_gift                                     7.507262
County_Kane County                             1.807526
Constituency Code_Foundation                   1.766593
Constituency Code_Business/Corporation         1.713598
Constituency Code_Trusts/Estates               1.306078
Constituency Code_Parent/Family/Guardian       0.938500
Avg_Events_Per_Year                            0.902998
Num_Different_Zips                             0.808000
County_Dupage County                           0.585440
Total Gift Times                               0.499998
County_Lake County                             0.450126
Constituency Code_Government                   0.278171
County_Cook County                             0.147801
total_event_10                                 0.017777
Constituency Code_Volunteer                   -0.130716
types_of_event                                -0.207965
Constituency Code_Board of Directors          -0

In [18]:
# For level Low
# The rows of coef_ follow final_l2_model.classes_, which scikit-learn stores in sorted
# label order ('High', 'Low', 'Mid'). Row 1 is therefore the "Low" level, not "High".
# Look the row up by label rather than relying on a hard-coded position.
class_label = 'Low'
class_row = list(final_l2_model.classes_).index(class_label)

feature_names = X.columns
feature_importance = pd.DataFrame(coefficients[class_row], index=feature_names, columns=['Coefficient'])
# Largest positive coefficients first
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False)
print(feature_importance)

                                            Coefficient
total_event                                    8.191689
County_Other Illinois County                   2.589969
Constituency Code_Individual                   0.655977
Constituency Code_Trusts/Estates               0.497848
Constituency Code_Business/Corporation         0.312901
types_of_event                                 0.144725
total_event_10                                 0.089705
Num_Different_Zips                             0.067407
Constituency Code_Volunteer                    0.037059
Constituency Code_Vendor                      -0.010972
County_Other US County                        -0.179456
Constituency Code_Parent/Family/Guardian      -0.214366
Constituency Code_Foundation                  -0.654592
Gift Amount_y_avg_10_years                    -0.985748
Constituency Code_Government                  -1.063075
Avg_Events_Per_Year                           -1.270957
Constituency Code_Staff                       -1

In [19]:
# For level Mid
# The rows of coef_ follow final_l2_model.classes_ (sorted label order), so look the
# row up by label instead of hard-coding its position.
class_label = 'Mid'
class_row = list(final_l2_model.classes_).index(class_label)

feature_names = X.columns
feature_importance = pd.DataFrame(coefficients[class_row], index=feature_names, columns=['Coefficient'])
# Largest positive coefficients first
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False)
print(feature_importance)

                                            Coefficient
Constituency Code_Organization/Institution     3.735403
Constituency Code_Vendor                       2.933211
Constituency Code_Staff                        2.344311
total_event                                    2.137617
County_Mchenry County                          1.595426
types_of_event                                 1.135837
County_Lake County                             0.952134
County_Dupage County                           0.875212
County_Cook County                             0.863344
location_Not Chicago                           0.522749
Gift Amount_y_avg_10_years                     0.358327
County_Kane County                             0.288551
location_Chicago                               0.094588
Num_Different_Zips                            -0.090445
Avg_Events_Per_Year                           -0.221512
total_event_10                                -0.236525
County_Other US County                        -0

Note that with the `liblinear` solver, scikit-learn fits a multiclass target as a set of one-vs-rest binary classification problems, one per level. For example, for the "Low" level, the corresponding coefficients give the formula for the log-odds of "category" == "Low" versus not "Low". The same applies to "Mid" and "High". The rows of `coef_` follow the order of `final_l2_model.classes_`.

In [20]:
import pickle

# Save the model to disk
# NOTE(review): only the classifier is persisted; the fitted scaler and the one-hot
# column layout are not saved here — confirm they are stored elsewhere if the model is
# to be applied to new raw data.
filename = 'final_l2_model.sav'
# The with-block closes the file handle even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(final_l2_model, model_file)
