In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.preprocessing import OneHotEncoder

cost_mail = 10

perc_accep_over50k = 0.1
avg_profit_over50k = 980

perc_accep_under50k = 0.05
avg_cost_under50k = -310

In [2]:
df = pd.read_excel('data/existing-customers.xlsx')

  warn("Workbook contains no default style, apply openpyxl's default")


As we can se there's multiple instances where there are missing values. This occurs only in columns that have categorical features. The way we andle that is by creating new column also for the missing values. 

In [3]:
print(df.isnull().sum())

RowID                0
age                  0
workclass         1836
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
class                0
dtype: int64


The data given is unbalanced

In [4]:
print(df['class'].value_counts())

<=50K    24720
>50K      7841
Name: class, dtype: int64


Preprocess

In [5]:
df.drop('education', inplace=True, axis=1)
df['capital'] = df['capital-gain'] - df['capital-loss']
df.drop('capital-gain', inplace=True, axis=1)
df.drop('capital-loss', inplace=True, axis=1)

label = preprocessing.LabelEncoder()
df['sex'] = label.fit_transform(df['sex'])

categorical_cols = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']

# Use OneHotEncoder to create a sparse matrix of one-hot encoded columns
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoded_cols = encoder.fit_transform(df[categorical_cols])

# Create a new DataFrame with the encoded columns and drop the original categorical columns
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(categorical_cols))
df.drop(categorical_cols, axis=1, inplace=True)

# Concatenate the encoded columns with the original DataFrame
df = pd.concat([df, encoded_df], axis=1)



In [6]:
print(df.shape)

(32561, 91)


In [7]:
df['class'].replace(['<=50K', '>50K'],[0, 1], inplace=True)

X = df.drop(['RowID', 'class'], axis=1)
y = df['class']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [9]:
# Create Decision Tree classifer object
dt = DecisionTreeClassifier()

# Train Decision Tree Classifer
dt = dt.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = dt.predict(X_val)

In [10]:
print("Accuracy:",metrics.accuracy_score(y_val, y_pred))
print("Precision:",metrics.precision_score(y_val, y_pred))
print("Recall:",metrics.recall_score(y_val, y_pred))
print("F1 score:",metrics.f1_score(y_val, y_pred))
print("ROC AUC:",metrics.roc_auc_score(y_val, y_pred))

n = len(y_pred)
n_1 = sum(y_pred)
precision = metrics.precision_score(y_val, y_pred)
exp_val = (n_1/n) * (precision * perc_accep_over50k * avg_profit_over50k - (1- precision) * perc_accep_under50k * avg_cost_under50k - cost_mail)
print('Expected value: ', exp_val)


Accuracy: 0.8094287469287469
Precision: 0.6210392902408112
Recall: 0.6038200862600123
F1 score: 0.612308653545767
ROC AUC: 0.7407523421686644
Expected value:  13.748310810810814


In [11]:
neigh = KNeighborsClassifier(n_neighbors=3)

neigh = neigh.fit(X_train,y_train)

y_pred = neigh.predict(X_val)


In [12]:
print("Accuracy:",metrics.accuracy_score(y_val, y_pred))
print("Precision:",metrics.precision_score(y_val, y_pred))
print("Recall:",metrics.recall_score(y_val, y_pred))
print("F1 score:",metrics.f1_score(y_val, y_pred))
print("ROC AUC:",metrics.roc_auc_score(y_val, y_pred))

n_1 = sum(y_pred)
precision = metrics.precision_score(y_val, y_pred)
exp_val = (n_1/n) * (precision * perc_accep_over50k * avg_profit_over50k - (1- precision) * perc_accep_under50k * avg_cost_under50k - cost_mail)
print('Expected value: ', exp_val)

Accuracy: 0.8335380835380836
Precision: 0.6733118971061093
Recall: 0.6451016635859519
F1 score: 0.6589049716803019
ROC AUC: 0.7705974670967191
Expected value:  14.577702702702702


In [13]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_val)

In [14]:
print("Accuracy:",metrics.accuracy_score(y_val, y_pred))
print("Precision:",metrics.precision_score(y_val, y_pred))
print("Recall:",metrics.recall_score(y_val, y_pred))
print("F1 score:",metrics.f1_score(y_val, y_pred))
print("ROC AUC:",metrics.roc_auc_score(y_val, y_pred))

n_1 = sum(y_pred)
precision = metrics.precision_score(y_val, y_pred)
exp_val = (n_1/n) * (precision * perc_accep_over50k * avg_profit_over50k - (1- precision) * perc_accep_under50k * avg_cost_under50k - cost_mail)
print('Expected value: ', exp_val)


Accuracy: 0.7716523341523341
Precision: 0.52660406885759
Recall: 0.829328404189772
F1 score: 0.6441732471883226
ROC AUC: 0.7909170145309671
Expected value:  19.211148648648653


In [15]:
rf = RandomForestClassifier()

rf = rf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = rf.predict(X_val)

In [16]:
print("Accuracy:",metrics.accuracy_score(y_val, y_pred))
print("Precision:",metrics.precision_score(y_val, y_pred))
print("Recall:",metrics.recall_score(y_val, y_pred))
print("F1 score:",metrics.f1_score(y_val, y_pred))
print("ROC AUC:",metrics.roc_auc_score(y_val, y_pred))

n_1 = sum(y_pred)
precision = metrics.precision_score(y_val, y_pred)
exp_val = (n_1/n) * (precision * perc_accep_over50k * avg_profit_over50k - (1- precision) * perc_accep_under50k * avg_cost_under50k - cost_mail)
print('Expected value: ', exp_val)

Accuracy: 0.835534398034398
Precision: 0.6965811965811965
Recall: 0.6025878003696857
F1 score: 0.6461843409316155
ROC AUC: 0.7577267085301078
Expected value:  13.576013513513512


Naive Bayes Classifier chosen

In [17]:
y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 score:",metrics.f1_score(y_test, y_pred))
print("ROC AUC:",metrics.roc_auc_score(y_test, y_pred))

n = len(y_pred)
n_1 = sum(y_pred)
precision = metrics.precision_score(y_test, y_pred)
exp_val = (n_1/n) * (precision * perc_accep_over50k * avg_profit_over50k - (1- precision) * perc_accep_under50k * avg_cost_under50k - cost_mail)
print('Expected value: ', exp_val)

Accuracy: 0.7709196990634116
Precision: 0.49899071457408156
Recall: 0.8312037659717552
F1 score: 0.6236125126135217
ROC AUC: 0.7921438646810627
Expected value:  17.748119146322743


### Potential coustomers prediction

In [18]:
df = pd.read_excel('data/potential-customers.xlsx')

  warn("Workbook contains no default style, apply openpyxl's default")


In [19]:
df.drop('education', inplace=True, axis=1)
df['capital'] = df['capital-gain'] - df['capital-loss']
df.drop('capital-gain', inplace=True, axis=1)
df.drop('capital-loss', inplace=True, axis=1)

df['sex'] = label.fit_transform(df['sex'])

In [20]:
encoded_cols = encoder.transform(df[categorical_cols])
# Create a new DataFrame with the encoded columns and drop the original categorical columns
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(categorical_cols))
df.drop(categorical_cols, axis=1, inplace=True)
# Concatenate the encoded columns with the original DataFrame
df = pd.concat([df, encoded_df], axis=1)

In [21]:
y_pred = gnb.predict(df.drop(['RowID'], axis=1))

over_50k = ['Row'+str(i) for i in range(len(y_pred)) if y_pred[i] == 1]


In [22]:
with open('potential-customers-above-50k.txt', 'w') as f:
    for item in over_50k:
        f.write(str(item) + '\n')

In [23]:
n = len(over_50k)
cost_mail = n * 10
profit = n * (precision * perc_accep_over50k * avg_profit_over50k - (1- precision) * perc_accep_under50k * avg_cost_under50k)

revenue = profit - cost_mail

print('Number of customers over 50k: ', n)
print('Profit: ', profit)
print('Mailing costs: ', cost_mail)
print('Expected Revenues: ', revenue)


Number of customers over 50k:  6118
Profit:  346687.07832054904
Mailing costs:  61180
Expected Revenues:  285507.07832054904
