In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

In [26]:
# Load the processed dataset (the path is relative to the notebook's working directory).
df = pd.read_csv("../data/processed/data.csv")

# Positional split: every column except the last forms the feature frame X,
# and the last column is taken as the target y.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [27]:
# Columns are addressed by their position within X. The labels used below
# ('job', 'marital', ...) are this notebook's names for those positions.
# NOTE(review): confirm the positions match the column order of data.csv.
column_transformer = ColumnTransformer(
    transformers=[
        # One independent OneHotEncoder per labelled column.
        *[
            (label, OneHotEncoder(), [position])
            for label, position in (
                ('job', 1),
                ('marital', 2),
                ('education', 3),
                ('contact', 5),
            )
        ],
        # One StandardScaler applied to the four listed columns.
        ('numerical', StandardScaler(), [0, 4, 6, 11]),
    ],
    # Columns not listed above are forwarded unchanged.
    remainder='passthrough',
)

In [28]:
# Fit the transformer on the full feature frame and encode it in one step.
# NOTE(review): this fit runs before any train/test split, so the fitted
# encoder categories and scaler statistics are derived from all rows,
# including the ones later held out for evaluation.
X_preprocessed = column_transformer.fit_transform(X)

# Show the encoded matrix followed by its container type, one per line.
print(X_preprocessed, type(X_preprocessed), sep="\n")

[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 1. 1.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
<class 'numpy.ndarray'>


In [29]:
# Hold out 20% of the preprocessed rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=804,
)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=50000).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions: overall accuracy, confusion matrix, per-class report.
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

# Report the results in the same order they were computed.
print(f"Accuracy: {accuracy:.2f}")
print(cm)
print(cr)

Accuracy: 0.94
[[7317   93]
 [ 419  171]]
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      7410
           1       0.65      0.29      0.40       590

    accuracy                           0.94      8000
   macro avg       0.80      0.64      0.68      8000
weighted avg       0.92      0.94      0.92      8000



In [30]:
# Hold out 20% of the preprocessed rows for evaluation; the seed fixes the split.
# NOTE(review): this split uses seed 142 while the logistic-regression cell uses
# 804, so the two models are scored on different held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=142)

# Pin the tree's own random_state as well: without it the fitted tree (and
# therefore every metric printed below) can differ from one run to the next.
model = DecisionTreeClassifier(random_state=142)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# F1 score with binary averaging, printed at full precision
f1_1 = f1_score(y_test, y_pred, average='binary')
print(f1_1)

# Confusion matrix and per-class precision/recall/F1 report
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)

print(cr)

Accuracy: 0.92
0.4933993399339934
[[7087  316]
 [ 298  299]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      7403
           1       0.49      0.50      0.49       597

    accuracy                           0.92      8000
   macro avg       0.72      0.73      0.73      8000
weighted avg       0.92      0.92      0.92      8000

