In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.pipeline import Pipeline

In [24]:
df = pd.read_csv("../data/processed/data.csv")
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [25]:
numerical_features = ['age', 'balance', 'duration', 'DaysSinceLastContact']
categorical_features = ['job', 'marital', 'education', 'contact']
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
transformed_data = pipeline.fit_transform(df)

print(transformed_data)

[[ 1.81041533  0.29917431  0.02381101 ...  0.          0.
   1.        ]
 [ 0.35838246 -0.42885395 -0.40030458 ...  0.          0.
   1.        ]
 [-0.78250052 -0.43815233 -0.6894743  ...  0.          0.
   1.        ]
 ...
 [ 1.3955488  -0.36996425 -0.32704825 ...  0.          0.
   0.        ]
 [-0.67878388 -0.0782708   0.3361143  ...  0.          0.
   0.        ]
 [-0.26391735  0.05776097 -0.54681724 ...  0.          0.
   0.        ]]


In [40]:
X_train, X_test, y_train, y_test = train_test_split(transformed_data, y, test_size=0.2)
model = LogisticRegression(max_iter=50000)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

f1_1 = f1_score(y_test, y_pred, average='binary')
print(f1_1)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)

print(cr)

0.3379224030037547
Accuracy: 0.93
[[7336   92]
 [ 437  135]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      7428
           1       0.59      0.24      0.34       572

    accuracy                           0.93      8000
   macro avg       0.77      0.61      0.65      8000
weighted avg       0.92      0.93      0.92      8000



In [38]:
X_train, X_test, y_train, y_test = train_test_split(transformed_data, y, test_size=0.2, random_state = 32)
model = DecisionTreeClassifier()
# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

f1_1 = f1_score(y_test, y_pred, average='binary')
print(f1_1)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)

print(cr)

Accuracy: 0.91
0.4262023217247098
[[7051  369]
 [ 323  257]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      7420
           1       0.41      0.44      0.43       580

    accuracy                           0.91      8000
   macro avg       0.68      0.70      0.69      8000
weighted avg       0.92      0.91      0.92      8000

