In [1]:
import pandas as pd
import numpy as np

In [2]:
# Ten-row training sample, one transaction per customer (ids 11..20).
# Some amounts (NaN) and some transaction dates (None) are missing.
transact_train_sample = pd.DataFrame(
    dict(
        customer_id=list(range(11, 21)),
        amount=[1, 3, 12, 6, 0.5, 0.2, np.nan, 5, np.nan, 3],
        transaction_date=[
            "2022-01-01", "2022-08-01", None, "2022-12-01", "2022-02-01",
            None, "2022-02-01", "2022-01-01", "2022-11-01", "2022-01-01",
        ],
        # Binary label the model is trained to predict.
        outcome=[False, True, True, True, False, False, True, True, True, False],
    )
)

In [3]:
from data_modeler import DataModeler

In [4]:
# Build the modeler around the training sample defined above.
transactions_modeler = DataModeler(transact_train_sample)

In [5]:
# Derive the model features. The displayed frame holds `amount` plus
# `transaction_month` / `transaction_day` taken from `transaction_date`;
# rows with a missing amount or date still show NaN at this stage.
transactions_modeler.prepare_data()

Unnamed: 0,amount,transaction_month,transaction_day
0,1.0,1.0,1.0
1,3.0,8.0,1.0
2,12.0,,
3,6.0,12.0,1.0
4,0.5,2.0,1.0
5,0.2,,
6,,2.0,1.0
7,5.0,1.0,1.0
8,,11.0,1.0
9,3.0,1.0,1.0


In [6]:
# Inspect column types after preparation: `transaction_date` is now
# datetime64[ns], and the derived month/day columns are float64
# (presumably because the missing dates leave NaN in them).
transactions_modeler.sample_df.dtypes

customer_id                   int64
amount                      float64
transaction_date     datetime64[ns]
outcome                        bool
transaction_month           float64
transaction_day             float64
dtype: object

In [7]:
# Fill the gaps in the feature frame. The filled values match each column's
# mean over the observed rows (amount 3.8375, transaction_month 4.75,
# transaction_day 1.0).
transactions_modeler.impute_missing()

Unnamed: 0,amount,transaction_month,transaction_day
0,1.0,1.0,1.0
1,3.0,8.0,1.0
2,12.0,4.75,1.0
3,6.0,12.0,1.0
4,0.5,2.0,1.0
5,0.2,4.75,1.0
6,3.8375,2.0,1.0
7,5.0,1.0,1.0
8,3.8375,11.0,1.0
9,3.0,1.0,1.0


In [8]:
# `sample_df` still shows NaN / NaT after imputation, so the imputed values
# displayed by impute_missing() are not written back into this frame.
transactions_modeler.sample_df

Unnamed: 0,customer_id,amount,transaction_date,outcome,transaction_month,transaction_day
0,11,1.0,2022-01-01,False,1.0,1.0
1,12,3.0,2022-08-01,True,8.0,1.0
2,13,12.0,NaT,True,,
3,14,6.0,2022-12-01,True,12.0,1.0
4,15,0.5,2022-02-01,False,2.0,1.0
5,16,0.2,NaT,False,,
6,17,,2022-02-01,True,2.0,1.0
7,18,5.0,2022-01-01,True,1.0,1.0
8,19,,2022-11-01,True,11.0,1.0
9,20,3.0,2022-01-01,False,1.0,1.0


In [9]:
# Train the model. The call produces no cell output; the fitted model is
# inspected through model_summary().
transactions_modeler.fit()

In [10]:
# Report of the fitted model: logistic regression, 3 features, 10 samples,
# coefficients, intercept and a classification report over 10 rows.
# NOTE(review): the summary is a plain string, so the bare expression shows
# its repr with literal "\n"; wrapping the call in print(...) would render
# the report as a readable table.
transactions_modeler.model_summary()

'Model: Logistic Regression\nNumber of features: 3\nNumber of samples: 10\nModel coefficients:\n[[ 1.12982108e+00  4.51313702e-01 -5.28120981e-05]]\nModel intercept: [-4.4623267]\nClassification Report:\n              precision    recall  f1-score   support\n\n       False       1.00      1.00      1.00         4\n        True       1.00      1.00      1.00         6\n\n    accuracy                           1.00        10\n   macro avg       1.00      1.00      1.00        10\nweighted avg       1.00      1.00      1.00        10\n'

In [11]:
# predict() without arguments yields one prediction per training row
# (presumably it scores the training sample held by the modeler).
in_sample_predictions = transactions_modeler.predict()

In [12]:
# Boolean Series named `predicted_outcome`, one entry per training row.
in_sample_predictions

0    False
1     True
2     True
3     True
4    False
5    False
6     True
7     True
8     True
9    False
Name: predicted_outcome, dtype: bool

In [13]:
# In-sample accuracy: fraction of predictions that match the training labels.
# The labels are read from the training frame instead of a second hard-coded
# copy, and the fraction is the mean of the boolean match vector, so the
# result stays a correct percentage for any number of rows (dividing the match
# count by .1 was only right for exactly 10 rows).
# `.to_numpy()` makes the comparison positional rather than index-aligned.
in_sample_accuracy = (
    in_sample_predictions == transact_train_sample["outcome"].to_numpy()
).mean()
f'Accuracy = {in_sample_accuracy:.1%}'

'Accuracy = 100.0%'

In [14]:
# Persist the fitted modeler under the name "transact_modeler" (no file extension).
# NOTE(review): a later cell saves to 'transact_modeler.pkl' instead; confirm
# in data_modeler how save() treats a name without an extension.
transactions_modeler.save("transact_modeler")

In [15]:
# Restore a modeler from the saved artifact; load() is called on the class,
# not on an existing instance.
# NOTE(review): if load() unpickles the file, only load artifacts you trust.
loaded_modeler = DataModeler.load("transact_modeler")


In [16]:
# Sanity check of the round trip: the reloaded modeler reports the same
# coefficients, intercept and classification report as before saving.
loaded_modeler.model_summary()

'Model: Logistic Regression\nNumber of features: 3\nNumber of samples: 10\nModel coefficients:\n[[ 1.12982108e+00  4.51313702e-01 -5.28120981e-05]]\nModel intercept: [-4.4623267]\nClassification Report:\n              precision    recall  f1-score   support\n\n       False       1.00      1.00      1.00         4\n        True       1.00      1.00      1.00         6\n\n    accuracy                           1.00        10\n   macro avg       1.00      1.00      1.00        10\nweighted avg       1.00      1.00      1.00        10\n'

In [17]:
# Score unseen transactions with the reloaded modeler. The frame carries the
# same input columns as the training sample but has no `outcome` column.
# NOTE(review): presumably predict() applies the same date-derived feature
# preparation to this frame -- confirm in data_modeler.
new_data = pd.DataFrame({
    "customer_id": [16, 17],
    "amount": [2, 8],
    "transaction_date": ['2022-03-01', '2022-07-01'],
})
new_predictions = loaded_modeler.predict(new_data)

In [18]:
# Predictions for the two new rows: boolean Series named `predicted_outcome`.
new_predictions

0    False
1     True
Name: predicted_outcome, dtype: bool

In [19]:
# Save the reloaded modeler again, this time with an explicit .pkl file name.
loaded_modeler.save('transact_modeler.pkl')


In [20]:
# Reload from the .pkl file into the same variable, replacing the previously
# loaded instance.
loaded_modeler = DataModeler.load('transact_modeler.pkl')


In [21]:
# Summary after the second save/load round trip; the output is identical to
# the earlier summaries.
loaded_modeler.model_summary()

'Model: Logistic Regression\nNumber of features: 3\nNumber of samples: 10\nModel coefficients:\n[[ 1.12982108e+00  4.51313702e-01 -5.28120981e-05]]\nModel intercept: [-4.4623267]\nClassification Report:\n              precision    recall  f1-score   support\n\n       False       1.00      1.00      1.00         4\n        True       1.00      1.00      1.00         6\n\n    accuracy                           1.00        10\n   macro avg       1.00      1.00      1.00        10\nweighted avg       1.00      1.00      1.00        10\n'