In [14]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

In [37]:
df = pd.read_csv("updated_dataset_with_churn.csv")
# lets drop all rows with missing values, we are doing no cleaning here
df.fillna(0,inplace=True)
df

Unnamed: 0,id,score_date,current_arr,team_plus,pro_plus,ent_plus,csat_score,num_low_sev_outages,num_high_sev_outages,num_tickets_deflected,...,segment_non_smb,segment_commercial,segment_enterprise,segment_midmarket,future_arr,current_arr.1,max_seats,agent_utilization,seat_utilization,churn
0,JMAN_2429072,2023-02-10,2976.00,1,1,0,100,0,0,72,...,0,0,0,0,2976.00,2976.00,2.0,0.500000,1.0,0
1,JMAN_9043466,2023-01-13,20724.72,1,1,1,100,0,0,0,...,1,1,0,0,21474.84,20724.72,9.0,0.111111,1.0,0
2,JMAN_2452556,2023-02-24,605.76,1,1,1,0,15,1,0,...,1,0,0,0,817.80,605.76,1.0,0.000000,1.0,0
3,JMAN_2455113,2023-03-03,300.00,1,0,0,0,7,0,0,...,1,0,0,0,300.00,300.00,1.0,0.000000,1.0,0
4,JMAN_18127937,2023-02-10,2100.00,1,1,0,0,0,0,0,...,1,0,0,0,420.00,2100.00,5.0,0.000000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13412,JMAN_16222667,2023-01-06,504.00,1,0,0,0,0,0,0,...,1,0,0,0,504.00,504.00,3.0,0.000000,0.0,0
13413,JMAN_2475820,2023-02-10,221.64,1,0,0,0,10,1,0,...,1,0,0,0,231.48,221.64,1.0,0.000000,1.0,0
13414,JMAN_9014565,2023-03-17,228.00,1,0,0,0,10,0,0,...,1,0,0,0,228.00,228.00,1.0,0.000000,1.0,0
13415,JMAN_18362858,2023-03-24,228.00,1,0,0,0,0,0,0,...,1,0,0,0,228.00,228.00,1.0,0.000000,0.0,0


In [38]:
df.dtypes

id                        object
score_date                object
current_arr              float64
team_plus                  int64
pro_plus                   int64
ent_plus                   int64
csat_score                 int64
num_low_sev_outages        int64
num_high_sev_outages       int64
num_tickets_deflected      int64
customer_age_quarters      int64
product_counts             int64
crm_employee_range        object
region_emea                int64
region_apac                int64
region_amer                int64
region_latam               int64
segment_smb                int64
segment_non_smb            int64
segment_commercial         int64
segment_enterprise         int64
segment_midmarket          int64
future_arr               float64
current_arr.1            float64
max_seats                float64
agent_utilization        float64
seat_utilization         float64
churn                      int64
dtype: object

In [39]:
df.drop('id', axis=1,inplace=True)
df.drop('score_date', axis=1,inplace=True)
df.drop('crm_employee_range', axis=1,inplace=True)

In [41]:
df['churn'].value_counts() # damn imbalanced

churn
0    11246
1     2171
Name: count, dtype: int64

# Training a default LogReg model

In [43]:
X = df.drop('churn', axis=1)
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42, #used to keep class distribution in train and test sets similar
)

In [44]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

pred = lr.predict(X_test)
pred_train = lr.predict(X_train)

print(classification_report(y_test, pred))

print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      2265
           1       0.89      0.56      0.69       419

    accuracy                           0.92      2684
   macro avg       0.91      0.77      0.82      2684
weighted avg       0.92      0.92      0.91      2684

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      8981
           1       0.92      0.55      0.69      1752

    accuracy                           0.92     10733
   macro avg       0.92      0.77      0.82     10733
weighted avg       0.92      0.92      0.91     10733



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Precision -> TP / (TP + FP)
# Recall -> TP / (TP + FN)
# weighted_precision = (precision_0 * support_0 + precision_1 * support_1) / total_support

## Tuning

In [45]:
X_train.shape

(10733, 24)

In [46]:
lr = LogisticRegression(
    #penalty='l2',
    #solver='liblinear',
    class_weight='balanced'
)
lr.fit(X_train, y_train)

pred = lr.predict(X_test)
pred_train = lr.predict(X_train)

print(classification_report(y_test, pred))

print(classification_report(y_train, pred_train))

lr.coef_

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2265
           1       0.77      0.67      0.72       419

    accuracy                           0.92      2684
   macro avg       0.86      0.81      0.83      2684
weighted avg       0.91      0.92      0.91      2684

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      8981
           1       0.78      0.67      0.72      1752

    accuracy                           0.91     10733
   macro avg       0.86      0.82      0.83     10733
weighted avg       0.91      0.91      0.91     10733



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[ 0.00110098, -0.00448836, -0.00032546, -0.00190058, -0.00100197,
        -0.01314831, -0.00103946, -0.02388693, -0.03059686,  0.02945695,
        -0.00211969,  0.00798377, -0.00542858,  0.00015033, -0.01021502,
         0.01080086,  0.00579275,  0.00231697, -0.00114389, -0.00207869,
         0.00110098, -0.13403338, -0.00278069,  0.00262719]])

## Surely you cannot test every parameter manually right?

In [47]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'saga'],
    'penalty': ['l2'],
    'max_iter': [100, 200, 500],
    'class_weight': [None, 'balanced']
}

grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'C': 0.01, 'class_weight': None, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}




In [48]:
from scipy.stats import uniform
param_dist = {
    'C': uniform(0.01,10),
    'solver': ['saga'],
    'penalty': ['l2'],
    'max_iter': [100, 200, 500],
    'class_weight': [None, 'balanced']
}

stratified_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(estimator=LogisticRegression(), param_distributions=param_dist, 
                                   n_iter=2, cv=stratified_cv, verbose=1, random_state=42)
random_search.fit(X_train, y_train)
print(random_search.best_params_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits




{'C': np.float64(3.7554011884736247), 'class_weight': None, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}




In [49]:
random_search.best_params_

{'C': np.float64(3.7554011884736247),
 'class_weight': None,
 'max_iter': 500,
 'penalty': 'l2',
 'solver': 'saga'}

In [50]:
lr = LogisticRegression(**{'C': 3.7554011884736247,
 'class_weight': None,
 'max_iter': 500,
 'penalty': 'l2',
 'solver': 'liblinear'})
lr.fit(X_train, y_train)

pred = lr.predict(X_test)
pred_train = lr.predict(X_train)

print(classification_report(y_test, pred))

print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      2265
           1       0.89      0.56      0.69       419

    accuracy                           0.92      2684
   macro avg       0.91      0.77      0.82      2684
weighted avg       0.92      0.92      0.91      2684

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      8981
           1       0.92      0.55      0.69      1752

    accuracy                           0.92     10733
   macro avg       0.92      0.77      0.82     10733
weighted avg       0.92      0.92      0.91     10733



In [51]:
uniform(0.01,10).rvs(10)

array([4.50075736, 9.8863495 , 5.03565495, 4.79505241, 0.24318119,
       8.33147905, 1.85077239, 3.4931003 , 1.9345901 , 5.83476308])