# About
The task is to build a machine learning model that predicts whether a customer will discontinue traveling, i.e. churn.

# Import

In [1]:
from copy import deepcopy

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, accuracy_score, roc_curve, auc, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_classif


from scipy.stats import loguniform, uniform

# Load data

In [2]:
# Check the format
! head 'data.csv'

customer_id,gender,is_senior,is_married,visited_bistro,avg_monthly_trips,visited_lounge,residential_area,weekly_email,loyaly_membership,first_class_primarily,rebooking_option,customer_support_usage,primary_travel_option,avg_monthly_discount,control_group,churn
1,Female,No,Yes,No,2,No,suburb,No,basic,No,No,Medium,Unspecified,24.8,No,Yes
2,Male,No,No,No,82,Yes,urban,No,grey,Yes,Yes,High,Commute,25.25,Yes,No
3,Female,No,Yes,Yes,104,Yes,suburb,Yes,white,Yes,Yes,High,Commute,19.35,No,No
4,Female,No,No,No,2,Yes,suburb,Yes,basic,No,No,Medium,Recreational,76.35,No,Yes
5,Male,No,No,No,134,Yes,suburb,Yes,grey,No,No,Medium,Unspecified,50.55,No,No
6,Female,Yes,Yes,No,136,Yes,urban,No,grey,No,No,Low,Unspecified,89.6,Yes,Yes
7,Female,No,Yes,Yes,46,Yes,urban,No,grey,No,No,Medium,Recreational,77.15,No,No
8,Male,No,Yes,Yes,144,Yes,urban,Yes,black,No,No,Low,Recreational,72.1,No,No
9,Male,No,No,No,140,Yes,urban,No,black,No,No,Medium,Unspecified,104.0,No,Yes


In [3]:
df = pd.read_csv('data.csv')

# Check data

In [4]:
df.shape

(7043, 17)

In [5]:
df.sample(5)

Unnamed: 0,customer_id,gender,is_senior,is_married,visited_bistro,avg_monthly_trips,visited_lounge,residential_area,weekly_email,loyaly_membership,first_class_primarily,rebooking_option,customer_support_usage,primary_travel_option,avg_monthly_discount,control_group,churn
2990,2991,Male,No,Yes,No,48,Yes,suburb,Yes,white,Yes,Yes,High,Commute,20.1,Yes,No
6494,6495,Male,No,No,No,50,Yes,suburb,No,white,No,No,Low,Unspecified,79.0,Yes,No
1877,1878,Male,No,Yes,Yes,126,Yes,urban,Yes,white,Yes,Yes,High,Commute,25.25,No,No
3939,3940,Female,No,Yes,No,32,Yes,suburb,Yes,white,Yes,Yes,High,Commute,19.6,No,No
6826,6827,Male,No,No,No,42,Yes,suburb,Yes,white,No,No,Medium,Recreational,61.65,No,No


In [6]:
df.dtypes

customer_id                 int64
gender                     object
is_senior                  object
is_married                 object
visited_bistro             object
avg_monthly_trips          object
visited_lounge             object
residential_area           object
weekly_email               object
loyaly_membership          object
first_class_primarily      object
rebooking_option           object
customer_support_usage     object
primary_travel_option      object
avg_monthly_discount      float64
control_group              object
churn                      object
dtype: object

* Mostly categorical/binary variables
* The `avg_` variables are numerical, but one, `avg_monthly_trips`, seems to have the wrong type (string)

In [7]:
type(df['avg_monthly_trips'].iloc[0])

str

In [8]:
df['customer_id'].nunique() == len(df)

True

* The customer ids are unique, and should probably not be considered features as we can assume they are randomly assigned
* Also means we should not need to consider duplicates

In [9]:
df.isna().mean()

customer_id               0.000000
gender                    0.001846
is_senior                 0.001846
is_married                0.000000
visited_bistro            0.006247
avg_monthly_trips         0.000000
visited_lounge            0.000000
residential_area          0.000000
weekly_email              0.006247
loyaly_membership         0.000000
first_class_primarily     0.000000
rebooking_option          0.000000
customer_support_usage    0.000000
primary_travel_option     0.000000
avg_monthly_discount      0.000000
control_group             0.000000
churn                     0.000000
dtype: float64

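* Few null values (well under 1% in any column). The affected columns (`gender`, `is_senior`, `visited_bistro`, `weekly_email`) are all categorical, so it seems reasonable to impute them with a typical value, e.g. the most frequent category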

In [10]:
df['churn'].value_counts()

churn
No     5174
Yes    1869
Name: count, dtype: int64

In [11]:
# Target variable
df['churn'].value_counts(normalize=True)

churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

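* Moderately imbalanced dataset (~73% `No` vs ~27% `Yes`), though not severely so. Note that always predicting `No` would already give ~73% accuracy, which is the baseline any model has to beat.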

In [12]:
df['visited_bistro'].value_counts()

visited_bistro
No     4901
Yes    2098
Name: count, dtype: int64

In [13]:
# Summarise every column: the full value counts when there are fewer than 20
# distinct values, otherwise only the number of distinct values
for col_name, col_values in df.items():
    n_distinct = col_values.nunique()
    if n_distinct >= 20:
        print(f"{col_name}:", n_distinct, "unqiue values")
        continue
    print(f"{col_name}:", col_values.value_counts().to_dict())

customer_id: 7043 unqiue values
gender: {'Male': 3547, 'Female': 3483}
is_senior: {'No': 5891, 'Yes': 1139}
is_married: {'No': 3641, 'Yes': 3402}
visited_bistro: {'No': 4901, 'Yes': 2098}
avg_monthly_trips: 74 unqiue values
visited_lounge: {'Yes': 6361, 'No': 682}
residential_area: {'suburb': 4072, 'urban': 2971}
weekly_email: {'No': 4147, 'Yes': 2852}
loyaly_membership: {'basic': 2365, 'white': 1612, 'grey': 1544, 'black': 1522}
first_class_primarily: {'No': 5517, 'Yes': 1526}
rebooking_option: {'No': 5517, 'Yes': 1526}
customer_support_usage: {'Medium': 3088, 'Low': 2429, 'High': 1526}
primary_travel_option: {'Unspecified': 3095, 'Recreational': 2422, 'Commute': 1526}
avg_monthly_discount: 1586 unqiue values
control_group: {'No': 3535, 'Yes': 3508}
churn: {'No': 5174, 'Yes': 1869}


# Visualize

In [14]:
yname='churn'

In [15]:
# Plot-only copy of the data:
#  - churn encoded as 0/1 so that averaging it gives a churn rate
#  - avg_monthly_trips parsed to numbers (unparseable entries become NaN)
#  - the extreme avg_monthly_discount value (the column maximum) blanked out
discount = df['avg_monthly_discount']
tmp = df.assign(
    churn_=df['churn'].map({"Yes": 1, "No": 0}),
    avg_monthly_trips=pd.to_numeric(df['avg_monthly_trips'], errors='coerce'),
    avg_monthly_discount=discount.mask(discount == discount.max()),
)

In [16]:
# 4x4 grid of panels, one per feature (the first 16 columns of df),
# each showing the average churn per feature value
feature_cols = df.columns[:16]
overall_churn_rate = tmp['churn_'].mean()

fig = make_subplots(rows=4, cols=4, subplot_titles=feature_cols)
for i, col in enumerate(feature_cols):
    grid_row, grid_col = divmod(i, 4)
    panel = dict(row=grid_row + 1, col=grid_col + 1)
    # histfunc='avg' averages the 0/1 churn flag within each bin, i.e. the churn rate
    fig.add_trace(go.Histogram(x=tmp[col], y=tmp['churn_'], name=col, histfunc='avg'), **panel)
    # Dashed reference line at the overall average churn rate
    fig.add_hline(overall_churn_rate, line=dict(color='silver', width=2, dash='dash'), **panel)
    fig.update_xaxes(title_text=col, **panel)
    fig.update_yaxes(range=[0, 1], **panel)

fig.update_layout(height=1000, title='Churn rate by feature')
fig.show()

* This is a useful plot to have for reference after our modeling, for interpreting the model and also potentially for how to take action to reduce churn.

* There are several features that appear predictive for the churn since there is a trend over the feature (e.g. `is_senior`, `avg_monthly_trips`)

* There are also features that _don't_ appear very predictive (e.g. `gender`, `visited_lounge`).

* We can also verify that the churn over customer id looks pretty random, as expected, so we can drop that.

In [17]:
# Also have a look at the distribution of the numerical values.
# Note: avg_monthly_trips is still stored as strings at this point, so describe()
# only reports avg_monthly_discount.
df[['avg_monthly_trips','avg_monthly_discount']].describe()

Unnamed: 0,avg_monthly_discount
count,7043.0
mean,774.654693
std,26635.109607
min,18.25
25%,35.575
50%,70.35
75%,89.9
max,999999.0


* There are large outliers in `avg_monthly_discount` that we need to take care of

In [18]:
# In this case we can find and remove the large values manually: they all share the same number, equal to the maximum value.
# This can be seen from the diff of the sorted values: a single huge jump, followed only by zeros
df['avg_monthly_discount'].sort_values().diff().iloc[-10:]

6240         0.15
6215         0.25
5514         0.00
1319         0.05
1183         0.10
132     999880.25
185          0.00
1807         0.00
3825         0.00
2715         0.00
Name: avg_monthly_discount, dtype: float64

# Basic processing

In [19]:
def process(df, discount_outlier=None):
    """Clean the raw customer table and return a one-hot encoded frame.

    The input frame is left untouched: the first step (``drop``) returns a new
    frame and all later steps operate on that copy.

    Steps, in order:
      1. Drop ``customer_id`` (a unique identifier, not a feature).
      2. Parse ``avg_monthly_trips`` to numbers; unparseable entries become NaN.
      3. Replace the outlier value in ``avg_monthly_discount`` with NaN.
      4. Drop every row that contains a missing value.
      5. One-hot encode the remaining categorical columns, dropping the first
         level of each so that binary columns become a single indicator.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data with at least the columns ``customer_id``,
        ``avg_monthly_trips`` and ``avg_monthly_discount``.
    discount_outlier : float, optional
        Value of ``avg_monthly_discount`` to treat as missing. When ``None``
        (the default) the column maximum is used, which is only appropriate
        if the data is known to contain the outlier value; on clean data it
        would discard the rows holding the legitimate maximum. Pass the known
        sentinel explicitly to avoid that.

    Returns
    -------
    pd.DataFrame
        Processed frame; the target ends up as an indicator column as well
        (e.g. ``churn`` becomes ``churn_Yes``).
    """
    # Drop irrelevant
    df = df.drop(columns=['customer_id'])

    # Fix numerical types
    df['avg_monthly_trips'] = pd.to_numeric(df['avg_monthly_trips'], errors='coerce')

    # Set the outlier values to nan. Without an explicit value, fall back to
    # the column maximum (the original heuristic).
    if discount_outlier is None:
        discount_outlier = df['avg_monthly_discount'].max()
    df.loc[df['avg_monthly_discount'] == discount_outlier, 'avg_monthly_discount'] = np.nan

    # Drop null values for now (~2% of the data)
    # TODO could impute with a typical value
    df = df.dropna()

    # One-hot encode the categorical variables
    df = pd.get_dummies(df, drop_first=True)

    return df

In [20]:

dfp = process(df)

In [21]:
dfp

Unnamed: 0,avg_monthly_trips,avg_monthly_discount,gender_Male,is_senior_Yes,is_married_Yes,visited_bistro_Yes,visited_lounge_Yes,residential_area_urban,weekly_email_Yes,loyaly_membership_black,loyaly_membership_grey,loyaly_membership_white,first_class_primarily_Yes,rebooking_option_Yes,customer_support_usage_Low,customer_support_usage_Medium,primary_travel_option_Recreational,primary_travel_option_Unspecified,control_group_Yes,churn_Yes
0,2.0,24.80,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True
1,82.0,25.25,True,False,False,False,True,True,False,False,True,False,True,True,False,False,False,False,True,False
2,104.0,19.35,False,False,True,True,True,False,True,False,False,True,True,True,False,False,False,False,False,False
3,2.0,76.35,False,False,False,False,True,False,True,False,False,False,False,False,False,True,True,False,False,True
4,134.0,50.55,True,False,False,False,True,False,True,False,True,False,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7037,2.0,25.10,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,True,True,True
7038,2.0,95.00,True,False,True,False,True,False,False,False,False,False,False,False,False,True,False,True,True,True
7039,46.0,91.10,False,False,True,True,True,True,False,True,False,False,False,False,True,False,True,False,False,False
7041,24.0,99.45,True,True,False,False,True,True,False,False,False,False,False,False,False,True,True,False,True,True


In [None]:
# Target column, and the feature columns (every other column of the processed frame)
yname = 'churn_Yes'
assert yname in dfp

xnames = [name for name in dfp.columns if name != yname]
xnames

['avg_monthly_trips',
 'avg_monthly_discount',
 'gender_Male',
 'is_senior_Yes',
 'is_married_Yes',
 'visited_bistro_Yes',
 'visited_lounge_Yes',
 'residential_area_urban',
 'weekly_email_Yes',
 'loyaly_membership_black',
 'loyaly_membership_grey',
 'loyaly_membership_white',
 'first_class_primarily_Yes',
 'rebooking_option_Yes',
 'customer_support_usage_Low',
 'customer_support_usage_Medium',
 'primary_travel_option_Recreational',
 'primary_travel_option_Unspecified',
 'control_group_Yes']

# Rank features with f-statistic
This gives a simpler and more quantitative view of the information in the histograms above

In [25]:
# F-test of every feature against the churn label
fstats, pvals = f_classif(dfp[xnames], dfp[yname])
assert len(fstats) == len(pvals) == len(xnames)

# Turn the p-values into a score: -log10(p), rescaled so that the largest score is 1
neg_log_pvals = -np.log10(pvals)
scores = neg_log_pvals / neg_log_pvals.max()

In [27]:
# One row per feature, ordered from lowest to highest score
tmp = (
    pd.DataFrame({'fstat': fstats, 'pval': pvals, 'score': scores, 'feature': xnames})
    .sort_values('score')
)
px.line(tmp, x='fstat', y='feature', hover_data=['fstat', 'pval'], markers=True, width=800, height=600)

* 4 features have especially low significance (low fstat) `gender`, `control_group`, `visited_lounge`, `residential_area`
* `avg_monthly_trips` has the highest value, so appears to be the most important feature. We'll use it for a simple baseline model below.

# Train, test split
Keep a hold-out set for final evaluation.

(Note: we could have done this already at the beginning, before most exploratory analysis)

In [28]:
df_train, df_test = train_test_split(dfp,  test_size=0.2, random_state=0, shuffle=True)

In [29]:

dfp.shape, df_train.shape, df_test.shape

((6919, 20), (5535, 20), (1384, 20))

In [30]:
df_train

Unnamed: 0,avg_monthly_trips,avg_monthly_discount,gender_Male,is_senior_Yes,is_married_Yes,visited_bistro_Yes,visited_lounge_Yes,residential_area_urban,weekly_email_Yes,loyaly_membership_black,loyaly_membership_grey,loyaly_membership_white,first_class_primarily_Yes,rebooking_option_Yes,customer_support_usage_Low,customer_support_usage_Medium,primary_travel_option_Recreational,primary_travel_option_Unspecified,control_group_Yes,churn_Yes
5103,138.0,24.25,False,False,True,False,True,True,True,False,True,False,True,True,False,False,False,False,True,False
2864,130.0,108.80,False,True,True,False,True,True,False,False,True,False,False,False,True,False,True,False,True,False
1127,6.0,34.25,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,True,True,True
5496,24.0,49.85,False,False,False,False,False,False,True,False,False,True,False,False,False,True,True,False,False,False
3112,70.0,70.30,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5029,142.0,19.90,False,False,True,True,True,False,True,True,False,False,True,True,False,False,False,False,True,False
3334,62.0,73.90,False,True,False,False,True,True,False,False,False,False,False,False,False,True,False,True,True,True
1684,42.0,41.90,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,True
2661,34.0,80.85,True,False,True,True,True,False,True,False,False,False,False,False,False,True,False,True,False,True


# Model search with cross validation

Try a few basic modeling approaches and pick the best one using cross-validation. Here I opt for a logistic regression which is interpretable and simple, with all features and with a single feature (the latter as a kind of baseline). Also, for some non-linear approach I consider a K-nearest neighbors classifier.

In [46]:
cv_splitter = KFold(n_splits=10, shuffle=True, random_state=0)


## KNN search

In [32]:
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [34]:

# Rescale every feature to [0, 1] before the distance-based classifier
pipeline = Pipeline(steps=[
    ('preprocessor', MinMaxScaler()),
    ('classifier', KNeighborsClassifier()),
])

# Search space: odd neighbour counts from 1 to 49, both weighting schemes and
# Minkowski powers p=1 and p=2. That is 25 x 2 x 2 = 100 combinations, so
# n_iter=100 below should cover every one of them.
model_params_to_vary = dict(
    classifier__n_neighbors=np.arange(1, 50, 2),
    classifier__weights=['uniform', 'distance'],
    classifier__p=[1, 2],
)
search_knn = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=model_params_to_vary,
    n_iter=100,  # num param values to sample
    scoring='accuracy',
    cv=cv_splitter,
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

In [35]:
xnames

['avg_monthly_trips',
 'avg_monthly_discount',
 'gender_Male',
 'is_senior_Yes',
 'is_married_Yes',
 'visited_bistro_Yes',
 'visited_lounge_Yes',
 'residential_area_urban',
 'weekly_email_Yes',
 'loyaly_membership_black',
 'loyaly_membership_grey',
 'loyaly_membership_white',
 'first_class_primarily_Yes',
 'rebooking_option_Yes',
 'customer_support_usage_Low',
 'customer_support_usage_Medium',
 'primary_travel_option_Recreational',
 'primary_travel_option_Unspecified',
 'control_group_Yes']

In [36]:
search_knn.fit(df_train[xnames], df_train[yname])

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [37]:
# The best mean score, and stdev
pd.DataFrame(search_knn.cv_results_).query("rank_test_score == 1")[['mean_test_score', 'std_test_score']].iloc[0]

mean_test_score    0.774887
std_test_score     0.023188
Name: 38, dtype: float64

In [42]:
search_knn.best_params_

{'classifier__weights': 'uniform',
 'classifier__p': 2,
 'classifier__n_neighbors': np.int64(19)}

## Logistic regression search

In [47]:

# Rescale every feature to [0, 1], then fit an elastic-net logistic regression
# (the 'saga' solver is used because it supports the elastic-net penalty)
pipeline = Pipeline(steps=[
    ('preprocessor', MinMaxScaler()),
    ('classifier', LogisticRegression(random_state=0, penalty='elasticnet', solver='saga', max_iter=1000))
])

# Search space:
#  - C: inverse regularisation strength, sampled log-uniformly over a very wide range
#  - l1_ratio: mix between L2 (0) and L1 (1) penalty, 5 evenly spaced values
#  - class_weight: with or without re-weighting for the class imbalance
model_params_to_vary = {
    "classifier__C": loguniform(1e-3, 1e10),
    "classifier__l1_ratio": np.linspace(0,1, endpoint=True, num=5),
    "classifier__class_weight": ['balanced', None],
    }

# The search object itself is created in the next cell as `search_logreg`.
# An identical, never-fitted copy named `search` used to be built here as well;
# it was dead code and has been removed.

In [48]:
search_logreg = RandomizedSearchCV(
        pipeline,
        param_distributions=model_params_to_vary,
        scoring='accuracy',
        cv=cv_splitter,
        verbose=1,
        random_state=0,
        # num param values to sample
        n_iter=100,
        n_jobs=-1,
    )

## Logistic regression search, single feature

In [49]:
xnames_base = ['avg_monthly_trips']

In [50]:
xnames_base

['avg_monthly_trips']

In [51]:
search_logreg_base = deepcopy(search_logreg)

In [52]:
search_logreg_base.fit(df_train[xnames_base], df_train[yname])

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [53]:
pd.DataFrame(search_logreg_base.cv_results_).query("rank_test_score == 1")[['mean_test_score', 'std_test_score']].iloc[0]

mean_test_score    0.752841
std_test_score     0.017374
Name: 0, dtype: float64

## Logistic regression search, all features

In [54]:
xnames

['avg_monthly_trips',
 'avg_monthly_discount',
 'gender_Male',
 'is_senior_Yes',
 'is_married_Yes',
 'visited_bistro_Yes',
 'visited_lounge_Yes',
 'residential_area_urban',
 'weekly_email_Yes',
 'loyaly_membership_black',
 'loyaly_membership_grey',
 'loyaly_membership_white',
 'first_class_primarily_Yes',
 'rebooking_option_Yes',
 'customer_support_usage_Low',
 'customer_support_usage_Medium',
 'primary_travel_option_Recreational',
 'primary_travel_option_Unspecified',
 'control_group_Yes']

In [55]:
search_logreg.fit(df_train[xnames], df_train[yname])

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [56]:
best = pd.DataFrame(search_logreg.cv_results_).query("rank_test_score == 1")[['mean_test_score', 'std_test_score']].iloc[0]
best

mean_test_score    0.796026
std_test_score     0.016409
Name: 25, dtype: float64

In [58]:
search_logreg.best_params_

{'classifier__C': np.float64(1.9624149015576506),
 'classifier__class_weight': None,
 'classifier__l1_ratio': np.float64(1.0)}

* L1 penalty was selected as the best (`l1_ratio=1`)

# Summarize results and select model

In [66]:
# Collect the best cross-validated score (mean and std over folds) of each search
fitted_searches = {
    'logreg': search_logreg,
    'logreg_base': search_logreg_base,
    'knn': search_knn,
}
rows = []
for model_name, fitted in fitted_searches.items():
    cv_table = pd.DataFrame(fitted.cv_results_)
    is_top = cv_table['rank_test_score'] == 1
    best = cv_table.loc[is_top, ['mean_test_score', 'std_test_score']].iloc[0]
    rows.append({'model_name': model_name, **best.to_dict()})
search_results = pd.DataFrame(rows)

# Standard error of the mean score over the CV folds
search_results['std_error'] = search_results['std_test_score'] / np.sqrt(cv_splitter.n_splits)

In [67]:
px.scatter(pd.DataFrame(search_results),
           x='model_name', y='mean_test_score', error_y='std_error', title='Model results from cross-validation',   )

* Logistic regression with all features performs the best, so we will use this as the final model.

# Validate on test set

In [68]:
def get_confusion_matrix(ytrue, ypred, **kwargs):
    """Return the confusion matrix as a DataFrame with named axes.

    Rows are labelled "True" and columns "Predicted". Any extra keyword
    arguments are forwarded unchanged to ``confusion_matrix``.
    """
    counts = confusion_matrix(ytrue, ypred, **kwargs)
    return pd.DataFrame(counts).rename_axis(index="True", columns="Predicted")

def specificity(cm):
    """Fraction of the first row's total that sits in the first column.

    For a binary confusion matrix with true classes as rows and the negative
    class first, this is the true negative rate TN / (TN + FP).
    NOTE(review): assumes that layout -- confirm against the caller.
    """
    first_row = cm.iloc[0]
    return first_row.iloc[0] / first_row.sum()

In [69]:
clf_val = deepcopy(search_logreg.best_estimator_)

In [70]:
# Train on the full training set
clf_val.fit(df_train[xnames], df_train[yname])

In [71]:
yhat_test = clf_val.predict(df_test[xnames])
y_test = df_test[yname]
probs_test  = clf_val.predict_proba(df_test[xnames])[:,1]

In [72]:
# Confusion matrix and other metrics
cm = get_confusion_matrix(y_test, yhat_test)
display(cm)

# All lines are adjacent string literals forming ONE string. Separating them
# with commas would turn them into extra print() arguments, which print joins
# with a space and so indents the following line.
print("Metrics on test set:\n"
      f"recall={recall_score(y_test, yhat_test):.3f}\n"
      f"precision={precision_score(y_test, yhat_test):.3f}\n"
      f"accuracy={accuracy_score(y_test, yhat_test):.3f}\n"
      f"specificity={specificity(cm):.3f}\n"
     )

Predicted,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,925,100
1,186,173


Metrics on test set:
recall=0.482
precision=0.634
accuracy=0.793
 specificity=0.902



* True positive rate / recall is quite low (~50%), meaning we only catch about half of the people that churn
* The true negative rate / specificity is much higher (> 90%), contributing to the overall higher accuracy. This means that we correctly identify most of the people that stayed / did not churn.
* Depending on our business goals and how we are planning to use the model we might want to update our metrics and/or use threshold tuning. For instance, we could bias toward higher recall.

In [73]:
# Precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, probs_test)
# Pad the thresholds with a final 1.0 so that they can share one DataFrame
# with the precision and recall arrays
thresholds = [*thresholds, 1.0]
df_pr = pd.DataFrame({'precision': precision, 'recall': recall, 'threshold': thresholds})

# Plot the precision-recall curve with the default threshold point marked
fig = px.line(df_pr, x='recall', y='precision',  markers=False,  hover_data='threshold', title="Precision vs recall")#width=500, height=500,)

thresh_default = 0.5
# Index of the curve point whose threshold is closest to the default
idefault = np.argmin((np.array(thresholds) - thresh_default)**2)
fig.add_trace(go.Scatter(
    x=[df_pr.iloc[idefault].recall],
    y=[df_pr.iloc[idefault].precision],
    mode='markers',
    name=f'default threshold ({thresh_default})',
    marker_size=10,
    marker_color='orange'
    )
)

fig


# Fit final model 
Fit on all data

In [74]:
clf_final = deepcopy(search_logreg.best_estimator_)

In [75]:
# Refit on ALL processed data (dfp, i.e. the training and test rows together),
# not just the training set
clf_final.fit(dfp[xnames], dfp[yname])

In [76]:
# Coefficients of the fitted logistic regression (the last pipeline step),
# one per feature, ordered by absolute size.
# The pipeline min-max scales the features first, so these are coefficients
# for the scaled features.
coefs = clf_final.named_steps['classifier'].coef_
flat_coefs = coefs.ravel()
tmp = (
    pd.DataFrame({'abs_coef': np.abs(flat_coefs), 'coef': flat_coefs, 'feature': xnames})
    .sort_values('abs_coef')
)
px.scatter(tmp, y='feature', x='coef', width=800, height=600, hover_data=['coef', 'abs_coef'], title="Model coefficients")

* The L1 regularization can lead to sparse coefficients and here we see that some coefficients were indeed zeroed. 
* Interestingly, it's not exactly the same set of features that had the lowest significance based on the f-statistic. This could indicate that there are some correlations between the features, making the assignment of importance unstable.


In [78]:
# The 5 features with the lowest score, i.e. the highest p-value
# (sorted ascending by score, first 5 rows)
pd.DataFrame({'fstat': fstats, 'pval': pvals, 'score': scores, 'feature': xnames}).sort_values('score', ascending=True).iloc[:5]

Unnamed: 0,fstat,pval,score,feature
2,0.181421,0.6701685,0.000861,gender_Male
18,0.944742,0.3310953,0.002377,control_group_Yes
6,1.0528,0.3048991,0.002554,visited_lounge_Yes
7,10.653782,0.001103799,0.014641,residential_area_urban
16,30.321644,3.791668e-08,0.036743,primary_travel_option_Recreational


In [79]:
# 4 features with smallest model coefficients
tmp.iloc[:4]

Unnamed: 0,abs_coef,coef,feature
12,0.0,0.0,first_class_primarily_Yes
13,0.0,0.0,rebooking_option_Yes
4,0.012489,0.012489,is_married_Yes
2,0.014027,0.014027,gender_Male


# Summary, next steps

* A model with reasonable performance has been trained using logistic regression and all input variables
* Performance metrics on test set:
    - recall=0.482
    - precision=0.634
    - accuracy=0.793
    - specificity=0.902

* As a next step we should think closely about how we want to use the model; this might influence the metrics we use for model selection and also how the threshold is tuned.
* Modeling-wise, it may be worth better understanding where the model fails, e.g. through some error analysis. This might guide us in improving the model.