# Tenure Data - Predictors Selection

## import packages and functions

In [11]:
import os
print(os.getcwd())
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression, RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, KFold 
import janitor

/Users/yukachen/marketing-operation/tenure


## Summary of NA
Get the summary of how many NAs each features have, and dropped the features with NA % that is more than 70%

In [12]:
from IPython.display import display

# load and clean your data (if you can't use janitor, just lower-case/snake-case manually)
tenure_data = pd.read_csv('tenure_data_lead_zip.csv')

tenure_data = tenure_data.clean_names()

# compute missing-value summary
na_summary = (
    tenure_data.isna().sum()
    .reset_index(name='na_count')
    .rename(columns={'index': 'column'})
)
na_summary['na_pct'] = na_summary['na_count'] / len(tenure_data)

# display inline
display(na_summary)


Unnamed: 0,column,na_count,na_pct
0,unique_code,0,0.000000
1,tenure_in_complete_months,52,0.051485
2,zip_code,0,0.000000
3,state_code,0,0.000000
4,region,0,0.000000
...,...,...,...
337,density_band,0,0.000000
338,median_household_income,0,0.000000
339,median_income_band,0,0.000000
340,timezone,0,0.000000


In [13]:
# set your missing‐value threshold
threshold = 0.78

# identify columns to drop
cols_to_drop = (
    na_summary
    .loc[na_summary['na_pct'] > threshold, 'column']
    .tolist()
)

# drop them from the DataFrame
tenure_data_dropped_78 = tenure_data.drop(columns=cols_to_drop)

In [14]:
# define the patterns to exclude
exclude_patterns = [
    '_airbyte',
    'id',
    'unique_code',
    'html',
    'address',
    'totango',
    'ringdna',
    'url',
    'name',
    'isdeleted',
    'donotcall',
    'phone',
    'wechat',
    'description',
    'company',
    'report_filter_indicator_c',
    'clicksendsms_is_sms_sent_c',
    'restaurant_time_display_c',
    'pulley_reviewed_c',
    'reason',
    'completed_time_c',
    'mentioned_competitor_c',
    'salutation'
]

# build a single boolean mask over the columns
mask = ~tenure_data_dropped_78.columns.str.contains(
    '|'.join(exclude_patterns),
    case=False,
    regex=True
)

# apply it once to drop all those columns in one go
tenure_data_cleaned = tenure_data_dropped_78.loc[:, mask]

In [15]:
census_data = pd.read_csv('extra_census_data.csv')

In [16]:
tenure_data_w_new_census = tenure_data_cleaned.merge(
    census_data,
    how='left',
    left_on='zip_code',
    right_on='zip_code'
)

In [17]:
# make sure they’re datetimes
td = tenure_data_w_new_census
td["latest_mql_timestamp_c"]  = pd.to_datetime(td["latest_mql_timestamp_c"])
td["first_mql_timestamp_c"]   = pd.to_datetime(td["first_mql_timestamp_c"])

# this yields a Timedelta column
td["mql_qualified_duration"] = (
    td["latest_mql_timestamp_c"] - td["first_mql_timestamp_c"]
)
td["mql_qualified_days"] = td["mql_qualified_duration"].dt.days

tenure_data = td


In [21]:
import pandas as pd
import matplotlib.pyplot as plt

df = tenure_data.copy()
numeric = df.select_dtypes(include="number")
# Compute correlations
corr_matrix = numeric.corr()

# ─── 2. Extract correlations against the target ─────────────────────────────
target = "tenure_in_complete_months"
# Drop the self-correlation
corr_with_target = corr_matrix[target].drop(target)

# ─── 3. Tabular view ───────────────────────────────────────────────────────
corr_table = corr_with_target.reset_index()
corr_table.columns = ["feature", "correlation_with_tenure"]
# Sort by absolute strength if you like:
corr_table["abs_corr"] = corr_table["correlation_with_tenure"].abs()
corr_table = corr_table.sort_values("abs_corr", ascending=False).drop(columns="abs_corr")
print(corr_table)

corr_table.to_csv("feature_correlations.csv", index=False)


                            feature  correlation_with_tenure
8   days_since_assigned_to_person_c                 0.307209
3        days_since_last_activity_c                 0.280514
24           mql_qualified_duration                -0.155711
9                  expected_gmv_2_c                -0.141465
5           duration_sdr_assigned_c                -0.129737
22                        pct_asian                -0.121968
17                      asian_alone                -0.120896
23                      pct_chinese                -0.097304
18                  chinese_persons                -0.096974
4           duration_sdr_complete_c                -0.060778
13                          density                 0.060468
12                       population                -0.059722
14                        total_pop                -0.052401
0                          zip_code                -0.051161
11                       zip_code_1                -0.051161
2            duration_sd

In [22]:
# all columns
all_cols = set(df.columns)
# columns used in corr (plus the target)
num_cols = set(numeric.columns)  
dropped = sorted(all_cols - num_cols - {target})
print(f"Numeric cols used for corr: {len(num_cols)}")
print(f"Dropped cols (non-numeric or constant): {len(dropped)}\n", dropped)

Numeric cols used for corr: 27
Dropped cols (non-numeric or constant): 67
 ['actual_lead_source_c', 'ae_assigned_time_c', 'assigned_sdr_c', 'assigned_sdr_role_c', 'automation_bypass_c', 'automation_timestamp_c', 'bypass_owner_assignment_c', 'city', 'city_1', 'client_language_c', 'converted_time_c', 'converteddate', 'country', 'countrycode', 'createddate', 'currently_using_competitor_c', 'currently_using_competitor_online_c', 'date_assigned_to_person_c', 'density_band', 'direct_manager_sdr_c', 'enterprise_c', 'first_mel_timestamp_c', 'first_mql_timestamp_c', 'first_touch_date_c', 'hasoptedoutofemail', 'interest_level_c', 'is_menu_uploaded_c', 'is_user_owner_c', 'isconverted', 'isunreadbyowner', 'last_activity_date_c', 'lastmodifieddate', 'latest_mql_timestamp_c', 'lead_assigned_time_c', 'lead_category_c', 'lead_created_day_c', 'lead_created_time_c', 'lead_source_c', 'lead_type_c', 'median_household_income', 'median_income_band', 'menu_approval_c', 'menu_complexity_c', 'menu_type_c', 'mo

In [None]:

# ─── 4. Heat-map view ──────────────────────────────────────────────────────
# We'll plot a 1×N image where each cell is the corr value
features = corr_with_target.index.tolist()
values   = corr_with_target.values.reshape(1, -1)

fig, ax = plt.subplots()
im = ax.imshow(values, aspect="auto")   # default colormap
# X axis: feature names
ax.set_xticks(range(len(features)))
ax.set_xticklabels(features, rotation=90, fontsize=8)
# Y axis: just a dummy label
ax.set_yticks([0])
ax.set_yticklabels([f"corr → {target}"])
ax.set_title("Feature Correlation with Tenure (months)")
plt.tight_layout()
plt.show()

## 1. load your data

In [23]:
# 1) isolate X & y, drop rows where y is null
X = tenure_data.drop(columns=['tenure_in_complete_months'])
y = tenure_data['tenure_in_complete_months']
mask = y.notna()
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

In [24]:
# 2) drop  any feature with zero variance(all missing)
X = X.dropna(axis=1, how='all')


In [28]:
# 3)identify numneric vs categorical features
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()


## 2. test/train split



In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [30]:
# 5) cast categoricals to string to avoid mixed-type errors
X_train[cat_cols] = X_train[cat_cols].astype(str)
X_test[cat_cols]  = X_test[cat_cols].astype(str)

## 3. build preprocessing

In [31]:
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale',  StandardScaler())
])

cat_pipeline = Pipeline([
    ('encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])


## 4. feature‐selection + recursive elimination

In [32]:
#   a) univariate filter → top 50
fs_univariate = SelectKBest(score_func=f_regression, k=50)

In [33]:
#   b) recursive feature elimination with CV
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rfe_cv = RFECV(
    estimator=rf,
    step=5,
    cv=KFold(5, shuffle=True, random_state=42),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

## 5. full pipeline


In [34]:
pipeline = Pipeline([
    ('prep',   preprocessor),
    ('filter', fs_univariate),
    ('rfe',    rfe_cv)
])

## 6. fit and selet


In [35]:
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('prep', ...), ('filter', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function f_r...t 0x124de1b40>
,k,50

0,1,2
,estimator,RandomForestR...ndom_state=42)
,step,5
,min_features_to_select,1
,cv,KFold(n_split... shuffle=True)
,scoring,'neg_mean_squared_error'
,verbose,0
,n_jobs,-1
,importance_getter,'auto'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


 ## 7. map back to feature names

In [36]:
# 1) grab all names out of the preprocessor
prep = pipeline.named_steps['prep']
all_feat_names = prep.get_feature_names_out()

# 2) grab the boolean mask from your SelectKBest filter
fs = pipeline.named_steps['filter']
filter_mask = fs.get_support()  # length = len(all_feat_names)

# 3) reduce the full list to the top‐50 that made it through SelectKBest
feat_names_after_filter = all_feat_names[filter_mask]

# 4) grab the boolean mask from your RFECV
rfe = pipeline.named_steps['rfe']
rfe_mask = rfe.support_          # length = 50

# 5) apply it to the filtered names
selected_features = feat_names_after_filter[rfe_mask]

print("📈 Top predictors of tenure:")
for feat in selected_features:
    print(" •", feat)

📈 Top predictors of tenure:
 • num__days_since_last_activity_c
 • num__days_since_assigned_to_person_c
 • cat__lead_created_time_c_2023-03-23 02:09:38.000 Z
 • cat__service_type_c_Full Time
 • cat__service_type_c_nan
 • cat__converteddate_2022-11-08
 • cat__first_mel_timestamp_c_nan
 • cat__promotions_c_Client Referral (Referrer $300);Printer Promotion;50%/25%/10% 3 Month FT Discount
 • cat__promotions_c_Printer Promotion;Apple Promotion
 • cat__menu_complexity_c_Basic Menu
 • cat__menu_complexity_c_nan
 • cat__automation_timestamp_c_nan
 • cat__createddate_2023-03-23 02:09:38.000 Z
 • cat__interest_level_c_High
 • cat__menu_approval_c_Approved
 • cat__menu_approval_c_Not Submitted
 • cat__actual_lead_source_c_nan
 • cat__direct_manager_sdr_c_0058b00000HCkC1AAL
 • cat__direct_manager_sdr_c_nan
 • cat__lead_category_c_nan
 • cat__assigned_sdr_c_0058b00000Fyza3AAB
 • cat__date_assigned_to_person_c_2023-03-16
 • cat__assigned_sdr_role_c_SDR team user
 • cat__city_1_Youngstown
 • cat__medi

## view the features

In [None]:
for feat in feat_names_after_filter:
    print(feat)
