# Tenure Data - Predictors Selection

## import packages and functions

In [1]:
import os
print(os.getcwd())

/Users/yukachen/marketing-operation/tenure


In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression, RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, KFold 
import janitor


In [3]:
tenure_data = pd.read_csv('all_data_tenure.csv')
tenure_data = tenure_data.clean_names()


## 1. load your data

In [4]:
# 1) isolate X & y, drop rows where y is null
X = tenure_data.drop(columns=['tenure_in_complete_months'])
y = tenure_data['tenure_in_complete_months']
mask = y.notna()
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

In [5]:
# 2) drop  any feature with zero variance(all missing)
X = X.dropna(axis=1, how='all')


In [6]:
# 3)identify numneric vs categorical features
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()


## 2. test/train split



In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# 5) cast categoricals to string to avoid mixed-type errors
X_train[cat_cols] = X_train[cat_cols].astype(str)
X_test[cat_cols]  = X_test[cat_cols].astype(str)

## 3. build preprocessing

In [9]:
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale',  StandardScaler())
])

cat_pipeline = Pipeline([
    ('encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])


## 4. feature‐selection + recursive elimination

In [10]:
#   a) univariate filter → top 50
fs_univariate = SelectKBest(score_func=f_regression, k=50)

In [11]:
#   b) recursive feature elimination with CV
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rfe_cv = RFECV(
    estimator=rf,
    step=5,
    cv=KFold(5, shuffle=True, random_state=42),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

## 5. full pipeline


In [12]:
pipeline = Pipeline([
    ('prep',   preprocessor),
    ('filter', fs_univariate),
    ('rfe',    rfe_cv)
])

## 6. fit and selet


In [13]:
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('prep', ...), ('filter', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function f_r...t 0x14d07c940>
,k,50

0,1,2
,estimator,RandomForestR...ndom_state=42)
,step,5
,min_features_to_select,1
,cv,KFold(n_split... shuffle=True)
,scoring,'neg_mean_squared_error'
,verbose,0
,n_jobs,-1
,importance_getter,'auto'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


 ## 7. map back to feature names

In [14]:
# 1) grab all names out of the preprocessor
prep = pipeline.named_steps['prep']
all_feat_names = prep.get_feature_names_out()

# 2) grab the boolean mask from your SelectKBest filter
fs = pipeline.named_steps['filter']
filter_mask = fs.get_support()  # length = len(all_feat_names)

# 3) reduce the full list to the top‐50 that made it through SelectKBest
feat_names_after_filter = all_feat_names[filter_mask]

# 4) grab the boolean mask from your RFECV
rfe = pipeline.named_steps['rfe']
rfe_mask = rfe.support_          # length = 50

# 5) apply it to the filtered names
selected_features = feat_names_after_filter[rfe_mask]

print("📈 Top predictors of tenure:")
for feat in selected_features:
    print(" •", feat)

📈 Top predictors of tenure:
 • num__tenure_in_complete_days


## view the features

In [15]:
for feat in feat_names_after_filter:
    print(feat)


num__tenure_in_complete_years
num__tenure_in_complete_days
cat__cuisine_type_unknown
cat__service_type_c_nan
cat__lead_first_touch_date_c_nan
cat__date_time_proposal_negotiation_c_nan
cat__recordtypeid_012Uo00000147X8IAI
cat__closed_won_reason_c_nan
cat__wechat_c_nan
cat__stagename_Onboarded
cat__promotions_c_nan
cat__voice_platform_plan_c_Core
cat__voice_platform_plan_c_nan
cat__td_caplan_status_c_nan
cat__monthly_revenue_c_nan
cat__date_time_closed_won_c_nan
cat__iswon_True
cat__td_client_tech_status_c_nan
cat__live_date_c_nan
cat__ownership_type_c_Rent
cat__ownership_type_c_nan
cat__business_phone_c_nan
cat__target_go_live_date_c_nan
cat__interest_level_c_High
cat__interest_level_c_nan
cat__is_oppty_won_c_True
cat__date_time_onboarded_c_nan
cat__pos_first_order_date_c_2016-01-01
cat__lastclosedatechangedhistoryid_nan
cat__lead_type_c_Marketing Inbound
cat__lead_type_c_nan
cat__laststagechangedate_nan
cat__lead_category_c_nan
cat__lastactivitydate_nan
cat__client_type_c_SMB
cat__clie