## Telecom Churn Prediction
- EDA, Data Cleaning, Visualisations
- Feature Engineering
- ML Model Training
- Hyperparameter Tuning

In [None]:
from utilties import *

In [None]:
# Widget helpers for the interactive plots, plus warning control.
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import warnings

# Apply seaborn's default theme to the figures drawn below.
sns.set()

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Remove the row/column display caps and let pandas pick the display width.
for display_option in ("display.max_columns", "display.max_rows", "display.width"):
    pd.set_option(display_option, None)

In [None]:
# Print the session's package information so the run can be reproduced later.
import session_info
session_info.show()

## Get Data

In [None]:
# Read the 'Churn Dataset' sheet of the task workbook, then show (rows, columns).
df = pd.read_excel(
    io='data/Data_Science_Task.xlsx',
    sheet_name='Churn Dataset',
)
df.shape

In [None]:
# Hold out 10% of the rows as a validation set; the remaining 90% (df_data)
# is what the exploration and modelling cells below work on.
X = df.drop(columns='churn')
y = df.loc[:, 'churn']
X_data, X_val, y_data, y_val = train_test_split(
    X, y, test_size=0.1, random_state=40, shuffle=True
)

# Re-attach the target with .assign(), which returns new frames. The previous
# version aliased X_val / X_data and wrote 'churn' straight into the split
# outputs (a chained-assignment pattern whose warning was being suppressed).
df_val = X_val.assign(churn=y_val)
df_data = X_data.assign(churn=y_data)

# Persist both partitions to disk.
df_val.to_pickle('data/df_validation.pkl')
df_data.to_pickle('data/df_data.pkl')

In [None]:
# Preview only the first rows: display.max_rows is set to None above, so a bare
# `df_val` would render the entire validation frame into the notebook output.
df_val.head()

## Separate Validation Data (10 percent of Original Data)

In [None]:
# Report (rows, columns) for the validation split first, then the working split.
for split_frame in (df_val, df_data):
    print(split_frame.shape)

In [None]:
# Column dtypes and non-null counts for the working (non-validation) split.
df_data.info()

## Get Summary of DF

In [None]:
# Get_Summary_DF is not defined in this notebook (presumably it comes from the
# star import at the top) — TODO confirm what the summary reports.
Get_Summary_DF(df_data)

In [None]:
# Columns treated as categorical (the target 'churn' is included so the
# categorical plots below can use it as well).
cat_feats = ['gender', 'SeniorCitizen', 'Product: International', 'Product: Voice mail','Phone Code',
             'PaperlessBilling','service calls','churn']

# Everything that is neither an identifier-like column nor categorical is numeric.
# The list is built in DataFrame column order instead of via list(set(...)):
# set iteration order for strings can change between Python sessions, which
# would make the model's feature order differ from run to run.
non_numeric_cols = set(['customerID','Telephone Number', 'US State'] + cat_feats)
num_feats = [col for col in df_data.columns if col not in non_numeric_cols]

## Numeric & Categorical Feats
- 'gender',  'SeniorCitizen', 'Product: International', 'Product: Voice mail','Phone Code','PaperlessBilling','service calls', 'churn'
- 'vmail', 'Total, EUR', 'night calls', 'Duration', 'night minutes', 'internatonal EUR', 'international minutes', 'eve minutes', 'Call day minutes', 'eve calls', 'eve EUR', 'night EUR', 'total day calls', 'international calls'

## Other Potential Feats (if possible to get)

- Age, Marital_Status, Dependents
- ContractInfo, CreditHistory, PreviousPackages,PaymentMethod,  BillAmount, InternetPackage, 
- CustomerSurvey, OnlineSecurity 
- Past history of each user (features over the last X months could be generated, e.g. num_calls, calls_time, internet_usage, complaints, etc.)

## Check Churn Feature

In [None]:
# Two stacked axes are created; the churn bar plot goes on the upper one and
# the lower one is removed from the figure again.
fig, (churn_ax, spare_ax) = plt.subplots(2, 1, figsize=(12, 10))
fig.subplots_adjust(wspace=0.2, hspace=0.6)
Plot_Bar_Mit_Num_Per(df_data, 'churn', churn_ax, 'Churn')
fig.delaxes(spare_ax)

## Numeric Feats Correlation

In [None]:
# Correlate on df_data, not df: df still contains the held-out validation rows,
# which should not influence which features get dropped.
sns.heatmap(df_data[num_feats].corr());

**Directly Dependent Feats** 
- 'Total, EUR'  -> 'Call day minutes'
- 'eve EUR' -> 'eve minutes'
- 'night EUR' -> 'night minutes'
- 'internatonal EUR' -> 'international minutes' 

Each EUR column is directly dependent on its minutes column, so let's drop the EUR columns:

In [None]:
# Drop the EUR columns listed above as directly dependent on a minutes column.
dependent_feats = {'Total, EUR', 'eve EUR', 'night EUR', 'internatonal EUR'}

# Rebuild the list in DataFrame column order rather than through set arithmetic,
# so the feature order is the same in every session (set order is not stable).
num_feats = [
    col for col in df_data.columns
    if col in num_feats and col not in dependent_feats
]

# Use df_data here: df still includes the held-out validation rows.
corr_matrix = df_data[num_feats].corr()
sns.heatmap(corr_matrix);

## Box Plot to See Numeric Feats Distribution wrt Churn Feats

In [None]:
# One box plot per numeric feature, split by churn, laid out on a 5x2 grid
# that is filled row by row.
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(10, 16))
for position, feat in enumerate(num_feats):
    grid_row, grid_col = divmod(position, 2)
    ax = axes[grid_row, grid_col]
    sns.boxplot(data=df_data, x='churn', y=feat, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel(feat)
fig.tight_layout();

- 'Call day minutes' and 'vmail' show an interesting trend

## Distribution trend for individual Numeric Feats

In [None]:
@interact( feature = list(num_feats))
def Inter_Plot_Num_Churn(feature):
    """Plot one numeric feature of df_data against 'churn'.

    `interact` offers the entries of `num_feats` as the choices for `feature`
    and calls this function with the selected one; the drawing itself is
    delegated to `Numeric_Distribution_Plot`.
    """
    Numeric_Distribution_Plot(df_data, feature,'churn' )

## Categorical Features wrt Churn Feat

In [None]:
# One count plot per categorical feature, coloured by churn, laid out on a
# 4x2 grid that is filled row by row.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12, 8))
for position, feat in enumerate(cat_feats):
    grid_row, grid_col = divmod(position, 2)
    ax = axes[grid_row, grid_col]
    sns.countplot(data=df_data, x=feat, hue='churn', ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel(feat)

In [None]:
# NOTE(review): unlike the other plotting helpers, no DataFrame is passed here —
# presumably the helper reads its data from elsewhere; confirm which frame it uses.
Per_Stacked_Bar_Plot_Cat_Feats(cat_feats, 'Churn Rate')

- The 'Product: International' and 'service calls' columns show interesting results

## Distribution trend for individual Categorical Feats

In [None]:
@interact( feature = cat_feats)
def Inter_Plot_Cat_Churn(feature):
    """Plot one categorical feature of df_data against 'churn'.

    `interact` offers the entries of `cat_feats` as the choices for `feature`
    and calls this function with the selected one; the drawing itself is
    delegated to `Plot_Cat_Col`.
    """
    Plot_Cat_Col(df_data, feature,'churn')

## Churn Analysis wrt States

In [None]:
# Churn / non-churn counts per state.
pd.crosstab(index=df_data['US State'], columns=df_data['churn'])

In [None]:
# Mean of the boolean 'churn' column per state, i.e. the share of churned
# customers, sorted from highest to lowest.
# The aggregation is named by string: passing np.mean to .agg() is deprecated
# in recent pandas, and 'mean' yields the same 'mean' output column.
churn_state = (
    df_data.groupby(['US State'])['churn']
    .agg(['mean'])
    .sort_values(by='mean', ascending=False)
)
churn_state.plot.bar(title = "Percentage Churn wrt States", figsize=(16,6))
plt.show()

## Facts Churn wrt States
- The churn rate in New Jersey and California is above 25%, and below 6% for Hawaii and Alaska.
- However, the small dataset makes it hard to draw firmer conclusions.

## Categorical Feat importance with Mutual_Information Score

In [None]:
def Compute_Mutual_Info(CAT_SERIES):
    """Return the mutual information score between CAT_SERIES and df_data.churn."""
    score = mutual_info_score(CAT_SERIES, df_data.churn)
    return score

# Score every categorical column except the target itself, highest first.
categorical_variables = df_data[cat_feats].drop(columns='churn')
mutual_info_per_feature = categorical_variables.apply(Compute_Mutual_Info)
feature_importance = mutual_info_per_feature.sort_values(ascending=False)
print(feature_importance)

## Prepare Data 
- Map Categorical Feats
- Scale numeric Feats

In [None]:
# Work on an explicit copy of the selected columns so the helpers below never
# operate on a slice that is still tied to df_data.
df_all_data = df_data[num_feats + cat_feats].copy()
df_all_data = Convert_Cat_Feats(df_all_data)
df_all_data = Scale_Num_Feats_Train(df_all_data, num_feats)  # save max, min values in config file for pred use cases

# Preview only the first rows; display.max_rows is None, so the bare frame
# would be rendered in full.
df_all_data.head()

## Try Following Models to see performance

- 'dummy_classifier'
- 'k_nearest_neighbors'
- 'logistic_regression'
- 'support_vector_machines'
- 'random_forest'
- 'gradient_boosting'

In [None]:
# Separate the prepared features from the target and print their names as a check.
target_col = 'churn'
X = df_all_data.drop(columns=target_col)
y = df_all_data[target_col]
print(X.columns)
print(y.name)

# 75% / 25% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=40, shuffle=True
)

In [None]:
# create_models is not defined in this notebook (presumably provided by the star
# import); the next cell iterates its result as (name, model) pairs.
models = create_models()

In [None]:
# Fit each candidate model with its default hyperparameters and compare
# accuracy on the held-out test split.
results = []
names = []
for name, model in models:
    # Fit once and predict once; the previous version chained a .predict()
    # onto .fit() and discarded its result before predicting again.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    results.append(accuracy)
    names.append(name)
    # The stray ')' at the end of the message has been removed.
    print('Classifier: {}, Accuracy: {}'.format(name, accuracy))

### The RF and Gradient Boosting models show interesting results

## Default RF Model

In [None]:
# Running the random forest with default parameters
rfc = RandomForestClassifier(class_weight = 'balanced',random_state=100,n_jobs = -1)
rfc.fit(X_train,y_train)
Get_ROC(rfc, X_test, y_test)

## Hyperparameter Tuning for RF Model 

## Tune individual Parameters of RF Model
- max_depth
- n_estimators
- min_samples_split
- min_samples_leaf
- max_features

In [None]:
# (parameter name, candidate values, number of CV folds) for each one-at-a-time search.
single_param_searches = [
    ('max_depth', range(2, 25, 1), 5),
    ('n_estimators', [10,25,50,75,100], 5),
    ('min_samples_split', range(10,200,10), 5),
    ('min_samples_leaf', range(10,200,10), 3),
    ('max_features', range(5,16,3), 3),
]

for par_name, candidates, n_fold in single_param_searches:
    last_tuning_result = Tune_Single_Parameter(
        X_train, y_train,
        PAR_NAME=par_name,
        PARAMETER={par_name: candidates},
        N_FOLD=n_fold,
        TARGET="precision",
    )

# Keep the final call's return value as the cell's last expression, as it was
# when the five calls were written out one after another.
last_tuning_result

## Find Best Hyper Parameters for RF Model

In [None]:
# Candidate values for the joint search over depth and split/leaf sizes.
param_grid = dict(
    max_depth=[10,13,15,17,20],
    min_samples_split=[5,10,15],
    min_samples_leaf=[5,8,11,14,17],
)

rf = RandomForestClassifier(random_state=100, class_weight='balanced')

# No `scoring` argument is passed (a scoring='precision' option was left
# disabled here), so the search ranks candidates by the estimator's default score.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

best_score = grid_search.best_score_
best_params = grid_search.best_params_
print(best_score)
print(best_params)
print('We can get accuracy of', best_score, 'using', best_params)

## Train RF Model With Best Parameters

In [None]:
# Refit a forest with every hyperparameter the grid search selected.
# Unpacking best_params_ also forwards min_samples_leaf, which was part of the
# search grid but was previously left out of the final model.
rfc = RandomForestClassifier(
    bootstrap=True,
    random_state=100,
    class_weight='balanced',
    **grid_search.best_params_,
)
rfc.fit(X_train,y_train)
Get_ROC(rfc, X_test, y_test)

## Effect of Hyperparameter Tuning
- Accuracy increased
- AUC increased
- Recall increased

## RF Model Feat Importance

In [None]:
# NOTE(review): 17 is presumably the number of features to report (it appears to
# equal the number of model inputs) — confirm against the helper's signature.
Get_RF_Model_Feat_Importance(rfc, X_train.columns, 17)

## Save Model as Pickle File

In [None]:
# Write the fitted model inside a `with` block so the file handle is closed
# (and the data flushed) even if pickling fails part-way.
with open('model/RF_V001.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

## Get Predictions for New Data Point

In [1]:
from get_pred import Main_Pred

In [2]:
Main_Pred('8022-BECSI')

df       customerID gender  SeniorCitizen US State  Duration  Phone Code  \
1994  8022-BECSI   Male              0       MD        84         510   

     Telephone Number Product: International Product: Voice mail  vmail  ...  \
1994         369-2899                     no                  no      0  ...   

      eve EUR  night minutes  night calls  night EUR  international minutes  \
1994     13.4           98.2           70       4.42                   10.6   

      international calls  internatonal EUR  service calls  PaperlessBilling  \
1994                    7              2.86              0                No   

      churn  
1994  False  

[1 rows x 25 columns]
            vmail  eve calls  night calls  eve minutes  international minutes  \
customerID                                                                      
8022-BECSI    0.0   0.552941     0.260563      0.43416                   0.53   

            Call day minutes  international calls  Duration  night minutes

Feature names must be in the same order as they were in fit.



Unnamed: 0,Prob,ID
0,7,8022-BECSI
