# Customer Churn Prediction 2020

Public score: 0.98222

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
!pip list | grep mlfoundry

# Load data into pandas dataframe

In [None]:
# Read the training set from train.csv (path relative to the working
# directory) and preview its first rows.
df = pd.read_csv('train.csv')
df.head()

In [None]:
# Summarise the raw frame: column names, dtypes and non-null counts.
df.info()

In [None]:
from getpass import getpass
# Prompt for the token without echoing it, so it is not stored in the notebook.
api_token = getpass("TrueFoundry API Token:")

In [None]:
import mlfoundry as mlf
# Client used by the later cells to create runs and log datasets, metrics,
# models and parameters.
mlf_api = mlf.get_client(api_key=api_token)

# Cleaning the data
* First, compute `total_net_minutes` by summing the day, evening and night minutes to reduce the number of features; we do the same for calls and charges.
* Second, convert the yes/no strings in `voice_mail_plan` and `international_plan` into integers (1/0). The `churn` label is encoded separately, after the train/validation split.
* The categorical columns `state` and `area_code` were meant to be converted into one-hot vectors; the current cleaning function simply drops them instead.
* Lastly, drop all repeated features: the per-period minutes, calls and charge columns that are now covered by the net totals.


In [None]:
def clean_Data(df):
    """Return a cleaned copy of a churn feature frame.

    The day, eve and night columns for minutes, calls and charge are summed
    into ``total_net_minutes``, ``total_net_calls`` and ``total_net_charge``.
    The yes/no strings in ``voice_mail_plan`` and ``international_plan`` are
    mapped to 1/0. ``state``, ``area_code`` and the nine per-period columns
    are then dropped.

    The input frame is never modified: all work happens on a copy. The
    function can therefore be called again on the same raw frame, and it
    does not write into a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw feature columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining original columns followed by the
        three ``total_net_*`` columns.
    """
    cleaned = df.copy()

    net_totals = (
        ('total_net_minutes',
         ('total_day_minutes', 'total_eve_minutes', 'total_night_minutes')),
        ('total_net_calls',
         ('total_day_calls', 'total_eve_calls', 'total_night_calls')),
        ('total_net_charge',
         ('total_day_charge', 'total_eve_charge', 'total_night_charge')),
    )

    redundant = ['state', 'area_code']
    for net_name, (day, eve, night) in net_totals:
        # Plain `+` rather than DataFrame.sum so that a missing value in any
        # part still produces NaN in the total.
        cleaned[net_name] = cleaned[day] + cleaned[eve] + cleaned[night]
        redundant.extend((day, eve, night))

    yes_no = {'yes': 1, 'no': 0}
    for flag in ('voice_mail_plan', 'international_plan'):
        cleaned[flag] = cleaned[flag].map(yes_no)

    return cleaned.drop(columns=redundant)



In [None]:
# List the column names of the raw frame (df itself is not passed through
# clean_Data in this notebook).
df.columns

# Splitting the data
* We use scikit-learn to split the data.
* First we split the dataframe into the features `X` and the label `y`.
* Then we use the `train_test_split` function to split the data into training and validation sets.
* `random_state` is set so that the same split is produced each time the notebook runs.
* `stratify` is used so that both sets keep the same proportion of churn labels.


In [None]:
from sklearn.model_selection import train_test_split 

# Separate the label column from the feature columns.
y = df['churn']
X = df.drop(columns=['churn'])

# Hold out 25% of the rows for validation, stratified on the label, with a
# fixed seed so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)
y_train.value_counts(), y_val.value_counts()

# Perform the cleaning by calling the clean function

In [None]:
# Run both feature frames through the shared cleaning routine.
x_train, x_val = clean_Data(x_train), clean_Data(x_val)

# Replace the label values with integer category codes.
# NOTE(review): the codes follow the category order pandas infers in each
# split; presumably 'no' -> 0 and 'yes' -> 1 - confirm against the data.
y_train = pd.Categorical(y_train).codes
y_val = pd.Categorical(y_val).codes

# These two lines inspect `df`, the raw frame, not the cleaned splits.
print(df['international_plan'].value_counts())
df.head()

# Display histograms of all numerical columns in the training data

In [None]:
!pip install seaborn
import seaborn as sns
sns.set_style('dark')

# Columns of the cleaned training frame to plot, one histogram each.
hist_columns = [
    'account_length',
    'international_plan',
    'voice_mail_plan',
    'number_vmail_messages',
    'total_net_minutes',
    'total_net_calls',
    'total_net_charge',
    'total_intl_minutes',
    'total_intl_calls',
    'total_intl_charge',
    'number_customer_service_calls',
]
temp = x_train[hist_columns]
# The trailing semicolon suppresses the notebook's textual output of the call.
temp.hist(bins=50, figsize=(20, 20), color='navy');

# Create some models

## First we start with a simple model: LogisticRegression
* It performs well, but it is not the best of the models tried here.

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can chain.
lr = LogisticRegression(max_iter=300).fit(x_train, y_train)
# Accuracy on the training data.
lr.score(x_train, y_train)

In [None]:
# Validation accuracy: share of validation rows whose prediction equals the
# label.
pre = lr.predict(x_val)
matches = sum(1 for predicted, actual in zip(pre, y_val) if predicted == actual)

score = matches / len(y_val)
score

In [None]:
# Record the logistic-regression experiment: datasets, metric, model, params.
mlf_run = mlf_api.create_run(
    project_name='customer-churn-kaggle-project',
    run_name='logistic-regression',
)

mlf_run.log_dataset(dataset_name='raw_dataset', features=df, only_stats=False)

# Log each split together with this model's predictions on it.
for split_name, split_features, split_actuals in (
    ('train', x_train, y_train),
    ('val', x_val, y_val),
):
    mlf_run.log_dataset(
        dataset_name=split_name,
        features=split_features,
        predictions=lr.predict(split_features),
        actuals=split_actuals,
        only_stats=False,
    )

mlf_run.log_metrics({'score': score})
mlf_run.log_model(lr, framework=mlf.ModelFramework.SKLEARN)
mlf_run.log_params(lr.get_params())



## Next we try RandomForestClassifier
**After logistic regression, we use the random forest classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Default hyper-parameters; fit() returns the estimator, so the call chains.
clf = RandomForestClassifier().fit(x_train, y_train)
# Accuracy on the training data.
clf.score(x_train, y_train)

In [None]:
from io import StringIO

# Serialise the first validation row to JSON and parse it back into a frame.
p = x_val.head(1).to_json()
# Passing a literal JSON string to read_json is deprecated in pandas >= 2.1;
# wrapping it in StringIO works on both old and new versions.
p_df = pd.read_json(StringIO(p))
p_df

In [None]:
def new_predict():
    """Placeholder for a prediction helper that has not been written yet.

    NOTE(review): presumably meant to run a trained model on a new sample -
    confirm the intended inputs and outputs before implementing.

    Raises
    ------
    NotImplementedError
        Always. A body is required for the cell to parse, and raising makes
        an accidental call fail loudly instead of returning None.
    """
    raise NotImplementedError("new_predict has not been implemented yet")

In [None]:
# Validation accuracy of the random forest: share of validation rows whose
# prediction equals the label.
pre = clf.predict(x_val)
matches = sum(1 for predicted, actual in zip(pre, y_val) if predicted == actual)

score = matches / len(y_val)
score

In [None]:
# Record the random-forest experiment: datasets, metric, model, params.
mlf_run = mlf_api.create_run(
    project_name='customer-churn-kaggle-project',
    run_name='random-forest',
)

mlf_run.log_dataset(
    dataset_name='raw_dataset',
    features=df,
    only_stats=False,
)

# The predictions logged with each split must come from the model this run
# describes (clf), not from the logistic regression fitted earlier.
mlf_run.log_dataset(
    dataset_name='train',
    features=x_train,
    predictions=clf.predict(x_train),
    actuals=y_train,
    only_stats=False,
)

mlf_run.log_dataset(
    dataset_name='val',
    features=x_val,
    predictions=clf.predict(x_val),
    actuals=y_val,
    only_stats=False,
)


mlf_run.log_metrics({'score': score})
mlf_run.log_model(clf, framework=mlf.ModelFramework.SKLEARN)
mlf_run.log_params(clf.get_params())


## Lastly we use GradientBoostingClassifier


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn gradient boosting (the variable is named `xg`, but this is not
# the xgboost library). fit() returns the estimator, so the call chains.
xg = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=400,
    max_depth=13,
).fit(x_train, y_train)
# Accuracy on the training data.
xg.score(x_train, y_train)

In [None]:
# Validation accuracy of the gradient boosting model: share of validation
# rows whose prediction equals the label.
pre = xg.predict(x_val)
matches = sum(1 for predicted, actual in zip(pre, y_val) if predicted == actual)

score = matches / len(y_val)
score

In [None]:
# Record the gradient-boosting experiment: datasets, metric, model, params.
# The run is named 'xgboost', although the model is scikit-learn's
# GradientBoostingClassifier.
mlf_run = mlf_api.create_run(
    project_name='customer-churn-kaggle-project',
    run_name='xgboost',
)

mlf_run.log_dataset(
    dataset_name='raw_dataset',
    features=df,
    only_stats=False,
)

# The predictions logged with each split must come from the model this run
# describes (xg), not from the logistic regression fitted earlier.
mlf_run.log_dataset(
    dataset_name='train',
    features=x_train,
    predictions=xg.predict(x_train),
    actuals=y_train,
    only_stats=False,
)

mlf_run.log_dataset(
    dataset_name='val',
    features=x_val,
    predictions=xg.predict(x_val),
    actuals=y_val,
    only_stats=False,
)


mlf_run.log_metrics({'score': score})
mlf_run.log_model(xg, framework=mlf.ModelFramework.SKLEARN)
mlf_run.log_params(xg.get_params())



In [None]:
# Sanity check: shapes of the training features, the training labels and the
# logistic-regression predictions on the training features, side by side.
x_train.shape, y_train.shape, lr.predict(x_train).shape

# loading the test data 

In [None]:
# Read the test set from test.csv (path relative to the working directory).
test = pd.read_csv('test.csv')
# The `id` column is removed before the frame is used as model input.
x_test = test.drop(columns='id')

# Test cleaning 
**clean the test data using the same function we used for train data cleaning**

In [None]:
# Apply the same cleaning routine that was used on the train and validation
# features, then preview the result.
x_test = clean_Data(x_test)
x_test.head()

# Use the models to predict the values
## First use the gradient boosting model (`xg`) to predict the test values
After the model predicts the test data, we save the values in a CSV file to use in the submission.

In [None]:
# Predict with the gradient boosting model and show the first few outputs.
pre = xg.predict(x_test)
print(pre[:5])

# Start from the sample submission file and overwrite its `churn` column with
# the predictions, turned back into 'yes'/'no' strings.
code_to_label = {1: 'yes', 0: 'no'}
ansXG = pd.read_csv('../input/customer-churn-prediction-2020/sampleSubmission.csv')
ansXG['churn'] = pre
ansXG['churn'] = ansXG['churn'].map(code_to_label)

ansXG.to_csv('sampleSubmissionXG.csv', index=False)
ansXG.head()

## Then use the random forest


In [None]:
# Predict with the random forest and show the first few outputs.
pre = clf.predict(x_test)
print(pre[:5])

# Start from the sample submission file and overwrite its `churn` column with
# the predictions, turned back into 'yes'/'no' strings.
code_to_label = {1: 'yes', 0: 'no'}
ansCLF = pd.read_csv('../input/customer-churn-prediction-2020/sampleSubmission.csv')
ansCLF['churn'] = pre
ansCLF['churn'] = ansCLF['churn'].map(code_to_label)

ansCLF.to_csv('sampleSubmissionCLF.csv', index=False)
ansCLF.head()

**Both submission files are produced to check which model is better for the data, but both turned out to have the same accuracy value.**

# model saving
Using joblib to dump the models into joblib files with the model name 

In [None]:
from joblib import dump
# Persist the fitted random forest and gradient boosting models to the
# working directory, one joblib file per model.
dump(clf, 'clf.joblib') 
dump(xg, 'xg.joblib') 