In [4]:
import sys
import pandas as pd
import numpy as np
import xgboost as xgb
import yaml
from sklearn.metrics import mutual_info_score
from xgboost import XGBClassifier
from typing import Dict
# Show the interpreter version this notebook is running under.
print("User Current Version:-", sys.version)

User Current Version:- 3.8.15 (default, Nov 24 2022, 09:04:07) 
[Clang 14.0.6 ]


In [7]:
def parse_cfg(path: str = "../config/catalog.yml") -> Dict:
    """Read the project configuration from a YAML file.

    Args:
        path: Location of the YAML config. Defaults to the catalog file this
            notebook has always read; a relative path is resolved against the
            current working directory.

    Returns:
        dict: with keys representing the parameters. An empty file yields an
        empty dict instead of ``None``.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as yamlfile:
        # NOTE(review): this config is a local, trusted file. If it is ever
        # sourced from outside the repo, prefer yaml.safe_load here.
        cfg = yaml.load(yamlfile, Loader=yaml.FullLoader)
    # yaml.load returns None for an empty document; honour the Dict contract so
    # callers can index the result without a None check.
    return cfg if cfg is not None else {}

In [8]:
# Load the config and display it as the cell output.
parse_cfg()

{'data_local_directory': {'path': './data/telco_customer_churn.csv'},
 'split_size': 0.2,
 'RS': 42,
 'project_id': 'sacred-garden-369506',
 'dataset_id': 'personal_project',
 'table_id': 'telco-customer_churn',
 'cat_columns': ['gender',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod']}

In [None]:
from google.cloud import bigquery

# Construct a BigQuery client object from the service-account key file.
# NOTE(review): the key file name is hard-coded here; make sure the file itself
# stays out of version control.
client = bigquery.Client.from_service_account_json("../config/sacred-garden-369506-870f85c5921d.json")

# Build the table reference from the shared config instead of repeating the
# project/dataset/table identifiers inline, so the two cannot drift apart.
cfg = parse_cfg()
query = f"""
    SELECT *
    FROM `{cfg['project_id']}.{cfg['dataset_id']}.{cfg['table_id']}`
"""

# Despite its name, `query_job` holds the cleaned DataFrame, not a job handle;
# the name is kept so any later reference keeps working.
query_job = (
    client
    .query(query).to_dataframe()
    # Rows whose TotalCharges is a single blank cannot be cast to float.
    .query("TotalCharges!=' '")
    .assign(TotalCharges = lambda x:x.TotalCharges.astype(float))
    # SeniorCitizen is converted to str here and cast back to int when the
    # model matrix is assembled.
    .assign(SeniorCitizen = lambda x:x.SeniorCitizen.astype(str))
    .reset_index(drop=True)
)

# Drop the customer identifier before any analysis or modelling.
df = query_job.drop(columns='customerID')


In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Names of the columns that are not float64/Int64, minus the identifier and the
# target; the original column order is preserved.
cat_list = (
    df.select_dtypes(exclude=['float64', 'Int64'])
    .columns
    .drop(['customerID', 'Churn'], errors='ignore')
    .tolist()
)

In [None]:
# Score every categorical column by its mutual information with the Churn label.
mutual_info_dict = {
    column: mutual_info_score(df[column], df["Churn"])
    for column in cat_list
}

In [None]:
# Display the scores ordered from most to least informative.
{
    feature: score
    for feature, score in sorted(
        mutual_info_dict.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
}

In [None]:
# Display the columns treated as categorical.
cat_list

In [None]:
def _one_hot(column: str) -> pd.DataFrame:
    """Expand ``df[column]`` into indicator columns named ``<column>:<value>``."""
    return pd.get_dummies(df[column], prefix=column, prefix_sep=':')


# One indicator frame per categorical feature. The individual names are kept
# because the concatenation cell refers to each of them.
gender_encoded = _one_hot('gender')
partner_encoded = _one_hot('Partner')
dependents_encoded = _one_hot('Dependents')
phone_service_encoded = _one_hot('PhoneService')
multiple_lines_encoded = _one_hot('MultipleLines')
internet_service_encoded = _one_hot('InternetService')
online_security_encoded = _one_hot('OnlineSecurity')
online_backup_encoded = _one_hot('OnlineBackup')
device_protection_encoded = _one_hot('DeviceProtection')
tech_support_encoded = _one_hot('TechSupport')
streaming_tv_encoded = _one_hot('StreamingTV')
streaming_movie_encoded = _one_hot('StreamingMovies')
contract_encoded = _one_hot('Contract')
paperless_billing_encoded = _one_hot('PaperlessBilling')
payment_method_encoded = _one_hot('PaymentMethod')

In [None]:
# Assemble the model matrix: all indicator frames first, then the numeric
# columns and the target, in the same left-to-right order as before.
df_final = pd.concat(
    [
        gender_encoded,
        partner_encoded,
        dependents_encoded,
        phone_service_encoded,
        multiple_lines_encoded,
        internet_service_encoded,
        online_security_encoded,
        online_backup_encoded,
        device_protection_encoded,
        tech_support_encoded,
        streaming_tv_encoded,
        streaming_movie_encoded,
        contract_encoded,
        paperless_billing_encoded,
        payment_method_encoded,
        # SeniorCitizen is cast to int here; the other columns pass through.
        df[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges', 'Churn']]
        .astype({'SeniorCitizen': int}),
    ],
    axis=1,
)

In [None]:
from sklearn import preprocessing
# Replace the Churn labels with the integer codes learned by the encoder.
le = preprocessing.LabelEncoder()
df_final["Churn"] = le.fit_transform(df_final["Churn"])


In [None]:
from sklearn.model_selection import train_test_split
# Separate the feature matrix from the encoded target column.
X, y = df_final.drop(columns='Churn'), df_final['Churn']

# Take the hold-out fraction and the seed from the shared config rather than
# repeating them as literals, so the split always matches the catalog settings.
cfg = parse_cfg()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=cfg['split_size'],
    random_state=cfg['RS'],
)

In [None]:
# Fit a classifier with the library's default hyperparameters on the training split.
xgb_cl = XGBClassifier().fit(X_train, y_train)
# Keep the fitted estimator as the cell's displayed value.
xgb_cl

In [None]:
# Predict labels for the held-out rows with the fitted classifier.
y_pred = xgb_cl.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate the true labels (first argument) against the predictions.
confusion_matrix(y_test,y_pred)

In [None]:
from sklearn.metrics import accuracy_score
# Score the held-out predictions against the true labels.
accuracy_score(y_test,y_pred)