In [None]:
#Carga de las librerías
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV

import warnings
# Install a blanket "ignore" filter: every warning raised after this point is
# silenced for the rest of the session.
# NOTE(review): this also hides warnings emitted by the libraries used below
# (e.g. scikit-learn); consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the Telco customer-churn CSV directly from its raw GitHub URL.
# The file is comma-separated, which is already read_csv's default delimiter.
url = 'https://raw.githubusercontent.com/Geerdata/DS/main/Datacoder/Arc.%20Modelo/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(url)

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Split the frame into predictors (X) and target (y).
# X is an independent copy of four selected columns; y is the "Churn" column.
feature_columns = ['InternetService', 'TotalCharges', 'Contract', 'PaymentMethod']
X = data.loc[:, feature_columns].copy()
y = data.loc[:, "Churn"]

In [None]:
# One-hot encode the non-numeric columns of X; numeric columns pass through
# unchanged. Note that lowercase `x` (encoded) is a different object from
# uppercase `X` (the raw column selection).
# NOTE(review): confirm TotalCharges is read as a numeric dtype - if it arrives
# as strings, get_dummies would expand it into one indicator column per
# distinct value.
x = pd.get_dummies(X)

In [None]:
# Hold out 30% of the rows as a test set. The split is stratified on y so both
# partitions keep the same class proportions, and the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

In [None]:
# Show how many test-set rows fall in each target class.
y_test.value_counts()

Churn
No     1552
Yes     561
Name: count, dtype: int64

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transformation is then applied to both splits.
# NOTE(review): the original comment justified this step by saying it helps PCA,
# but no PCA step is visible in this part of the notebook - confirm whether one
# exists elsewhere or whether the scaling is still needed.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Baseline model: a 100-tree random forest with balanced class weights and
# log2 feature sub-sampling, seeded for repeatable results.
model = RandomForestClassifier(
    n_estimators=100,
    max_features="log2",
    class_weight="balanced",
    random_state=42,
)
# Fit on the training split; the fitted estimator is the cell's output.
model.fit(X_train, y_train)

In [None]:
# Predict class labels for the hold-out (test) split with the fitted model.
y_test_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the test predictions, as a text table.
from sklearn.metrics import classification_report

reporte = classification_report(y_true=y_test, y_pred=y_test_pred)
print(reporte)

              precision    recall  f1-score   support

          No       0.82      0.87      0.85      1552
         Yes       0.58      0.48      0.53       561

    accuracy                           0.77      2113
   macro avg       0.70      0.68      0.69      2113
weighted avg       0.76      0.77      0.76      2113



**Randomized Search CV**

In [None]:
# Hyper-parameter search space: three tree depths crossed with two split
# criteria (six combinations in total).
params_grid = {
    'max_depth': list(range(5, 8)),
    'criterion': ['entropy', 'gini'],
}

In [None]:
# Hyper-parameter search for `model` over `params_grid`, scored by accuracy
# with 3-fold cross-validation on the training split.
# (Original author's note: run time is roughly 7 seconds.)
#
# RandomizedSearchCV defaults to n_iter=10, which is more than the number of
# combinations in this search space, so n_iter is set to the actual number of
# candidates instead of relying on the library's silent fallback.
# This assumes every entry of params_grid is a finite list of values.
n_candidates = 1
for candidate_values in params_grid.values():
    n_candidates *= len(candidate_values)

# random_state fixes the order in which candidates are drawn, so the selected
# configuration is reproducible across runs (including when scores tie).
grid_cv = RandomizedSearchCV(
    model,
    params_grid,
    n_iter=n_candidates,
    scoring="accuracy",
    n_jobs=-1,
    cv=3,
    random_state=42,
)
grid_cv.fit(X_train, y_train)

# Report the winning configuration, its mean CV score, and the accuracy of the
# refitted best estimator on the hold-out split.
# NOTE(review): the recorded run reports a test accuracy of about 0.71, below
# the 0.77 of the untuned forest - check whether accuracy is the right metric
# to optimise here.
print("Mejores parametros", grid_cv.best_params_)
print("Mejor score de CV", grid_cv.best_score_)
print(f'Accuracy del modelo = {round(accuracy_score(y_test, grid_cv.predict(X_test)), 5)}')



Mejores parametros {'max_depth': 6, 'criterion': 'gini'}
Mejor score de CV 0.7127852981929778
Accuracy del modelo = 0.70989
