# **📌 Extracción (E - Extract)**

In [1]:
import json
import pandas as pd

In [4]:
# Google Colab upload helper; this import only resolves inside a Colab runtime.
from google.colab import files
# Interactive upload step. `uploaded` is later indexed by file name
# ('TelecomX_Data.json') to obtain that file's raw bytes.
uploaded = files.upload()

Saving TelecomX_Data.json to TelecomX_Data.json


In [5]:
import io

# Expose the uploaded bytes as an in-memory binary stream and parse them as JSON.
with io.BytesIO(uploaded['TelecomX_Data.json']) as json_stream:
    data = json.load(json_stream)

In [6]:
# Flatten the nested JSON records into one row per record, joining nested keys with '.'.
df = pd.json_normalize(data, sep='.', record_prefix=None)

total_col = 'account.Charges.Total'

# Steps 1-2: trim surrounding whitespace, then mark strings that end up empty as missing.
total_text = df[total_col].str.strip().replace('', pd.NA)

# Step 3: convert to a numeric dtype; anything that cannot be parsed becomes NaN.
df[total_col] = pd.to_numeric(total_text, errors='coerce')

# Step 4: report the resulting dtype and how many values are missing after conversion.
print("Tipo de dato final:", df[total_col].dtype)
print("Valores nulos tras conversión:", df[total_col].isna().sum())

Tipo de dato final: float64
Valores nulos tras conversión: 11


In [7]:
# Print the (rows, columns) shape, then show the first five rows as the cell's output.
print(df.shape)
df.head(n=5)

(7267, 21)


Unnamed: 0,customerID,Churn,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,...,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


# **🔧 Transformación (T - Transform)**

In [8]:
# Show each column's dtype and non-null count.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own: wrapping it in print() appends a stray "None" line.
print("Tipos de datos y valores no nulos:")
df.info()


Tipos de datos y valores no nulos:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerID                 7267 non-null   object 
 1   Churn                      7267 non-null   object 
 2   customer.gender            7267 non-null   object 
 3   customer.SeniorCitizen     7267 non-null   int64  
 4   customer.Partner           7267 non-null   object 
 5   customer.Dependents        7267 non-null   object 
 6   customer.tenure            7267 non-null   int64  
 7   phone.PhoneService         7267 non-null   object 
 8   phone.MultipleLines        7267 non-null   object 
 9   internet.InternetService   7267 non-null   object 
 10  internet.OnlineSecurity    7267 non-null   object 
 11  internet.OnlineBackup      7267 non-null   object 
 12  internet.DeviceProtection  7267 non-null   object 
 13  internet.Tech

In [9]:
# Summary (count / unique / top / freq) of the object- and bool-typed columns.
categorical_summary = df.describe(include=['object', 'bool'])
print("\nResumen categórico:")
print(categorical_summary)


Resumen categórico:
        customerID Churn customer.gender customer.Partner customer.Dependents  \
count         7267  7267            7267             7267                7267   
unique        7267     3               2                2                   2   
top     9995-HOTOH    No            Male               No                  No   
freq             1  5174            3675             3749                5086   

       phone.PhoneService phone.MultipleLines internet.InternetService  \
count                7267                7267                     7267   
unique                  2                   3                        3   
top                   Yes                  No              Fiber optic   
freq                 6560                3495                     3198   

       internet.OnlineSecurity internet.OnlineBackup  \
count                     7267                  7267   
unique                       3                     3   
top                         No    

In [10]:
# Descriptive statistics for the numeric columns (describe() with its default selection).
numeric_summary = df.describe()
print("\nResumen estadístico (numéricas):")
print(numeric_summary)


Resumen estadístico (numéricas):
       customer.SeniorCitizen  customer.tenure  account.Charges.Monthly  \
count             7267.000000      7267.000000              7267.000000   
mean                 0.162653        32.346498                64.720098   
std                  0.369074        24.571773                30.129572   
min                  0.000000         0.000000                18.250000   
25%                  0.000000         9.000000                35.425000   
50%                  0.000000        29.000000                70.300000   
75%                  0.000000        55.000000                89.875000   
max                  1.000000        72.000000               118.750000   

       account.Charges.Total  
count            7256.000000  
mean             2280.634213  
std              2268.632997  
min                18.800000  
25%               400.225000  
50%              1391.000000  
75%              3785.300000  
max              8684.800000  


In [11]:
# 1. Variable names listed in the data dictionary, in their original order.
#    The group comments follow the prefixes of the flattened DataFrame columns.
variables_diccionario = [
    # Top-level fields
    "customerID",
    "Churn",
    # customer.*
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "tenure",
    # phone.*
    "PhoneService",
    "MultipleLines",
    # internet.*
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    # account.*
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "Charges.Monthly",
    "Charges.Total",
]


In [13]:
# Extended dictionary: every dictionary variable plus its last dot-separated
# segment. Its definition is left unchanged in case another cell reads it.
diccionario_extendido = set(variables_diccionario + [var.split('.')[-1] for var in variables_diccionario])

# 2. Compare the DataFrame columns against the dictionary.
# A flattened column corresponds to a dictionary variable when it is exactly
# that variable or ends with ".<variable>". Matching on the full dotted suffix
# (instead of only the last segment) means the report names a real dictionary
# entry such as 'Charges.Monthly' rather than the bare segment 'Monthly', and a
# column that merely shares a last segment with some entry is not accepted.
print("Columnas en el DataFrame vs. Diccionario:")
for col in df.columns:
    variable = next(
        (var for var in variables_diccionario if col == var or col.endswith('.' + var)),
        None,
    )
    if variable is not None:
        print(f"✅ {col} -> Corresponde a '{variable}' en el diccionario")
    else:
        print(f"❌ {col} -> No está en el diccionario")

Columnas en el DataFrame vs. Diccionario:
✅ customerID -> Corresponde a 'customerID' en el diccionario
✅ Churn -> Corresponde a 'Churn' en el diccionario
✅ customer.gender -> Corresponde a 'gender' en el diccionario
✅ customer.SeniorCitizen -> Corresponde a 'SeniorCitizen' en el diccionario
✅ customer.Partner -> Corresponde a 'Partner' en el diccionario
✅ customer.Dependents -> Corresponde a 'Dependents' en el diccionario
✅ customer.tenure -> Corresponde a 'tenure' en el diccionario
✅ phone.PhoneService -> Corresponde a 'PhoneService' en el diccionario
✅ phone.MultipleLines -> Corresponde a 'MultipleLines' en el diccionario
✅ internet.InternetService -> Corresponde a 'InternetService' en el diccionario
✅ internet.OnlineSecurity -> Corresponde a 'OnlineSecurity' en el diccionario
✅ internet.OnlineBackup -> Corresponde a 'OnlineBackup' en el diccionario
✅ internet.DeviceProtection -> Corresponde a 'DeviceProtection' en el diccionario
✅ internet.TechSupport -> Corresponde a 'TechSupport' 

In [14]:
# Columns kept for the churn analysis; the notes record why each one was chosen.
columnas_relevantes = [
    'Churn',                     # target
    'customer.SeniorCitizen',    # do senior customers churn more?
    'customer.tenure',           # tenure
    'internet.InternetService',  # service type (DSL / Fiber optic)
    'account.Contract',          # contract type
    'account.Charges.Monthly',   # monthly cost
]

# Independent copy of just those columns, so later edits do not touch `df`.
df_churn = df.loc[:, columnas_relevantes].copy()
