In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "darkgrid" style for the plots drawn in this notebook.
sns.set_style("darkgrid")

In [10]:
# Show the kernel's working directory so file paths can be sanity-checked.
# os.getcwd() replaces the bare `pwd()` call: no `pwd` is defined or imported
# in the visible cells, so that call presumably only worked through IPython's
# automagic, which is fragile (it stops working if a variable named `pwd`
# exists). NOTE(review): the rendered output exposes a local home directory;
# consider clearing it before sharing the notebook.
os.getcwd()

'/home/sunilmishraji01/sunil/Project_Kaggle/Churn_Prediction/notebook'

## 1) Let's load the Telco-Customer-Churn dataset

In [4]:
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere as-is. The directory here is "Churn-Prediction" (hyphen), while the
# working directory shown by the earlier pwd cell is "Churn_Prediction"
# (underscore); confirm which one is intended before switching to a relative
# path.
path = "/home/sunilmishraji01/sunil/Project_Kaggle/Churn-Prediction/input/Telco-Customer-Churn.csv"
# Read the raw Telco customer churn CSV into `df`.
df = pd.read_csv(path)

In [5]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## About attributes:
* **Prediction column (Target Variable):**

**`Churn`** : Whether the customer churned or not (Yes or No)

* **3 numerical columns:**

**`1. SeniorCitizen :`** Whether the customer is a senior citizen or not (1, 0)

**`2. tenure :`** Number of months the customer has stayed with the company

**`3. MonthlyCharges :`** The amount charged to the customer monthly


* **17 categorical columns:**

**`1. customerID :`** Customer ID unique for each customer

**`2. gender :`** The customer's gender (Male, Female)

**`3. Partner :`** Whether the customer has a partner or not (Yes, No)

**`4. Dependents :`** Whether the customer has dependents or not (Yes, No)

**`5. PhoneService :`** Whether the customer has a phone service or not (Yes, No)

**`6. MultipleLines :`** Whether the customer has multiple lines or not (Yes, No, No phone service)

**`7. InternetService :`** Customer’s internet service provider (DSL, Fiber optic, No)

**`8. OnlineSecurity :`** Whether the customer has online security or not (Yes, No, No internet service)

**`9. OnlineBackup :`** Whether the customer has online backup or not (Yes, No, No internet service)

**`10. DeviceProtection :`** Whether the customer has device protection or not (Yes, No, No internet service)

**`11. TechSupport :`** Whether the customer has tech support or not (Yes, No, No internet service)

**`12. StreamingTV :`** Whether the customer has streaming TV or not (Yes, No, No internet service)

**`13. StreamingMovies :`** Whether the customer has streaming movies or not (Yes, No, No internet service)

**`14. Contract :`** The contract term of the customer (Month-to-month, One year, Two year)

**`15. PaperlessBilling :`** Whether the customer has paperless billing or not (Yes, No)

**`16. PaymentMethod :`** The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))

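**`17. TotalCharges :`** The total amount charged to the customer (a numeric quantity, but it is not loaded as a numeric column, so it is converted with `pd.to_numeric` in the EDA section below)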

## 2) Exploratory Data Analysis

In [6]:
# Inspect each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
# Summary statistics; by default describe() reports only the numeric columns.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [8]:
# Parse `TotalCharges` into a numeric dtype.
# errors="coerce" turns any value that cannot be parsed into NaN instead of
# raising, so such rows show up in the missing-data check that follows.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [9]:
# check missing data:
def missing_data(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with two columns: ``Total`` (number of null
        cells in that column) and ``Percent`` (nulls as a percentage of the
        row count, rounded to 3 decimals). Only columns with at least one
        null value are included.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum().sort_values(ascending=False)
    # count() on the boolean mask gives the number of rows for every column.
    null_share = null_counts * 100 / null_mask.count()
    null_share = null_share.sort_values(ascending=False).round(decimals=3)
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=["Total", "Percent"],
    )
    has_nulls = summary["Total"] >= 1
    return summary[has_nulls]
# Show only the columns of `df` that have at least one missing value.
missing_data(df)

Unnamed: 0,Total,Percent
TotalCharges,11,0.156
