# <font color=''>1.Import the "Telecom_Customer_Churn.csv" dataset</font>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
data = pd.read_csv('TelecomCustomer_Churn.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,750.0,29.85,No
1,5575-GNVDE,Male,0,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,F,0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# <font color=''>2. Explore the dataset to understand its structure and content </font>

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7037 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7042 non-null   object 
 4   Dependents        7042 non-null   object 
 5   tenure            7038 non-null   float64
 6   PhoneService      7042 non-null   object 
 7   MultipleLines     7041 non-null   object 
 8   InternetService   7040 non-null   object 
 9   OnlineSecurity    7042 non-null   object 
 10  OnlineBackup      7040 non-null   object 
 11  DeviceProtection  7041 non-null   object 
 12  TechSupport       7040 non-null   object 
 13  StreamingTV       7040 non-null   object 
 14  StreamingMovies   7041 non-null   object 
 15  Contract          7038 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
data.shape

(7043, 21)

In [5]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7038.0,7041.0,7029.0
mean,0.162005,32.347826,65.236266,2280.425452
std,0.368481,24.564499,34.563849,2265.763259
min,0.0,0.0,10.0,18.8
25%,0.0,9.0,35.5,399.6
50%,0.0,29.0,70.35,1396.0
75%,0.0,55.0,89.85,3784.0
max,1.0,72.0,750.0,8684.8


# 3.Remove any duplicate records from the dataset

In [8]:
data[data.duplicated(subset=['customerID'])].T

Unnamed: 0,14,30,42,60
customerID,9237-HQITU,9237-HQITU,9237-HQITU,9237-HQITU
gender,Female,Female,Female,Female
SeniorCitizen,0,0,0,0
Partner,No,No,No,No
Dependents,No,No,No,No
tenure,2.0,2.0,2.0,2.0
PhoneService,Yes,Yes,Yes,Yes
MultipleLines,No,No,No,No
InternetService,,Fiber optic,Fiber optic,Fiber optic
OnlineSecurity,No,No,No,No


In [9]:
data = data.drop_duplicates(subset=['customerID'])

In [10]:
data.shape

(7039, 21)

# 4.Check for inconsistent data, such as inconsistent formatting or spelling variations, and standardize it

In [11]:
data['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
Male,3543
Female,3466
F,12
M,12


In [12]:
data['gender'] = data['gender'].replace({'F':'Female','M ':'Male'})

In [13]:
data.gender.value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
Male,3555
Female,3478


# 5.Handle missing values in the dataset, deciding on an appropriate strategy

In [14]:
data.isnull().sum()

Unnamed: 0,0
customerID,0
gender,6
SeniorCitizen,0
Partner,1
Dependents,1
tenure,5
PhoneService,1
MultipleLines,2
InternetService,2
OnlineSecurity,1


In [15]:
categorical_cols = ['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','Contract','StreamingMovies','PaymentMethod','Churn']
numercial_cols = ['MonthlyCharges','tenure','TotalCharges']

for col in categorical_cols:
    data[col].fillna(data[col].mode()[0],inplace=True)

for col in numercial_cols:
    data[col].fillna(data[col].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(),inplace=True)


In [16]:
data.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


# 6.Convert columns to the correct data types as needed.


In [None]:
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure              float64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [17]:
data[['tenure','MonthlyCharges','TotalCharges']] = data[['tenure','MonthlyCharges','TotalCharges']].astype('int64')

In [18]:
data.dtypes


Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


# 7. Identify and handle outliers in the data.

In [20]:
threashold = data['MonthlyCharges'].std() * 3.5

In [21]:
# Detecting outliers
outliers = data[(data["MonthlyCharges"] > threashold) | (data["MonthlyCharges"] < -threashold)]
outliers

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,750,29,No
11,7469-LKBCI,Male,0,No,No,32,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),750,326,No
17,9959-WOFKT,Male,0,No,Yes,71,Yes,Yes,Fiber optic,Yes,...,Yes,No,Yes,Yes,Two year,No,Bank transfer (automatic),170,7382,No
19,4183-MYFRB,Female,0,No,No,21,Yes,No,Fiber optic,No,...,Yes,No,No,Yes,Month-to-month,Yes,Electronic check,750,1862,No
26,6467-CHFZW,Male,0,Yes,Yes,47,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,750,4749,Yes
35,6234-RAAPL,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,Yes,...,No,Yes,Yes,No,Month-to-month,No,Bank transfer (automatic),170,7251,No
41,9489-DEDVP,Female,0,Yes,Yes,70,Yes,Yes,DSL,Yes,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),232,4872,No
56,8769-KKTPH,Female,0,Yes,Yes,63,Yes,Yes,Fiber optic,Yes,...,No,No,Yes,Yes,One year,Yes,Credit card (automatic),170,6311,No
59,5954-BDFSG,Male,0,No,No,72,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),350,7853,No
74,5630-AHZIL,Female,0,No,Yes,3,Yes,No,DSL,Yes,...,No,Yes,No,Yes,Month-to-month,Yes,Bank transfer (automatic),170,177,No


In [22]:
# Removed outliers
data = data[(data["MonthlyCharges"] < 3.5 * threashold) & (data["MonthlyCharges"] > -threashold)]

In [None]:
data.shape

(7028, 21)

# 8.Perform feature engineering, creating new features that may be relevant to predicting customer churn

In [None]:
data['early_churn'] = (data['tenure'] <= 12) & (data['Churn'] == 'Yes')

In [None]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,early_churn
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56,1889,No,False
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53,108,Yes,True
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42,1840,No,False
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70,151,Yes,True
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,No,Yes,Yes,Month-to-month,Yes,Electronic check,99,820,Yes,True


# 9. Normalize or scale the data if necessary

In [27]:
from sklearn.preprocessing import Normalizer

norm = Normalizer()
data[['MonthlyCharges','TotalCharges']] = norm.fit_transform(data[['MonthlyCharges','TotalCharges']])

In [28]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,0.029632,0.999561,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,0.440551,0.897727,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),0.02282,0.99974,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,0.420582,0.907255,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,0.119861,0.992791,Yes


# 10.Split the dataset into training and testing sets for further analysis

In [29]:
from sklearn.model_selection import train_test_split

X = data.drop('Churn',axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [30]:
X_train.shape

(4713, 20)

In [31]:
data.shape

(7035, 21)

# 11. Export the cleaned dataset for future analysis or modeling.


In [32]:
X_train.to_csv('Xtrain.csv')
X_test.to_csv('Xtest.csv')