In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Task 1: Import the dataset
df = pd.read_csv("telecom_customer_churn.csv")

# Task 2: Explore the dataset
print(df.head())  # View the first few rows
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print(df.describe())  # Descriptive statistics

# Tasks 3-5: clean the data. The steps run in the same order as before
# (fill -> de-duplicate -> standardize text) but as one chain that rebinds
# `df`, so re-running the cell never depends on earlier in-place mutation.
#
# NOTE(review): forward fill copies the previous row's value into every
# missing cell of every column, including 'Churn Category', which is used as
# the prediction target later in this notebook. Confirm that is intended.
# Missing values in the very first row have nothing to copy from and stay NaN.
df = (
    df
    # Task 3: Handle missing values. fillna(method='ffill') is deprecated in
    # current pandas; DataFrame.ffill() is the supported equivalent.
    .ffill()
    # Task 4: Remove duplicates (exact duplicate rows; first occurrence kept).
    .drop_duplicates()
    # Task 5: Check for inconsistent data -- standardize gender to lowercase.
    # Using assign() avoids writing into the frame returned by drop_duplicates().
    .assign(Gender=lambda frame: frame['Gender'].str.lower())
)



  Customer ID  Gender  Age Married  Number of Dependents          City  \
0  0002-ORFBO  Female   37     Yes                     0  Frazier Park   
1  0003-MKNFE    Male   46      No                     0      Glendale   
2  0004-TLHLJ    Male   50      No                     0    Costa Mesa   
3  0011-IGKFF    Male   78     Yes                     0      Martinez   
4  0013-EXCHZ  Female   75     Yes                     0     Camarillo   

   Zip Code   Latitude   Longitude  Number of Referrals  ...   Payment Method  \
0     93225  34.827662 -118.999073                    2  ...      Credit Card   
1     91206  34.162515 -118.203869                    0  ...      Credit Card   
2     92627  33.645672 -117.922613                    0  ...  Bank Withdrawal   
3     94553  38.014457 -122.115432                    1  ...  Bank Withdrawal   
4     93010  34.227846 -119.079903                    3  ...      Credit Card   

  Monthly Charge Total Charges  Total Refunds Total Extra Data Charg

In [2]:
# Task 6: Convert columns to the correct data types if needed
# Entries that cannot be parsed become NaN (errors='coerce'). The raw
# 'Total Charges' column is kept as-is next to the numeric 'TotalCharges'.
df['TotalCharges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

# Task 7: Identify and handle outliers (you can customize this based on your needs)
# The z-score is computed entirely from the coerced numeric column; mixing it
# with the mean/std of the raw column breaks as soon as the raw column is not
# numeric.
Z_SCORE_LIMIT = 3
total_charges = df['TotalCharges']
z_scores = (total_charges - total_charges.mean()) / total_charges.std()
# Rows whose z-score is NaN (unparseable 'Total Charges') fail the comparison
# and are dropped together with the outliers.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
df = df[z_scores.abs() < Z_SCORE_LIMIT].copy()

# Task 8: Perform feature engineering (create new features)
# Going by its name, 'Tenure in Months' is already in months; multiplying by
# 30 yields an approximate tenure in days. It was previously stored under
# 'TenureinMonths' and then overwritten by the scaling step, so the feature
# never reached the output. It now has its own, accurately named column.
DAYS_PER_MONTH = 30
df['TenureInDays'] = df['Tenure in Months'] * DAYS_PER_MONTH

# Task 9: Normalize or scale the data
# The scaler is fit on the training rows only, so test-set statistics do not
# leak into the features. train_test_split is deterministic for a given row
# count and random_state, so this preliminary split selects exactly the rows
# that end up in X_train in Task 10 below.
SPLIT_OPTIONS = {'test_size': 0.2, 'random_state': 42}
numeric_inputs = ['Monthly Charge', 'TotalCharges', 'Tenure in Months']
scaled_outputs = ['MonthlyCharge', 'TotalCharges', 'TenureinMonths']

train_rows, _ = train_test_split(df, **SPLIT_OPTIONS)
scaler = StandardScaler()
scaler.fit(train_rows[numeric_inputs])
# 'TotalCharges' is both an input and an output: the numeric column is
# replaced by its standardized version, as before.
df[scaled_outputs] = scaler.transform(df[numeric_inputs])

# Task 10: Split the dataset into training and testing sets
# NOTE(review): X still contains identifier and text columns (e.g.
# 'Customer ID', 'City') as well as the unscaled originals; drop or encode
# them before fitting a model.
X = df.drop('Churn Category', axis=1)
y = df['Churn Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, **SPLIT_OPTIONS)

# Task 11: Export the cleaned dataset
df.to_csv("Cleaned_Telecom_Customer_Churn.csv", index=False)
