In [1]:
import pandas as pd
import numpy as np
import os

# 1. Load Dataset
Load the dataset using Python libraries such as Pandas.

In [2]:
# Locate the "data" folder relative to the directory the notebook runs from.
current_dir = os.getcwd()
data_folder = os.path.join(current_dir, "data")

# Read the two pre-split CSV files in one pass: the "80" file becomes the
# training frame and the "20" file becomes the test frame.
train_data, test_data = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in ("churn-bigml-80.csv", "churn-bigml-20.csv")
)

# Show the first rows of the test frame as the cell's rich output.
test_data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


In [3]:
# Basic data inspection
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray "None"
# line after each summary.
print("Train data info:")
train_data.info()
print("Test data info:")
test_data.info()

Train data shape: (2666, 20)
Test data shape: (667, 20)
Train data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls   

Nothing out of the ordinary

# 2. Data Cleaning
Handle missing values and perform data cleaning, if necessary. 

In [4]:

# Per-column count of missing entries in each split.
print("Missing values in train_data:\n", train_data.isna().sum())
print("\nMissing values in test_data:\n", test_data.isna().sum())

# Per-column count of entries equal to 0, to spot suspicious zero usage.
# NOTE(review): if Churn is boolean, False compares equal to 0, so its count
# here would be the number of non-churners rather than a data problem.
print("\n0 values in train_data:\n", train_data.eq(0).sum())
print("\n0 values in test_data:\n", test_data.eq(0).sum())

# Number of distinct values in every object-typed (string) column.
print(
    "\nCategorical values in train_data:\n",
    train_data.select_dtypes(include=['object']).nunique(),
)
print(
    "\nCategorical values in test_data:\n",
    test_data.select_dtypes(include=['object']).nunique(),
)

Missing values in train_data:
 State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64

Missing values in test_data:
 State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
T

No null values.

There seem to be one or two rows in the training data with 0 minutes/calls/charges for a whole time-of-day period; I will inspect those.

51 states? There should only be 50.

In [5]:
# Pick out the training rows whose 'Total day minutes' is exactly 0 so the
# zero-usage accounts flagged above can be looked at individually.
no_day_minutes = train_data['Total day minutes'].eq(0)
outlier_row = train_data.loc[no_day_minutes]
print("\nOutlier row in train_data:\n", outlier_row)


Outlier row in train_data:
      State  Account length  Area code International plan Voice mail plan  \
1057    SD              98        415                 No              No   
1100    VT             101        510                 No              No   

      Number vmail messages  Total day minutes  Total day calls  \
1057                      0                0.0                0   
1100                      0                0.0                0   

      Total day charge  Total eve minutes  Total eve calls  Total eve charge  \
1057               0.0              159.6              130             13.57   
1100               0.0              192.1              119             16.33   

      Total night minutes  Total night calls  Total night charge  \
1057                167.1                 88                7.52   
1100                168.8                 95                7.60   

      Total intl minutes  Total intl calls  Total intl charge  \
1057                 6.8     

Looks like two accounts never made day calls, and one other account never made evening calls. That seems normal, so no need to remove anything.

In [6]:
# List the distinct State codes in each split (in order of first appearance)
# to see why the column has one more value than expected.
train_states = train_data['State'].unique()
test_states = test_data['State'].unique()

print("\nUnique values in State column of train_data:\n", train_states)
print("\nUnique values in State column of test_data:\n", test_states)


Unique values in State column of train_data:
 ['KS' 'OH' 'NJ' 'OK' 'AL' 'MA' 'MO' 'WV' 'RI' 'IA' 'MT' 'ID' 'VT' 'VA'
 'TX' 'FL' 'CO' 'AZ' 'NE' 'WY' 'IL' 'NH' 'LA' 'GA' 'AK' 'MD' 'AR' 'WI'
 'OR' 'DE' 'IN' 'UT' 'CA' 'SD' 'NC' 'WA' 'MN' 'NM' 'NV' 'DC' 'NY' 'KY'
 'ME' 'MS' 'MI' 'SC' 'TN' 'PA' 'HI' 'ND' 'CT']

Unique values in State column of test_data:
 ['LA' 'IN' 'NY' 'SC' 'HI' 'AK' 'MI' 'ID' 'VA' 'WI' 'MN' 'VT' 'MT' 'MA'
 'KY' 'CO' 'AZ' 'CA' 'WA' 'NE' 'OH' 'MO' 'AL' 'NH' 'NM' 'OR' 'TX' 'MS'
 'WY' 'FL' 'KS' 'NC' 'SD' 'OK' 'CT' 'RI' 'DE' 'UT' 'NV' 'DC' 'ME' 'IL'
 'NJ' 'MD' 'WV' 'PA' 'ND' 'AR' 'TN' 'IA' 'GA']


The 51st value is DC, for the District of Columbia (Washington, D.C.), which makes sense. No need to remove anything here either.

# 3. Encoding
Encode categorical variables using techniques like one-hot or unique integer encoding.