In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the customer churn CSV (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='customer_churn_large_dataset.csv')

In [3]:
# Preview the loaded frame; pandas truncates the display to the first and last rows.
data

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0
...,...,...,...,...,...,...,...,...,...
99995,99996,Customer_99996,33,Male,Houston,23,55.13,226,1
99996,99997,Customer_99997,62,Female,New York,19,61.65,351,0
99997,99998,Customer_99998,64,Male,Chicago,17,96.11,251,1
99998,99999,Customer_99999,51,Female,New York,20,49.25,434,1


In [4]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(100000, 9)

In [5]:
# Column labels of the loaded frame.
data.columns

Index(['CustomerID', 'Name', 'Age', 'Gender', 'Location',
       'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB',
       'Churn'],
      dtype='object')

In [6]:
# Per-column dtypes; the object-typed columns are non-numeric and cannot be fed to the model as-is.
data.dtypes

CustomerID                      int64
Name                           object
Age                             int64
Gender                         object
Location                       object
Subscription_Length_Months      int64
Monthly_Bill                  float64
Total_Usage_GB                  int64
Churn                           int64
dtype: object

In [8]:
# Number of missing values in each column.
data.isnull().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,CustomerID,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50000.5,44.02702,12.4901,65.053197,274.39365,0.49779
std,28867.657797,15.280283,6.926461,20.230696,130.463063,0.499998
min,1.0,18.0,1.0,30.0,50.0,0.0
25%,25000.75,31.0,6.0,47.54,161.0,0.0
50%,50000.5,44.0,12.0,65.01,274.0,0.0
75%,75000.25,57.0,19.0,82.64,387.0,1.0
max,100000.0,70.0,24.0,100.0,500.0,1.0


In [10]:
# Drop the identifier columns (CustomerID, Name) so they are not used as features.
# errors='ignore' keeps this cell idempotent: because `data` is rebound in place,
# re-running the cell would otherwise raise KeyError once the columns are gone.
data = data.drop(columns=['CustomerID', 'Name'], errors='ignore')

In [11]:
# Convert categorical variables to numerical using one-hot encoding.
# drop_first=True removes the first level of each feature, so k categories
# become k-1 indicator columns (the dropped level is the all-zeros baseline).
#
# Only encode the columns that are still present: `data` is rebound in place,
# so re-running this cell after a successful run would otherwise raise KeyError
# because 'Gender' and 'Location' have already been replaced by their dummies.
categorical_cols = [col for col in ('Gender', 'Location') if col in data.columns]
if categorical_cols:
    data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

In [12]:
# Separate the target (y = Churn) from the predictor columns (X = everything else)
y = data['Churn']
X = data.drop('Churn', axis=1)

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
# Fit a random forest classifier on the training split.
# The fixed seed makes the fitted forest reproducible across runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

In [15]:
# Example new customer, expressed directly in the one-hot encoded feature space.
# For each indicator column, 1 means "belongs to that category" and 0 means it does not.
# Every Location_* indicator is 0 here, i.e. this customer falls in the baseline
# location that drop_first=True removed during encoding (presumably Chicago, the
# only city in the data preview without its own column - confirm against X_train.columns).
new_customer_data = pd.DataFrame([
    {
        'Age': 45,
        'Gender_Male': 1,             # 1 = Male, 0 = baseline gender
        'Location_Los Angeles': 0,    # not Los Angeles
        'Location_Miami': 0,          # not Miami
        'Location_New York': 0,       # not New York
        'Location_Houston': 0,        # not Houston
        'Subscription_Length_Months': 6,
        'Monthly_Bill': 60.0,
        'Total_Usage_GB': 250,
    }
])

In [16]:
# Put the columns in the same order the model saw during training
# (raises KeyError if any training column is missing from the new data)
new_customer_data = new_customer_data.loc[:, X_train.columns]

In [17]:
# Predict the churn label for the new customer with the fitted random forest
new_customer_churn = rf_model.predict(X=new_customer_data)

In [18]:
# Show the predicted label array for the new customer
print(f"New customer churn prediction: {new_customer_churn}")

New customer churn prediction: [1]
