In [198]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [199]:
# Read the feature table and the label table from CSV files (relative paths).
x_train, y_train = (
    pd.read_csv(csv_name) for csv_name in ('X_train.csv', 'y_train.csv')
)

In [200]:
# Preview the first five rows of the raw feature table.
print("Dataset:", x_train.head(), sep="\n")

# Row count of the feature table, taken from the frame's length.
num_rows_x_train = len(x_train)
print(f"The number of rows in x_train is: {num_rows_x_train}")

Dataset:
   CustomerId    Surname  CreditScore Geography  Gender  Age  Tenure  \
0    15799217   Zetticci          791   Germany  Female   35       7   
1    15748986    Bischof          705   Germany    Male   42       8   
2    15722004     Hsiung          543    France  Female   31       4   
3    15780966  Pritchard          709    France  Female   32       2   
4    15636731      Ts'ai          714   Germany  Female   36       1   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0   52436.20              1          1               0        161051.75  
1  166685.92              2          1               1         55313.51  
2  138317.94              1          0               0         61843.73  
3       0.00              2          0               0        109681.29  
4  101609.01              2          1               1           447.73  
The number of rows in x_train is: 6499


In [201]:
# Remove the identifier columns, which are not used as model features.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError on the already-removed columns.
x_train = x_train.drop(columns=['Surname', 'CustomerId'], errors='ignore')

In [202]:
# Preview the first five rows after the identifier columns were removed.
print("Dataset:", x_train.head(), sep="\n")

Dataset:
   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          791   Germany  Female   35       7   52436.20              1   
1          705   Germany    Male   42       8  166685.92              2   
2          543    France  Female   31       4  138317.94              1   
3          709    France  Female   32       2       0.00              2   
4          714   Germany  Female   36       1  101609.01              2   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               0        161051.75  
1          1               1         55313.51  
2          0               0         61843.73  
3          0               0        109681.29  
4          1               1           447.73  


In [203]:
# Count the columns currently present in the feature table.
num_features = len(x_train.columns)
print(f'The dataset currently has {num_features} features.')

The dataset currently has 10 features.


In [204]:
# Treat the literal string '?' as a missing-value marker and turn it into NaN
# (the frame is modified in place).
x_train.replace(to_replace='?', value=np.nan, inplace=True)

# Row count of the feature table after the replacement.
num_rows_x_train = len(x_train)
print(f"The number of rows in x_train is: {num_rows_x_train}")

The number of rows in x_train is: 6499


In [205]:
# Per-column count of missing (NaN) entries in the feature table.
missing_values = x_train.isna().sum()
print(f'Missing values in each column:\n{missing_values}')

Missing values in each column:
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64


In [206]:
# Preview the first five rows after the missing-value check.
print(" Dataset:", x_train.head(), sep="\n")

 Dataset:
   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          791   Germany  Female   35       7   52436.20              1   
1          705   Germany    Male   42       8  166685.92              2   
2          543    France  Female   31       4  138317.94              1   
3          709    France  Female   32       2       0.00              2   
4          714   Germany  Female   36       1  101609.01              2   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               0        161051.75  
1          1               1         55313.51  
2          0               0         61843.73  
3          0               0        109681.29  
4          1               1           447.73  


In [207]:
# Normalise the Gender labels: trim surrounding whitespace, then lowercase.
# The membership test makes this cell a no-op when the column is absent.
if 'Gender' in x_train.columns:
    gender_stripped = x_train['Gender'].str.strip()
    x_train['Gender'] = gender_stripped.str.lower()

In [208]:
# Preview the first five rows after the Gender labels were normalised.
print(" Dataset:", x_train.head(), sep="\n")

# Row count of the feature table, taken from the frame's length.
num_rows_x_train = len(x_train)
print(f"The number of rows in x_train is: {num_rows_x_train}")

 Dataset:
   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          791   Germany  female   35       7   52436.20              1   
1          705   Germany    male   42       8  166685.92              2   
2          543    France  female   31       4  138317.94              1   
3          709    France  female   32       2       0.00              2   
4          714   Germany  female   36       1  101609.01              2   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               0        161051.75  
1          1               1         55313.51  
2          0               0         61843.73  
3          0               0        109681.29  
4          1               1           447.73  
The number of rows in x_train is: 6499


In [209]:
# List the columns that are still stored with the object dtype.
non_numeric_columns = x_train.select_dtypes(include='object').columns
print(f'Non-numeric columns:\n{non_numeric_columns}')

Non-numeric columns:
Index(['Geography', 'Gender'], dtype='object')


In [210]:
# Expand each categorical column into one indicator column per category.
# drop_first=False keeps an indicator for every category level.
# NOTE(review): the listed source columns are replaced by their indicators, so
# this cell cannot be re-run on the already-encoded frame.
categorical_cols = ['Gender', 'Geography']
x_train = pd.get_dummies(
    data=x_train,
    columns=categorical_cols,
    drop_first=False,
)

In [211]:
# Preview the first five rows after one-hot encoding.
print(" Dataset:", x_train.head(), sep="\n")

 Dataset:
   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          791   35       7   52436.20              1          1   
1          705   42       8  166685.92              2          1   
2          543   31       4  138317.94              1          0   
3          709   32       2       0.00              2          0   
4          714   36       1  101609.01              2          1   

   IsActiveMember  EstimatedSalary  Gender_female  Gender_male  \
0               0        161051.75           True        False   
1               1         55313.51          False         True   
2               0         61843.73           True        False   
3               0        109681.29           True        False   
4               1           447.73           True        False   

   Geography_France  Geography_Germany  Geography_Spain  
0             False               True            False  
1             False               True            False  
2         

In [212]:
# Show the dtype of every column after encoding.
# The header and the dtype listing are joined in a single f-string: passing
# them as separate print() arguments inserts the default ' ' separator right
# after the newline and pushes the first dtype row one column to the right.
print(f"Data types after processing:\n{x_train.dtypes}")

Data types after processing:
 CreditScore            int64
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Gender_female           bool
Gender_male             bool
Geography_France        bool
Geography_Germany       bool
Geography_Spain         bool
dtype: object


In [213]:
# Preview the first five rows of the cleansed feature table.
print("Cleansed Dataset:", x_train.head(), sep="\n")

Cleansed Dataset:
   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          791   35       7   52436.20              1          1   
1          705   42       8  166685.92              2          1   
2          543   31       4  138317.94              1          0   
3          709   32       2       0.00              2          0   
4          714   36       1  101609.01              2          1   

   IsActiveMember  EstimatedSalary  Gender_female  Gender_male  \
0               0        161051.75           True        False   
1               1         55313.51          False         True   
2               0         61843.73           True        False   
3               0        109681.29           True        False   
4               1           447.73           True        False   

   Geography_France  Geography_Germany  Geography_Spain  
0             False               True            False  
1             False               True            False  
2 

In [216]:
# Per-column count of missing (NaN) entries in the label table.
missing_values_y_train = y_train.isna().sum()
print("Missing values in y_train:", missing_values_y_train, sep="\n")

# Row count of the label table, taken from its length.
num_rows_y_train = len(y_train)
print(f"The number of rows in y_train is: {num_rows_y_train}")

Missing values in y_train after removing CustomerId:
Exited    0
dtype: int64
The number of rows in y_train after removing CustomerId is: 6499


In [227]:
# Combine the cleansed features with the labels and export the result as
# CSV and JSON.

# Cleansed X_train dataset
print("Cleansed Dataset:")
print(x_train.head())  # This is your cleansed X_train

# Drop the 'CustomerId' column from y_train. errors='ignore' keeps the cell
# re-runnable when the identifier is not (or no longer) present.
y_train_cleaned = y_train.drop(columns=['CustomerId'], errors='ignore')

# Print y_train_cleaned (first 5 samples)
print("y_train after removing CustomerId (first 5 samples):")
print(y_train_cleaned.head())

# pd.concat(axis=1) aligns rows on the index and fills NaN wherever one side has
# no matching row. Fail loudly instead of exporting rows with missing labels,
# e.g. when y_train has already been rebound to a subset by a later cell.
assert x_train.index.equals(y_train_cleaned.index), (
    "x_train and y_train are not row-aligned; reload the data and re-run the "
    "notebook from the top before exporting."
)

# Combine cleansed X_train with y_train_cleaned along the columns
combined_data = pd.concat([x_train, y_train_cleaned], axis=1)

# Print the combined dataset
print("Combined Dataset:")
print(combined_data.head())

# Export the combined DataFrame to a CSV file
csv_path = 'combined_dataset.csv'
combined_data.to_csv(csv_path, index=False)

print(f"The combined dataset has been exported to '{csv_path}'.")

# Convert the DataFrame to a JSON string (one object per row)
json_data = combined_data.to_json(orient='records')

# Write the JSON string to a file. The path lives in one variable so the
# confirmation message below always names the file that was actually written.
json_path = 'output_file.json'
with open(json_path, 'w') as json_file:
    json_file.write(json_data)

print(f"CSV has been converted to JSON and saved as '{json_path}'")


Cleansed Dataset:
   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          791   35       7   52436.20              1          1   
1          705   42       8  166685.92              2          1   
2          543   31       4  138317.94              1          0   
3          709   32       2       0.00              2          0   
4          714   36       1  101609.01              2          1   

   IsActiveMember  EstimatedSalary  Gender_female  Gender_male  \
0               0        161051.75           True        False   
1               1         55313.51          False         True   
2               0         61843.73           True        False   
3               0        109681.29           True        False   
4               1           447.73           True        False   

   Geography_France  Geography_Germany  Geography_Spain  
0             False               True            False  
1             False               True            False  
2 

In [221]:
# Hold out 20% of the rows as a validation set; random_state fixes the shuffle.
# NOTE(review): this rebinds y_train from the label table to the training-split
# labels, so the cell cannot be re-run without reloading y_train first.
exited_labels = y_train['Exited']
X_train, X_val, y_train, y_val = train_test_split(
    x_train,
    exited_labels,
    test_size=0.2,
    random_state=42,
)

In [222]:
# Model Selection: a random forest with 10 trees and a fixed seed.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=10,
)

In [223]:
# Model Training: fit the forest on the training split.
model.fit(X=X_train, y=y_train)

In [224]:
# Predict a class label for every row of the validation split.
y_val_pred = model.predict(X=X_val)


In [225]:
# Show the first five training labels. .iloc makes the positional slice
# explicit (the index holds shuffled row labels, not positions).
print("y_train (first 5 samples after removing CustomerId):")
print(y_train.iloc[:5])

# y_val_pred already holds the validation predictions computed by the
# preceding prediction cell; calling model.predict(X_val) again here only
# duplicated that work, so the existing array is inspected directly.
print(f"Shape of y_val_pred: {y_val_pred.shape}")


y_train (first 5 samples after removing CustomerId):
2851    0
4918    0
5690    0
4262    0
4975    0
Name: Exited, dtype: int64
Shape of y_val_pred: (1300,)


In [226]:
# Evaluate the model's performance on validation data
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

# The report is embedded in an f-string rather than passed as a second print()
# argument: the default ' ' separator would otherwise land after the newline
# and shift the report's header row one column out of alignment.
print(f"Validation Classification Report:\n{classification_report(y_val, y_val_pred)}")

Validation Accuracy: 0.8538461538461538
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.97      0.91      1022
           1       0.78      0.44      0.56       278

    accuracy                           0.85      1300
   macro avg       0.82      0.70      0.74      1300
weighted avg       0.85      0.85      0.84      1300

