<a href="https://colab.research.google.com/github/usman3721/Hamoye-Internship/blob/main/hamoye_Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
df=pd.read_csv('/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [8]:
# Convert 'TotalCharges' to numeric, filling missing values with 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

# Convert 'Churn' to binary values
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

# Split the data into an 80-20 train-test split
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Define categorical and numerical features
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']


In [9]:
# Scaling numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_features])
X_test_scaled = scaler.transform(X_test[numerical_features])

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=numerical_features)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=numerical_features)

# One-hot encoding categorical features
encoder = OneHotEncoder(sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(categorical_features))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(categorical_features))

# Combine scaled numerical and one-hot encoded categorical features
X_train_final = pd.concat([X_train_scaled_df.reset_index(drop=True), X_train_encoded_df.reset_index(drop=True)], axis=1)
X_test_final = pd.concat([X_test_scaled_df.reset_index(drop=True), X_test_encoded_df.reset_index(drop=True)], axis=1)


In [10]:
# Train a RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=1)
rf_clf.fit(X_train_final, y_train)
rf_pred = rf_clf.predict(X_test_final)

# Train an ExtraTreesClassifier
et_clf = ExtraTreesClassifier(random_state=1)
et_clf.fit(X_train_final, y_train)
et_pred = et_clf.predict(X_test_final)

# Train an XGBoost Classifier
xgb_clf = xgb.XGBClassifier(random_state=1)
xgb_clf.fit(X_train_final, y_train)
xgb_pred = xgb_clf.predict(X_test_final)

# Train a LightGBM Classifier
lgb_clf = lgb.LGBMClassifier(random_state=1)
lgb_clf.fit(X_train_final, y_train)
lgb_pred = lgb_clf.predict(X_test_final)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [11]:
def evaluate_model(model_name, y_true, y_pred):
    print(f"Evaluation for {model_name}")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

evaluate_model("Random Forest", y_test, rf_pred)
evaluate_model("Extra Trees", y_test, et_pred)
evaluate_model("XGBoost", y_test, xgb_pred)
evaluate_model("LightGBM", y_test, lgb_pred)


Evaluation for Random Forest
Accuracy: 0.7913
Confusion Matrix:
[[929 132]
 [162 186]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1061
           1       0.58      0.53      0.56       348

    accuracy                           0.79      1409
   macro avg       0.72      0.71      0.71      1409
weighted avg       0.79      0.79      0.79      1409

Evaluation for Extra Trees
Accuracy: 0.7672
Confusion Matrix:
[[916 145]
 [183 165]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1061
           1       0.53      0.47      0.50       348

    accuracy                           0.77      1409
   macro avg       0.68      0.67      0.67      1409
weighted avg       0.76      0.77      0.76      1409

Evaluation for XGBoost
Accuracy: 0.7935
Confusion Matrix:
[[924 137]
 [154 194]]
Classification Report:
              precision 

In [12]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}


In [13]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

# Initialize the ExtraTreesClassifier
extra_trees = ExtraTreesClassifier(random_state=1)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=extra_trees,
    param_distributions=hyperparameter_grid,
    cv=5,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=1
)


In [14]:
# Assuming X_train and y_train are your training data and labels
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 345, in fit
    X, y = self._validate_data(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: '4056-QHXHZ'

--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 345, in fit
    X, y = self._validate_data(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: '5442-PPTJY'


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector

# Load the dataset
# df = pd.read_csv('telco_customer_churn.csv')

# Preprocessing
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

# Split data into features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Preprocessing pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, selector(dtype_include='number')),
        ('cat', categorical_transformer, selector(dtype_include='object'))
    ])

# Train the initial ExtraTreesClassifier model with default parameters
initial_model = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', ExtraTreesClassifier(random_state=1))])
initial_model.fit(X_train, y_train)

# Evaluate the initial model
initial_predictions = initial_model.predict(X_test)
initial_accuracy = accuracy_score(y_test, initial_predictions)
print(f"Initial Model Accuracy: {initial_accuracy:.4f}")

# Hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'classifier__n_estimators': n_estimators,
    'classifier__min_samples_leaf': min_samples_leaf,
    'classifier__min_samples_split': min_samples_split,
    'classifier__max_features': max_features
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=initial_model,
    param_distributions=hyperparameter_grid,
    cv=5,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Retrieve the best hyperparameters
best_hyperparameters = random_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

# Train a new ExtraTreesClassifier with the best hyperparameters
optimized_model = random_search.best_estimator_

# Evaluate the optimized model
optimized_predictions = optimized_model.predict(X_test)
optimized_accuracy = accuracy_score(y_test, optimized_predictions)
print(f"Optimized Model Accuracy: {optimized_accuracy:.4f}")

# Compare the accuracy of both models
if optimized_accuracy > initial_accuracy:
    print("The optimized model has higher accuracy than the initial model.")
else:
    print("The initial model has higher accuracy than the optimized model.")


ValueError: Input y contains NaN.