In [1]:
import pandas as pd

In [2]:
# Load the provided CSV file to examine the data structure and contents
file_path = '66c0115acc881_epic6_cx_data_analytics_case_2.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset and the first few rows to understand its structure
data_info = data.info()
data_head = data.head()

data_info, data_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    999999 non-null  int64  
 1   id_new        999999 non-null  int64  
 2   MOB_DIFF      999999 non-null  float64
 3   state         999999 non-null  int64  
 4   hour          999999 non-null  int64  
 5   Enqgrp2       981078 non-null  object 
 6   DAY_DIFF      999999 non-null  float64
 7   age_band_new  999999 non-null  int64  
 8   location      999999 non-null  int64  
 9   cbs_band      999999 non-null  int64  
 10  Chan          999999 non-null  int64  
 11  gendr         999999 non-null  int64  
 12  portf         999999 non-null  int64  
dtypes: float64(2), int64(10), object(1)
memory usage: 99.2+ MB


(None,
    Unnamed: 0  id_new  MOB_DIFF  state  hour    Enqgrp2  DAY_DIFF  \
 0      571359  107173      14.0      7    10      Login      27.0   
 1      136632   26718      40.0      4     3        NOC      71.0   
 2     1170439  210826       3.0      7     5  Alternate       3.0   
 3      205833   39780      33.0      0    12   NLP_Fail     103.0   
 4     1326715  236862       3.0      4    11    Enquiry      52.0   
 
    age_band_new  location  cbs_band  Chan  gendr  portf  
 0             3         2         0     1      0      9  
 1             2         2         3     1      1      0  
 2             0         3         3     1      0      9  
 3             4         3         2     1      0      4  
 4             3         2         2     2      1     18  )

In [3]:
# Data Preprocessing

# Drop any unnecessary columns, such as unnamed index columns
data_cleaned = data.drop(columns=['Unnamed: 0'])

# Check for missing values and handle them (e.g., dropping or filling)
missing_values = data_cleaned.isnull().sum()

# Drop rows with missing values in 'Enqgrp2' since this column has a reasonable amount of missing data
data_cleaned = data_cleaned.dropna(subset=['Enqgrp2'])

# Encode the 'Enqgrp2' column as it is categorical
data_cleaned['Enqgrp2'] = data_cleaned['Enqgrp2'].astype('category').cat.codes

# Display the cleaned data information and check for missing values again
cleaned_info = data_cleaned.info()
cleaned_head = data_cleaned.head()
remaining_missing_values = data_cleaned.isnull().sum()

cleaned_info, cleaned_head, remaining_missing_values


<class 'pandas.core.frame.DataFrame'>
Index: 981078 entries, 0 to 999998
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id_new        981078 non-null  int64  
 1   MOB_DIFF      981078 non-null  float64
 2   state         981078 non-null  int64  
 3   hour          981078 non-null  int64  
 4   Enqgrp2       981078 non-null  int8   
 5   DAY_DIFF      981078 non-null  float64
 6   age_band_new  981078 non-null  int64  
 7   location      981078 non-null  int64  
 8   cbs_band      981078 non-null  int64  
 9   Chan          981078 non-null  int64  
 10  gendr         981078 non-null  int64  
 11  portf         981078 non-null  int64  
dtypes: float64(2), int64(9), int8(1)
memory usage: 90.8 MB


(None,
    id_new  MOB_DIFF  state  hour  Enqgrp2  DAY_DIFF  age_band_new  location  \
 0  107173      14.0      7    10       32      27.0             3         2   
 1   26718      40.0      4     3       50      71.0             2         2   
 2  210826       3.0      7     5        1       3.0             0         3   
 3   39780      33.0      0    12       49     103.0             4         3   
 4  236862       3.0      4    11       18      52.0             3         2   
 
    cbs_band  Chan  gendr  portf  
 0         0     1      0      9  
 1         3     1      1      0  
 2         3     1      0      9  
 3         2     1      0      4  
 4         2     2      1     18  ,
 id_new          0
 MOB_DIFF        0
 state           0
 hour            0
 Enqgrp2         0
 DAY_DIFF        0
 age_band_new    0
 location        0
 cbs_band        0
 Chan            0
 gendr           0
 portf           0
 dtype: int64)

In [4]:
# feature selection and data splitting.
from sklearn.model_selection import train_test_split

# Select features and target variable
X = data_cleaned.drop(columns=['Chan'])  # Features (exclude the target variable 'Chan')
y = data_cleaned['Chan']  # Target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets to confirm the split
train_shape = X_train.shape, y_train.shape
test_shape = X_test.shape, y_test.shape

train_shape, test_shape


(((784862, 11), (784862,)), ((196216, 11), (196216,)))

In [5]:
# Train a RandomForest classifier and evaluate its performance
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Predict on the test data
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Phone', 'WhatsApp', 'Chatbot'])

accuracy, report


(0.9036877726587027,
 '              precision    recall  f1-score   support\n\n       Phone       0.94      0.58      0.72     27098\n    WhatsApp       0.90      0.96      0.93    122505\n     Chatbot       0.90      0.94      0.92     46613\n\n    accuracy                           0.90    196216\n   macro avg       0.91      0.83      0.86    196216\nweighted avg       0.91      0.90      0.90    196216\n')

In [8]:
#1. Feature Engineering
# Example: Creating a new feature that combines hour and inquiry type
X_train['hour_enqgrp_interaction'] = X_train['hour'] * X_train['Enqgrp2']
X_test['hour_enqgrp_interaction'] = X_test['hour'] * X_test['Enqgrp2']

# Dropping the original 'hour' and 'Enqgrp2' if necessary
# X_train = X_train.drop(columns=['hour', 'Enqgrp2'])
# X_test = X_test.drop(columns=['hour', 'Enqgrp2'])


In [9]:
#2. Hyperparameter Tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters found by GridSearchCV
best_params = grid_search.best_params_
best_params


Fitting 3 folds for each of 216 candidates, totalling 648 fits


KeyboardInterrupt: 

In [None]:
#3.Cross-Validation
from sklearn.model_selection import cross_val_score

# Use the best estimator from the grid search
best_rf_model = grid_search.best_estimator_

# Perform cross-validation
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')

# Mean cross-validated accuracy
cv_mean = cv_scores.mean()
cv_mean


In [None]:
#4.Final Model Training with Best Parameters

# Train the final model with the best parameters on the full training data
final_rf_model = RandomForestClassifier(**best_params, random_state=42)
final_rf_model.fit(X_train, y_train)

# Predict on the test data
final_y_pred = final_rf_model.predict(X_test)

# Evaluate final performance
final_accuracy = accuracy_score(y_test, final_y_pred)
final_report = classification_report(y_test, final_y_pred, target_names=['Phone', 'WhatsApp', 'Chatbot'])

final_accuracy, final_report


In [None]:
#5.Confusion Matrix

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, final_y_pred)

# Plot the confusion matrix
plt.figure(figsize=(8,6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Phone', 'WhatsApp', 'Chatbot'], yticklabels=['Phone', 'WhatsApp', 'Chatbot'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


In [None]:
#6.Customer Satisfaction Metrics
# Assuming 'response_time' and 'resolution_time' columns are present or derived
response_time_mean = data_cleaned['response_time'].mean()
resolution_rate = (data_cleaned['resolved'] == 1).mean()  # Assuming a binary 'resolved' column

# Print metrics
print(f"Average Response Time: {response_time_mean:.2f} minutes")
print(f"Resolution Rate: {resolution_rate:.2%}")


In [None]:
#7. Business Opportunity Identification
# Example: Customers likely to need additional services based on model prediction
high_prob_customers = X_test[final_rf_model.predict_proba(X_test)[:, 1] > 0.7]

# Analyze this segment for potential business opportunities
# e.g., check their portfolio or inquiry types to suggest new products
