In [129]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE

In [130]:
# Load the dataset
file_path = 'C:\\Users\\91964\\Desktop\\csp\\customer_support_tickets.csv'
data = pd.read_csv(file_path)
print(data.columns)
print(data.head())

Index(['Ticket ID', 'Customer Name', 'Customer Email', 'Customer Age',
       'Customer Gender', 'Product Purchased', 'Date of Purchase',
       'Ticket Type', 'Ticket Subject', 'Ticket Description', 'Ticket Status',
       'Resolution', 'Ticket Priority', 'Ticket Channel',
       'First Response Time', 'Time to Resolution',
       'Customer Satisfaction Rating'],
      dtype='object')
   Ticket ID        Customer Name              Customer Email  Customer Age  \
0          1        Marisa Obrien  carrollallison@example.com            32   
1          2         Jessica Rios    clarkeashley@example.com            42   
2          3  Christopher Robbins   gonzalestracy@example.com            48   
3          4     Christina Dillon    bradleyolson@example.org            27   
4          5    Alexander Carroll     bradleymark@example.com            67   

  Customer Gender Product Purchased Date of Purchase      Ticket Type  \
0           Other        GoPro Hero       22-03-2021  Technical

In [131]:
# Drop irrelevant columns
data_cleaned = data.drop(columns=['Ticket ID', 'Customer Name', 'Customer Email', 'Ticket Description'])

In [132]:
# Drop rows with missing target values
data_cleaned = data_cleaned.dropna(subset=['Customer Satisfaction Rating'])

In [133]:
# Convert target to binary (Satisfied = 1 if rating >= 3, else 0)
data_cleaned['Customer Satisfaction Rating'] = (data_cleaned['Customer Satisfaction Rating'] >= 3).astype(int)

In [134]:
# Encode categorical features
le = LabelEncoder()
categorical_cols = ['Customer Gender', 'Product Purchased', 'Ticket Type', 'Ticket Subject',
                    'Ticket Status', 'Resolution', 'Ticket Priority', 'Ticket Channel']
for col in categorical_cols:
    data_cleaned[col] = le.fit_transform(data_cleaned[col].astype(str))

In [135]:
# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
data_cleaned[['First Response Time', 'Time to Resolution']] = imputer.fit_transform(data_cleaned[['First Response Time', 'Time to Resolution']])

In [137]:
# Convert date/time to UNIX timestamp (seconds since epoch)
data_cleaned['Date of Purchase'] = pd.to_datetime(data_cleaned['Date of Purchase'], errors='coerce').astype('int64') // 10**9
data_cleaned['First Response Time'] = pd.to_datetime(data_cleaned['First Response Time'], errors='coerce').astype('int64') // 10**9
data_cleaned['Time to Resolution'] = pd.to_datetime(data_cleaned['Time to Resolution'], errors='coerce').astype('int64') // 10**9

  data_cleaned['Date of Purchase'] = pd.to_datetime(data_cleaned['Date of Purchase'], errors='coerce').astype('int64') // 10**9


In [138]:
# Split data into train and test sets
X = data_cleaned.drop('Customer Satisfaction Rating', axis=1)
y = data_cleaned['Customer Satisfaction Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [139]:
# Handle class imbalance
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [140]:
# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [141]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

In [142]:
# build model
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

1350 fits failed out of a total of 4050.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1044 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\91964\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\91964\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\91964\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\91964\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_val

In [144]:
# Best model from grid search
best_model = grid_search.best_estimator_
# Predict on test data
y_pred = best_model.predict(X_test)
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(report) 

Accuracy: 0.5433
              precision    recall  f1-score   support

           0       0.43      0.39      0.41       226
           1       0.61      0.65      0.63       328

    accuracy                           0.54       554
   macro avg       0.52      0.52      0.52       554
weighted avg       0.54      0.54      0.54       554

