In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

In [2]:
df = pd.read_csv('/home/umair123/unified mentor/Customer Satisfaction Prediction/customer_support_tickets.csv')

In [3]:
len(df)

8469

In [7]:
df= df.dropna()

In [8]:
len(df)

2769

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2769 entries, 2 to 8467
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     2769 non-null   int64  
 1   Customer Name                 2769 non-null   object 
 2   Customer Email                2769 non-null   object 
 3   Customer Age                  2769 non-null   int64  
 4   Customer Gender               2769 non-null   object 
 5   Product Purchased             2769 non-null   object 
 6   Date of Purchase              2769 non-null   object 
 7   Ticket Type                   2769 non-null   object 
 8   Ticket Subject                2769 non-null   object 
 9   Ticket Description            2769 non-null   object 
 10  Ticket Status                 2769 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               2769 non-null   object 
 13  Ticket C

In [10]:
label_encoder = LabelEncoder()
df['Customer Gender'] = label_encoder.fit_transform(df['Customer Gender'])
df['Ticket Type'] = label_encoder.fit_transform(df['Ticket Type'])

df['Ticket Status'] = label_encoder.fit_transform(df['Ticket Status'])
df['Ticket Priority'] = label_encoder.fit_transform(df['Ticket Priority'])
df['Ticket Channel'] = label_encoder.fit_transform(df['Ticket Channel'])

df['Ticket Subject'] = label_encoder.fit_transform(df['Ticket Subject'])


X = df[['Customer Gender', 'Ticket Type', 'Ticket Status', 'Ticket Priority', 'Ticket Channel', 'Ticket Subject',
'Customer Age']]
y = df['Customer Satisfaction Rating']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h3>Random Forest Regression</h3>

In [12]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 2.2433003732952264
R-squared: -0.1372706311523728


<h3>XG BOOST Regression</h3>

In [17]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define XGBoost Regressor
regressor = xgb.XGBRegressor(eval_metric='rmse')  # Use RMSE instead of RMSLE for real-valued targets

# Set up a better search grid
param_grid = {
    "max_depth": [4, 5, 6],  # Reducing depth to prevent overfitting
    "n_estimators": [200, 300, 400],  # Lower values for faster training
    "learning_rate": [0.03, 0.05],  # Adjusted to improve learning
    "subsample": [0.8],  
    "colsample_bytree": [0.8],  
    "reg_lambda": [1]  
}

# GridSearchCV with reduced complexity
search = GridSearchCV(regressor, param_grid, cv=3, scoring="neg_mean_squared_error", n_jobs=-1)
search.fit(X_train_scaled, y_train)

# Predictions
y_pred = search.predict(X_test_scaled)

# Metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 2.0491852496962584
R-squared: -0.0388614159801135


<h3>Support vector Regression</h3>


In [18]:
from sklearn.svm import SVR

support_vector = SVR(C=1 , epsilon=0.2)

support_vector.fit(X_train,y_train)

In [19]:
y_pred = support_vector.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.9816507434864366
R-squared: -0.004623909752185851


<h3>Linear Regression</h3>


In [20]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

In [23]:
y_pred = reg.predict(X_test)
mse= mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.971833489960776
R-squared: 0.00035307605239931483


<h3>Ridge Regression</h3>

In [24]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mse= mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.9718324171005033
R-squared: 0.00035361995302629623
