In [2]:
# Part B, Task 1, a, Clean the dataset by handling missing values and removing outliers as needed.

import pandas as pd

# Load the dataset
file_path = 'patient.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
df.head()


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [6]:
# Check for missing values in the dataset
missing_values = df.isnull().sum()

# Display the missing values count for each column
missing_values


ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [5]:
df.describe(include='all')

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
count,8068.0,8068,7928,8068.0,7990,7944,7239.0,8068,7733.0,7992,8068
unique,,2,2,,2,9,,3,,7,4
top,,Male,Yes,,Yes,Artist,,Low,,Cat_6,D
freq,,4417,4643,,4968,2516,,4878,,5238,2268
mean,463479.214551,,,43.466906,,,2.641663,,2.850123,,
std,2595.381232,,,16.711696,,,3.406763,,1.531413,,
min,458982.0,,,18.0,,,0.0,,1.0,,
25%,461240.75,,,30.0,,,0.0,,2.0,,
50%,463472.5,,,40.0,,,1.0,,3.0,,
75%,465744.25,,,53.0,,,4.0,,4.0,,


In [7]:
# Cleaning the dataset

# Handle these missing values

# Fill missing values for categorical columns with the mode
df['Ever_Married'].fillna(df['Ever_Married'].mode()[0], inplace=True)
df['Graduated'].fillna(df['Graduated'].mode()[0], inplace=True)
df['Profession'].fillna(df['Profession'].mode()[0], inplace=True)
df['Var_1'].fillna(df['Var_1'].mode()[0], inplace=True)

# Fill missing values for numerical columns with the median
df['Work_Experience'].fillna(df['Work_Experience'].median(), inplace=True)
df['Family_Size'].fillna(df['Family_Size'].median(), inplace=True)

# Verify if all missing values have been handled
df.isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Ever_Married'].fillna(df['Ever_Married'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Graduated'].fillna(df['Graduated'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
Segmentation       0
dtype: int64

In [8]:
# Handle outliers

# Define a function to identify outliers using IQR
def find_outliers_iqr(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data < lower_bound) | (data > upper_bound)]

# Check for outliers in "Age", "Work_Experience", and "Family_Size"
outliers_age = find_outliers_iqr(df['Age'])
outliers_work_experience = find_outliers_iqr(df['Work_Experience'])
outliers_family_size = find_outliers_iqr(df['Family_Size'])

# Count the number of outliers in each column
outliers_counts = {
    'Age': len(outliers_age),
    'Work_Experience': len(outliers_work_experience),
    'Family_Size': len(outliers_family_size)
}

outliers_counts

{'Age': 71, 'Work_Experience': 189, 'Family_Size': 94}

In [9]:
# Remove outliers
# Define a function to remove outliers based on IQR
def remove_outliers_iqr(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data >= lower_bound) & (data <= upper_bound)]

# Remove outliers from "Age", "Work_Experience", and "Family_Size"
df['Age'] = remove_outliers_iqr(df['Age'])
df['Work_Experience'] = remove_outliers_iqr(df['Work_Experience'])
df['Family_Size'] = remove_outliers_iqr(df['Family_Size'])

# Drop rows with missing values caused by the removal of outliers
df.dropna(inplace=True)

# Display the shape of the dataset after removing outliers
df.shape


(7720, 11)

In [10]:
# Part B, Task 1, b, b.	Perform feature scaling or normalization

from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Select numerical columns for scaling
numerical_cols = ['Age', 'Work_Experience', 'Family_Size']

# Apply standardization (z-score scaling) to the numerical columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Display the first few rows of the scaled data
df.head()


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,-1.308281,No,Healthcare,-0.427585,Low,0.86535,Cat_4,D
1,462643,Female,Yes,-0.326991,Yes,Engineer,-0.427585,Average,0.139918,Cat_4,A
2,466315,Female,Yes,1.451597,Yes,Engineer,-0.427585,Low,-1.310945,Cat_6,B
3,461735,Male,Yes,1.451597,Yes,Lawyer,-0.770685,High,-0.585513,Cat_6,B
4,462669,Female,Yes,-0.204329,Yes,Entertainment,-0.427585,High,2.316213,Cat_6,A


In [11]:
# Part B, Task 1, c, c.	Encode categorical variables appropriately. (1 mark)

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder for the target variable 'Segmentation'
label_encoder = LabelEncoder()

# Apply label encoding to the 'Segmentation' column
df['Segmentation'] = label_encoder.fit_transform(df['Segmentation'])

# Apply one-hot encoding to the remaining categorical columns
df_encoded = pd.get_dummies(df, columns=['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1'], drop_first=True)

# Display the first few rows of the encoded dataset
df_encoded.head()


Unnamed: 0,ID,Age,Work_Experience,Family_Size,Segmentation,Gender_Male,Ever_Married_Yes,Graduated_Yes,Profession_Doctor,Profession_Engineer,...,Profession_Lawyer,Profession_Marketing,Spending_Score_High,Spending_Score_Low,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7
0,462809,-1.308281,-0.427585,0.86535,3,True,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
1,462643,-0.326991,-0.427585,0.139918,0,False,True,True,False,True,...,False,False,False,False,False,False,True,False,False,False
2,466315,1.451597,-0.427585,-1.310945,1,False,True,True,False,True,...,False,False,False,True,False,False,False,False,True,False
3,461735,1.451597,-0.770685,-0.585513,1,True,True,True,False,False,...,True,False,True,False,False,False,False,False,True,False
4,462669,-0.204329,-0.427585,2.316213,0,False,True,True,False,False,...,False,False,True,False,False,False,False,False,True,False


In [12]:
# Part B, Task 1, d, d.	Split the dataset into training and testing sets. 

from sklearn.model_selection import train_test_split

# Define the features (X) and the target (y)
X = df_encoded.drop(columns=['ID', 'Segmentation'])
y = df_encoded['Segmentation']

# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the training and testing sets
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((6176, 22), (1544, 22), (6176,), (1544,))

In [13]:
# Part B, Task 2, a, Select an appropriate classification algorithm (e.g., Logistic Regression, Random Forest, Support Vector Machine) to predict the target categorical variable. Justify your choice. 

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameters grid for tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Perform grid search with cross-validation for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

best_params


Fitting 5 folds for each of 24 candidates, totalling 120 fits


{'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}

In [14]:
# Part B, Task 2, b, Implement hyperparameter tuning by conducting a grid search or random search to optimize model parameters. Clearly outline the hyperparameters you tuned and the rationale behind them. 

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameters grid for tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Perform grid search with cross-validation for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

print("Best Parameters:", best_params)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [15]:
# Part B, Task 2, c, Build the classification model using the training data. Explain the process and provide code snippets.

# Step 1
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [16]:
# Step 2

# Initialize the Random Forest model with the best hyperparameters
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=2, min_samples_leaf=1, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)


In [17]:
# Step 3

# Predict on the test data
y_pred = rf_model.predict(X_test)


In [18]:
# Step 4

# Evaluate the model using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Generate a classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 51.36%
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.47      0.46       354
           1       0.39      0.30      0.34       371
           2       0.53      0.56      0.54       390
           3       0.63      0.69      0.66       429

    accuracy                           0.51      1544
   macro avg       0.50      0.51      0.50      1544
weighted avg       0.50      0.51      0.51      1544

Confusion Matrix:
[[167  66  46  75]
 [ 79 112 129  51]
 [ 45  80 219  46]
 [ 82  30  22 295]]


In [19]:
# Part B, Task 3, c, mplement k-fold cross-validation (e.g., 5-fold or 10-fold) to assess the model's generalization performance. 

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Initialize the Random Forest classifier with the best hyperparameters
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=2, min_samples_leaf=1, random_state=42)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')

# Print the cross-validation scores for each fold and the mean accuracy
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean() * 100:.2f}%")
print(f"Standard Deviation: {cv_scores.std() * 100:.2f}%")


Cross-validation scores: [0.49481865 0.49287565 0.49158031 0.49028497 0.49158031]
Mean Accuracy: 49.22%
Standard Deviation: 0.15%
