In [72]:
import pandas as pd
from sklearn.impute import KNNImputer
df = pd.read_csv('multi_source_dataset.csv')
# Display missing values before imputation
print("Missing values before imputation:\n", df.isnull().sum())
# Apply KNN imputation
imputer = KNNImputer(n_neighbors=5) # You can change the number of neighbors imputed_data = imputer.fit_transform(df.select_dtypes(include=['number']))

# Convert imputed data back to DataFrame
df_imputed=pd.DataFrame( columns=df.select_dtypes(include=['number'
]).columns)
# If there are categorical columns, add them back
 
df_imputed[df.select_dtypes(exclude=['number']).columns] = df[df.select_dtypes(exclude=['number']).columns]
# Display missing values after imputation
print("\nMissing values after imputation:\n", df_imputed.isnull().sum())



Missing values before imputation:
 Customer_ID      0
Name             0
Email            0
Phone_Number     0
Address          0
Date_of_Birth    0
Gender           0
Loyalty_Score    0
Source_System    0
dtype: int64

Missing values after imputation:
 Loyalty_Score    1000
Customer_ID         0
Name                0
Email               0
Phone_Number        0
Address             0
Date_of_Birth       0
Gender              0
Source_System       0
dtype: int64


In [73]:
import pandas as pd
from sklearn.ensemble import IsolationForest
# Apply Isolation Forest for outlier detection
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df['outlier'] = iso_forest.fit_predict(df.select_dtypes(include=['number']))
# Display outlier counts
print("Outlier detection results:\n", df['outlier'].value_counts())


Outlier detection results:
 outlier
 1    958
-1     42
Name: count, dtype: int64


In [45]:
%pip install imbalanced_learn

In [54]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [55]:
# Define loyalty score categories
def categorize_loyalty(score):
    if score <= 30:
        return 0  # Low
    elif score <= 60:
        return 1  # Medium
    else:
        return 2  # High

# Apply categorization
df["Loyalty_Class"] = df["Loyalty_Score"].apply(categorize_loyalty)

# Drop unnecessary columns (non-numeric identifiers)
columns_to_drop = ["Customer_ID", "Name", "Email", "Phone_Number", "Address", "Date_of_Birth", "Loyalty_Score", "Loyalty_Class"]
X = df.drop(columns=columns_to_drop, errors="ignore")

# Convert categorical features to numerical
X = pd.get_dummies(X, drop_first=True)

# Ensure boolean columns are converted to integers
X = X.astype(np.float32)  # Convert all values to float32 to avoid type issues

y = df["Loyalty_Class"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [74]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Check class distribution after SMOTE
print("Before SMOTE:")
print(y_train.value_counts())

print("\nAfter SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

Before SMOTE:
Loyalty_Class
2    331
1    240
0    229
Name: count, dtype: int64

After SMOTE:
Loyalty_Class
1    331
2    331
0    331
Name: count, dtype: int64


In [75]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
# Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_resampled, y_train_resampled)
# Predictions
y_pred = dt_model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy:.4f}\n")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Decision Tree Accuracy: 0.3200

Confusion Matrix:
[[21 22 14]
 [24 28  8]
 [23 45 15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.37      0.34        57
           1       0.29      0.47      0.36        60
           2       0.41      0.18      0.25        83

    accuracy                           0.32       200
   macro avg       0.34      0.34      0.32       200
weighted avg       0.34      0.32      0.31       200



Decision Tree Accuracy: 0.3200

Confusion Matrix:
[[21 22 14]
 [24 28  8]
 [23 45 15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.37      0.34        57
           1       0.29      0.47      0.36        60
           2       0.41      0.18      0.25        83

    accuracy                           0.32       200
   macro avg       0.34      0.34      0.32       200
weighted avg       0.34      0.32      0.31       200



In [65]:

from sklearn.ensemble import RandomForestClassifier
# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Random Forest Accuracy: {accuracy:.4f}\n")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Random Forest Accuracy: 0.3200

Confusion Matrix:
[[28 15 14]
 [31 21  8]
 [33 35 15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.49      0.38        57
           1       0.30      0.35      0.32        60
           2       0.41      0.18      0.25        83

    accuracy                           0.32       200
   macro avg       0.34      0.34      0.32       200
weighted avg       0.34      0.32      0.31       200

