In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Load train and test data
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

# Remove rows with any missing value from both sets.
train_data = train_data.dropna()
test_data = test_data.dropna()

# Capture the test IDs only AFTER incomplete rows are removed, so that `ids`
# stays row-for-row aligned with `test_data` (and therefore with any
# predictions produced from it).
ids = test_data["ID"]

# Drop columns that are not used as model features
dropped_columns = ['ID', 'Profession', 'Var_1']
train_data = train_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)

# Mapping for categorical columns
gender_mapping = {'Male': 0, 'Female': 1}
married_mapping = {'No': 0, 'Yes': 1}
graduated_mapping = {'No': 0, 'Yes': 1}

# Apply each mapping to the same column of both frames so train and test
# are always encoded identically.
binary_mappings = {
    'Gender': gender_mapping,
    'Ever_Married': married_mapping,
    'Graduated': graduated_mapping,
}
for column, mapping in binary_mappings.items():
    train_data[column] = train_data[column].map(mapping)
    test_data[column] = test_data[column].map(mapping)

# Ordinal encoding for Spending_Score (Low < Average < High).
# The encoder is fitted on the training data and reused for the test data;
# ravel() flattens the (n, 1) encoder output into a single column.
ordinal_encoder = OrdinalEncoder(categories=[['Low', 'Average', 'High']])
train_data["Spending_Score"] = ordinal_encoder.fit_transform(train_data[["Spending_Score"]]).ravel()
test_data["Spending_Score"] = ordinal_encoder.transform(test_data[["Spending_Score"]]).ravel()

# Separate the 'Segmentation' target from the training features
y_train = train_data["Segmentation"]
X_train = train_data.drop('Segmentation', axis=1)

# Standardize, then perform PCA for dimensionality reduction
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_scaled)

# Get original column names for plotting PCA loadings
columns = X_train.columns

# Loadings with standardization: one group of bars per feature, one bar per
# component. Bars are offset within each group so that no component is
# hidden behind another.
fig, ax = plt.subplots()
bar_height = 0.25
positions = np.arange(len(columns))
for k, component in enumerate(pca.components_):
    ax.barh(positions + k * bar_height, component, height=bar_height,
            label=f'Component {k + 1}')
# Put the feature name at the centre of its group of bars
ax.set_yticks(positions + bar_height * (len(pca.components_) - 1) / 2)
ax.set_yticklabels(columns)
ax.set_title('PCA Training Loadings with Standardization (3 Components)')
ax.legend()

fig.tight_layout()
plt.show()


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Fit a 4-cluster KMeans model on the PCA-projected training data.
# n_init is passed explicitly; random_state fixes the initialisation.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_train_pca)

# 3D figure: one point per training row, positioned by its three principal
# components and coloured by the cluster it was assigned to.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

pc1 = X_train_pca[:, 0]
pc2 = X_train_pca[:, 1]
pc3 = X_train_pca[:, 2]
scatter = ax.scatter(pc1, pc2, pc3, c=cluster_labels, cmap='viridis')

# Axis labels and title
for set_text, text in (
    (ax.set_xlabel, 'Principal Component 1'),
    (ax.set_ylabel, 'Principal Component 2'),
    (ax.set_zlabel, 'Principal Component 3'),
    (ax.set_title, 'KMeans Clustering on Training Data'),
):
    set_text(text)

# Colour bar keyed to the cluster label
cbar = plt.colorbar(scatter)
cbar.set_label('Cluster')

plt.show()


In [None]:
# Train one L1-regularised logistic-regression classifier per KMeans cluster.
# Each model is fitted on the original (unscaled) feature rows that fell into
# that cluster and is stored in `logistic_models` under the cluster label.
logistic_models = {}
for label in np.unique(cluster_labels):
    in_cluster = cluster_labels == label  # boolean row mask for this cluster
    cluster_data = X_train.loc[in_cluster]
    cluster_target = y_train.loc[in_cluster]

    # Short summary instead of dumping the whole frame
    print(f"\nCluster {label}: {len(cluster_data)} training rows")
    print(cluster_target.value_counts().sort_index())

    logistic_reg = LogisticRegression(penalty='l1', solver='liblinear')
    logistic_reg.fit(cluster_data, cluster_target)
    logistic_models[int(label)] = logistic_reg

# Project the test data with the scaler / PCA / KMeans fitted on training data
X_test_scaled = scaler.transform(test_data)
X_test_pca = pca.transform(X_test_scaled)
test_cluster_labels = kmeans.predict(X_test_pca)


test_columns = test_data.columns

# Loadings plot labelled with the test columns. The loadings are read from
# the already-fitted `pca`; nothing is re-fitted on the test data here.
# Bars are offset within each feature group so no component hides another.
fig, ax = plt.subplots()
bar_height = 0.25
positions = np.arange(len(test_columns))
for k, component in enumerate(pca.components_):
    ax.barh(positions + k * bar_height, component, height=bar_height,
            label=f'Component {k + 1}')
ax.set_yticks(positions + bar_height * (len(pca.components_) - 1) / 2)
ax.set_yticklabels(test_columns)
ax.set_title('PCA Testing Loadings with Standardization (3 Components)')
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# 3D view of the PCA-projected test rows, coloured by predicted cluster.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

test_pc1 = X_test_pca[:, 0]
test_pc2 = X_test_pca[:, 1]
test_pc3 = X_test_pca[:, 2]
scatter = ax.scatter(
    test_pc1,
    test_pc2,
    test_pc3,
    c=test_cluster_labels,  # cluster label drives the colour
    cmap='viridis',
)

# Title first, then the three axis labels
ax.set_title('KMeans Clustering on Testing Data')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')

# Colour bar keyed to the cluster label
cbar = plt.colorbar(scatter)
cbar.set_label('Cluster')

plt.show()

In [None]:
# Predict 'Segmentation' for test data using cluster-specific models.
# Rows are predicted one cluster at a time (one predict() call per cluster
# instead of one per row) and written back to their original positions.
predictions_by_row = np.empty(len(test_data), dtype=object)
for label in np.unique(test_cluster_labels):
    in_cluster = test_cluster_labels == label  # boolean row mask
    cluster_model = logistic_models[label]
    predictions_by_row[in_cluster] = cluster_model.predict(test_data.loc[in_cluster])
test_predictions = predictions_by_row.tolist()

# Select the IDs by the index of the rows that are actually in `test_data`,
# so each ID is paired with the prediction for the same row even though
# rows with missing values were dropped from `test_data`.
aligned_ids = ids.loc[test_data.index]

for id_val, pred in zip(aligned_ids, test_predictions):
    print(f"ID: {id_val}, Predicted Segmentation: {pred}")

In [None]:
# Code for non-PCA Approach

import pandas as pd
import numpy as np
import csv
import os
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Locate Train.csv: prefer the working directory (where the other cells read
# their CSV files from) and fall back to the original ~/Desktop location.
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = "Train.csv"
if not os.path.exists(file_path):
    file_path = os.path.join(desktop_path, "Train.csv")

# Read every field as a string; a missing value is therefore the empty
# string "". newline='' is the documented way to open a file for csv.reader.
with open(file_path, mode='r', newline='') as csv_file:
    matrix = list(csv.reader(csv_file))

# First row is the header, the rest are data rows
training_data_raw = pd.DataFrame(matrix[1:], columns=matrix[0])

training_data_raw = training_data_raw.drop(columns='ID')
print(training_data_raw.head())

# Number of empty cells divided by the number of rows. A row with several
# empty fields is counted once per empty field, so this is missing cells
# per row rather than the fraction of rows affected.
missing_cells = training_data_raw.eq("").sum().sum()
print("Proportion of Missing Values in Dataset:", missing_cells / (training_data_raw.shape[0]))
original_length = training_data_raw.shape[0]

# Keep only the rows in which no field is empty
has_empty_field = training_data_raw.eq("").any(axis=1)
training_data_raw = training_data_raw[~has_empty_field]
print("We have retained", training_data_raw.shape[0]/original_length, " of the original dataset after removing empty values")

# One-hot encode the categorical columns (including the Segmentation target)
X = pd.get_dummies(training_data_raw, columns=['Var_1', 'Spending_Score', 'Profession', 'Gender', 'Graduated', 'Ever_Married', 'Segmentation'])

# Most-frequent imputation over all columns.
# NOTE(review): rows with empty fields were already removed above, so this
# presumably has nothing left to fill - confirm whether it is still needed.
imputer = SimpleImputer(strategy='most_frequent')
columns = X.columns
X[columns] = imputer.fit_transform(X[columns])

# The CSV was read as text, so convert every column to a numeric dtype
X = X.apply(pd.to_numeric)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Restrict the training frame to artists and split it by gender. Only
# Age / Work_Experience / Family_Size are used as clustering features.
subset_train = X[["Age","Work_Experience", "Family_Size", "Gender_Male", "Profession_Artist", "Spending_Score_High", "Spending_Score_Average", "Spending_Score_Low"]]
subset_train = subset_train[subset_train["Profession_Artist"] == 1]
# .copy() so that adding the "labels" column below modifies an independent
# frame rather than a filtered slice of `subset_train`.
subset_train_M = subset_train[subset_train["Gender_Male"] == 1].copy()
subset_train_M_clusters = subset_train_M[["Age", "Work_Experience", "Family_Size"]]
subset_train_F = subset_train[subset_train["Gender_Male"] == 0]
subset_train_F_clusters = subset_train_F[["Age", "Work_Experience", "Family_Size"]]


# Elbow method: inertia for K = 1..7. A separate variable is used for these
# throw-away models so they cannot be confused with the final model below.
inertias = []
for i in range(1, 8):
    elbow_model = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    elbow_model.fit(subset_train_M_clusters)
    inertias.append(elbow_model.inertia_)

plt.plot(range(1, 8), inertias, marker='o')
plt.title('Training Set Male Artist - Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.show()

# Fit the final model FIRST and only then derive the labels from it, so that
# `labels` and `centroids_train_M` come from the same fitted KMeans instance
# (n_init matches the value used for the PCA-based KMeans).
optimal_num_clusters = 4
kmeans = KMeans(n_clusters=optimal_num_clusters, init='k-means++', n_init=10, random_state=42)
labels = kmeans.fit_predict(subset_train_M_clusters)
centroids_train_M = kmeans.cluster_centers_

# Cluster sizes, indexed by cluster label
num = np.arange(optimal_num_clusters)
totals_M = np.bincount(labels, minlength=optimal_num_clusters)

plt.figure(figsize=(10, 6))
plt.bar(num, totals_M)
plt.title('Training Set Male Artist - Number of Components in Each Cluster')
plt.xlabel('Cluster Label')
plt.ylabel('Number of Components')
plt.show()

# 3D plot of the four centroids, each annotated with its cluster index
x = centroids_train_M[:, 0]
y = centroids_train_M[:, 1]
z = centroids_train_M[:, 2]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c='r', marker='o', s=200, label='Centroids')
for i in range(len(centroids_train_M)):
    ax.text(x[i], y[i], z[i], f'Cluster {i}', size=10, zorder=1, color='k')
ax.set_xlabel('Age')
ax.set_ylabel('Work Experience')
ax.set_zlabel('Family Size')
ax.set_title('Training Set Male Artist - Centroids')
plt.legend()
plt.show()


subset_train_M["labels"] = labels
target = (subset_train_M["Spending_Score_High"])

# 3D plot of every male-artist training row, coloured by cluster label
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(subset_train_M['Age'], subset_train_M['Work_Experience'], subset_train_M['Family_Size'], c=subset_train_M['labels'], cmap='viridis')

ax.set_xlabel('Age')
ax.set_ylabel('Work Experience')
ax.set_zlabel('Family Size')
ax.set_title('Training Set Male Artist - Data Points Labeled by Cluster')
cbar = fig.colorbar(scatter)
cbar.set_label('Labels')
plt.show()


print("Training Set Male Artist Results \n")
for k in range(optimal_num_clusters):
    print(f"Number of components in cluster {k}:", totals_M[k], "; with centroid:", centroids_train_M[k])


# Fraction of low spenders inside each cluster, indexed by cluster label
# (entry k belongs to the cluster whose centroid is centroids_train_M[k]).
# NOTE(review): the previous version hand-permuted these entries
# (cluster 1 -> slot 3, 2 -> 1, 3 -> 2) while its labels came from a
# different KMeans fit than its centroids. Labels and centroids now share one
# model, so the entries are indexed directly by cluster label - confirm that
# nothing downstream relied on the old permutation.
low_spender_share_M = (
    subset_train_M.groupby("labels")["Spending_Score_Low"]
    .mean()
    .reindex(range(optimal_num_clusters))
)
predict_M_percents_low_spender = low_spender_share_M.to_numpy(dtype=float)




In [None]:

# Restrict the training frame to artists and split it by gender. Only
# Age / Work_Experience / Family_Size are used as clustering features.
subset_train = X[["Age","Work_Experience", "Family_Size", "Gender_Male", "Profession_Artist", "Spending_Score_High", "Spending_Score_Average", "Spending_Score_Low"]]
subset_train = subset_train[subset_train["Profession_Artist"] == 1]
subset_train_M = subset_train[subset_train["Gender_Male"] == 1]
subset_train_M_clusters = subset_train_M[["Age", "Work_Experience", "Family_Size"]]
# .copy() so that adding the "labels" column below modifies an independent
# frame rather than a filtered slice of `subset_train`.
subset_train_F = subset_train[subset_train["Gender_Male"] == 0].copy()
subset_train_F_clusters = subset_train_F[["Age", "Work_Experience", "Family_Size"]]

# Elbow method: inertia for K = 1..7, using throw-away models
inertias = []
for i in range(1, 8):
    elbow_model = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    elbow_model.fit(subset_train_F_clusters)
    inertias.append(elbow_model.inertia_)

plt.plot(range(1, 8), inertias, marker='o')
plt.title('Training Set Female Artist - Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.show()

# Final model; labels and centroids both come from this one fit
# (n_init matches the value used for the PCA-based KMeans).
optimal_num_clusters = 4
kmeans = KMeans(n_clusters=optimal_num_clusters, init='k-means++', n_init=10, random_state=42)
labels = kmeans.fit_predict(subset_train_F_clusters)
centroids_train_F = kmeans.cluster_centers_

# Cluster sizes, indexed by cluster label
num = np.arange(optimal_num_clusters)
totals_F = np.bincount(labels, minlength=optimal_num_clusters)

plt.figure(figsize=(10, 6))
plt.bar(num, totals_F)
plt.title('Training Set Female Artist - Number of Components in Each Cluster')
plt.xlabel('Cluster Label')
plt.ylabel('Number of Components')
plt.show()

# 3D plot of the four centroids, each annotated with its cluster index
x = centroids_train_F[:, 0]
y = centroids_train_F[:, 1]
z = centroids_train_F[:, 2]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c='r', marker='o', s=200, label='Centroids')
for i in range(len(centroids_train_F)):
    ax.text(x[i], y[i], z[i], f'Cluster {i}', size=10, zorder=1, color='k')
ax.set_xlabel('Age')
ax.set_ylabel('Work Experience')
ax.set_zlabel('Family Size')
ax.set_title('Training Set Female Artist - Centroids')
plt.legend()
plt.show()

subset_train_F["labels"] = labels
target = (subset_train_F["Spending_Score_High"])


# 3D plot of every female-artist training row, coloured by cluster label
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(subset_train_F['Age'], subset_train_F['Work_Experience'], subset_train_F['Family_Size'], c=subset_train_F['labels'], cmap='viridis')

ax.set_xlabel('Age')
ax.set_ylabel('Work Experience')
ax.set_zlabel('Family Size')
ax.set_title('Training Set Female Artist - Data Points Labeled by Cluster')
cbar = fig.colorbar(scatter)
cbar.set_label('Labels')
plt.show()

print("Training Set Female Artist Results \n")
for k in range(optimal_num_clusters):
    print(f"Number of components in cluster {k}:", totals_F[k], "; with centroid:", centroids_train_F[k])


# Fraction of low spenders inside each cluster, indexed by cluster label
# (entry k belongs to the cluster whose centroid is centroids_train_F[k]).
low_spender_share_F = (
    subset_train_F.groupby("labels")["Spending_Score_Low"]
    .mean()
    .reindex(range(optimal_num_clusters))
)
predict_F_percents_low_spender = low_spender_share_F.to_numpy(dtype=float)



In [None]:
# Load the test set, discard the ID column and every row with a missing value.
testing_data_raw = pd.read_csv("Test.csv").drop('ID', axis=1).dropna()

# One-hot encode the categorical columns of the test set
categorical_test_columns = ['Var_1', 'Spending_Score', 'Profession', 'Gender', 'Graduated', 'Ever_Married']
Y = pd.get_dummies(testing_data_raw, columns=categorical_test_columns)

# Most-frequent imputation across all columns, written back in place
imputer = SimpleImputer(strategy='most_frequent')
columns = Y.columns
imputed_values = imputer.fit_transform(Y[columns])
Y[columns] = imputed_values

# Make sure every column ends up with a numeric dtype
Y = Y.apply(pd.to_numeric)

# A fresh scaler is fitted on the test frame itself.
# NOTE(review): this does not reuse any scaler fitted on training data.
scaler = StandardScaler()
Y_scaled = scaler.fit_transform(Y)

In [None]:


# Columns needed for the artist analysis on the test set
artist_test_columns = ["Age","Work_Experience", "Family_Size", "Gender_Male", "Profession_Artist", "Spending_Score_High", "Spending_Score_Average", "Spending_Score_Low"]
clustering_columns = ["Age", "Work_Experience", "Family_Size"]

# Keep artists only
subset_test = Y[artist_test_columns]
subset_test = subset_test[subset_test["Profession_Artist"].eq(1)]

# Male artists and their clustering features
subset_test_M = subset_test[subset_test["Gender_Male"].eq(1)]
subset_test_M_clusters = subset_test_M[clustering_columns]

# Female artists and their clustering features
subset_test_F = subset_test[subset_test["Gender_Male"].eq(0)]
subset_test_F_clusters = subset_test_F[clustering_columns]


In [None]:

# Assign every male-artist test row to the nearest MALE training centroid
# (squared Euclidean distance over Age / Work_Experience / Family_Size).
cluster_features = ["Age", "Work_Experience", "Family_Size"]
n_clusters = len(centroids_train_M)

subset_test_M_clusters_values = subset_test_M_clusters.to_numpy(dtype=float)
# (rows, 1, features) - (1, clusters, features) -> (rows, clusters) distances
sq_dists_M = (
    (subset_test_M_clusters_values[:, np.newaxis, :] - centroids_train_M[np.newaxis, :, :]) ** 2
).sum(axis=2)
mins_test_M = sq_dists_M.argmin(axis=1)

# Work on a copy so the labels column is not written into subset_test_M
test_M_labels = subset_test_M.copy()
test_M_labels["labels"] = mins_test_M

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(test_M_labels['Age'], test_M_labels['Work_Experience'], test_M_labels['Family_Size'], c=test_M_labels['labels'], cmap='viridis')
ax.set_xlabel('Age')
ax.set_ylabel('Work Experience')
ax.set_zlabel('Family Size')
ax.set_title('Testing Set Male Artist - Data Points Labeled by Cluster')
cbar = fig.colorbar(scatter)
cbar.set_label('Labels')
plt.show()

# Per-cluster mean of the clustering features = centroid of the test rows
# assigned to that cluster (NaN row if no test row was assigned to it).
centroid_test_M = (
    test_M_labels.groupby("labels")[cluster_features]
    .mean()
    .reindex(range(n_clusters))
    .to_numpy()
)

# Test rows grouped by assigned cluster, and the size of each group
test_M_by_cluster = [test_M_labels[test_M_labels["labels"] == k] for k in range(n_clusters)]
totals_M_test = np.array([len(rows) for rows in test_M_by_cluster])

plt.figure(figsize=(10, 6))
plt.bar(np.arange(n_clusters), totals_M_test)
plt.title('Testing Set Male Artist - Number of Components in Each Cluster')
plt.xlabel('Cluster Label')
plt.ylabel('Number of Components')
plt.show()

x = centroid_test_M[:, 0]
y = centroid_test_M[:, 1]
z = centroid_test_M[:, 2]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c='r', marker='o', s=200, label='Centroids')
for i in range(len(centroid_test_M)):
    ax.text(x[i], y[i], z[i], f'Cluster {i}', size=10, zorder=1, color='k')
ax.set_xlabel('Age')
ax.set_ylabel('Work Experience')
ax.set_zlabel('Family Size')
ax.set_title('Testing Set Male Artist - Centroids')
plt.legend()
plt.show()

print("Results for Male Artists - Testing \n")

for k in range(n_clusters):
    print(f"Number of components in cluster {k}:", totals_M_test[k], "; with centroid:", centroid_test_M[k])

# Expected low spenders = training low-spender share of the cluster times
# the number of test rows assigned to it.
# NOTE(review): this assumes predict_M_percents_low_spender[k] is the share
# for the cluster whose centroid is centroids_train_M[k] - verify against the
# male training cell, which remaps cluster indices by hand.
for k in range(n_clusters):
    print(f"Prediction: low spenders in cluster {k} based on training model:", predict_M_percents_low_spender[k] * len(test_M_by_cluster[k]))

for k in range(n_clusters):
    print(f"Actual number of low spenders in cluster {k}", sum(test_M_by_cluster[k]["Spending_Score_Low"]))



In [None]:
# Assign every female-artist test row to the nearest female training centroid
# (squared Euclidean distance over Age / Work_Experience / Family_Size).
cluster_features = ["Age", "Work_Experience", "Family_Size"]
n_clusters = len(centroids_train_F)

subset_test_F_clusters_values = subset_test_F_clusters.to_numpy(dtype=float)
# (rows, 1, features) - (1, clusters, features) -> (rows, clusters) distances
sq_dists_F = (
    (subset_test_F_clusters_values[:, np.newaxis, :] - centroids_train_F[np.newaxis, :, :]) ** 2
).sum(axis=2)
mins_test_F = sq_dists_F.argmin(axis=1)

# Work on a copy so the labels column is not written into subset_test_F
test_F_labels = subset_test_F.copy()
test_F_labels["labels"] = mins_test_F

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(test_F_labels['Age'], test_F_labels['Work_Experience'], test_F_labels['Family_Size'], c=test_F_labels['labels'], cmap='viridis')
ax.set_xlabel('Age')
ax.set_ylabel('Work Experience')
ax.set_zlabel('Family Size')
ax.set_title('Testing Set Female Artist - Data Points Labeled by Cluster')
cbar = fig.colorbar(scatter)
cbar.set_label('Labels')
plt.show()

# Per-cluster mean of the clustering features = centroid of the test rows
# assigned to that cluster (NaN row if no test row was assigned to it).
centroid_test_F = (
    test_F_labels.groupby("labels")[cluster_features]
    .mean()
    .reindex(range(n_clusters))
    .to_numpy()
)

# Test rows grouped by assigned cluster, and the size of each group
test_F_by_cluster = [test_F_labels[test_F_labels["labels"] == k] for k in range(n_clusters)]
totals_F_test = np.array([len(rows) for rows in test_F_by_cluster])

plt.figure(figsize=(10, 6))
plt.bar(np.arange(n_clusters), totals_F_test)
plt.title('Testing Set Female Artist - Number of Components in Each Cluster')
plt.xlabel('Cluster Label')
plt.ylabel('Number of Components')
plt.show()

x = centroid_test_F[:, 0]
y = centroid_test_F[:, 1]
z = centroid_test_F[:, 2]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c='r', marker='o', s=200, label='Centroids')
# Iterate over the FEMALE test centroids (not the male ones)
for i in range(len(centroid_test_F)):
    ax.text(x[i], y[i], z[i], f'Cluster {i}', size=10, zorder=1, color='k')
ax.set_xlabel('Age')
ax.set_ylabel('Work Experience')
ax.set_zlabel('Family Size')
ax.set_title('Testing Set Female Artist - Centroids')
plt.legend()
plt.show()

print("Results for Female Artists - Testing \n")

for k in range(n_clusters):
    print(f"Number of components in cluster {k}:", totals_F_test[k], "; with centroid:", centroid_test_F[k])

# Expected low spenders = training low-spender share of the cluster times
# the number of test rows assigned to it.
for k in range(n_clusters):
    print(f"Prediction: low spenders in cluster {k} based on training model:", predict_F_percents_low_spender[k] * len(test_F_by_cluster[k]))

for k in range(n_clusters):
    print(f"Actual number of low spenders in cluster {k}", sum(test_F_by_cluster[k]["Spending_Score_Low"]))

