In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Load the dataset
file_path = "EastWestAirlines.xlsx"
df = pd.read_excel(file_path)

# Drop any extra rows (if needed)
df = df.iloc[1:] if 'Data Type' in df.iloc[0].values else df

# Reset index
df = df.reset_index(drop=True)

# Convert columns to numeric (excluding ID)
df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

# Drop ID column
data = df.iloc[:, 1:]

# **Check for NaN or infinite values**
if data.isnull().sum().sum() > 0:
    print("Warning: Missing values detected! Replacing with column mean.")
    data.fillna(data.mean(), inplace=True)  # Replace NaNs with column mean

if np.isinf(data.values).sum() > 0:
    print("Warning: Infinite values detected! Replacing with column mean.")
    data.replace([np.inf, -np.inf], data.mean(), inplace=True)

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# **Check again after scaling**
if np.isnan(data_scaled).sum() > 0 or np.isinf(data_scaled).sum() > 0:
    raise ValueError("Data still contains NaN or infinite values after scaling!")

# Hierarchical Clustering
plt.figure(figsize=(10, 5))
linkage_matrix = linkage(data_scaled, method='ward')
dendrogram(linkage_matrix)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Data Points')
plt.ylabel('Euclidean Distance')
plt.show()

# Determine clusters
num_clusters = 4  # Adjust based on dendrogram
hierarchical_clusters = fcluster(linkage_matrix, num_clusters, criterion='maxclust')
df['Hierarchical Cluster'] = hierarchical_clusters

# K-Means Clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans_clusters = kmeans.fit_predict(data_scaled)
df['KMeans Cluster'] = kmeans_clusters

# DBSCAN Clustering
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan_clusters = dbscan.fit_predict(data_scaled)
df['DBSCAN Cluster'] = dbscan_clusters

# Display cluster results
print(df[['ID', 'Hierarchical Cluster', 'KMeans Cluster', 'DBSCAN Cluster']].head())

# Scatter plot for KMeans Clustering
plt.figure(figsize=(8, 6))
sns.scatterplot(x=data_scaled[:, 0], y=data_scaled[:, 1], hue=kmeans_clusters, palette='viridis', s=100)
plt.title('K-Means Clustering')
plt.xlabel('Balance (Standardized)')
plt.ylabel('Qual_mile (Standardized)')
plt.show()



  data.fillna(data.mean(), inplace=True)  # Replace NaNs with column mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(data.mean(), inplace=True)  # Replace NaNs with column mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(data.mean(), inplace=True)  # Replace NaNs with column mean
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Data still contains NaN or infinite values after scaling!