In [None]:
import pandas as pd
from sklearn.metrics import silhouette_score

# Load the dataset: the records are read from the sheet named 'data'.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact file exists. Consider pointing it at a
# configurable data directory instead.
file_path = 'C:\\Users\\Acer\\Desktop\\Data Sci Assignments\\Clustering\\EastWestAirlines.xlsx'
df = pd.read_excel(file_path, sheet_name='data')

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.head())


In [None]:
import numpy as np

# Function to remove outliers using IQR
def remove_outliers(df, columns=None, factor=1.5):
    """Drop every row that holds an IQR outlier in at least one tested column.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``, where Q1/Q3 are the column's 25th/75th percentiles
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : list-like of column labels, optional
        Columns to test for outliers. Defaults to all numeric columns, so
        text/datetime columns no longer break the quantile computation; they
        are simply carried through untouched.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns, original index labels) that contain
        no outlier in the tested columns.
    """
    if columns is None:
        columns = df.select_dtypes(include="number").columns

    # Nothing to test: every row is kept.
    if len(columns) == 0:
        return df.copy()

    tested = df[columns]
    q1 = tested.quantile(0.25)
    q3 = tested.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr

    # Comparisons align the per-column fences with the frame's columns;
    # a row is flagged as soon as any of its tested values is out of range.
    has_outlier = (tested.lt(lower_fence) | tested.gt(upper_fence)).any(axis=1)
    return df.loc[~has_outlier]

# Remove outliers (rows with an IQR outlier in any column are dropped)
df_clean = remove_outliers(df)

# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(df_clean.info()) would append a stray "None" line.
df_clean.info()
print(df_clean.head())


In [None]:
# Count the missing (None/NaN) entries per column of the outlier-filtered frame
missing_after_cleaning = df_clean.isnull().sum()
print("Missing values after outlier removal:\n", missing_after_cleaning)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: everything except the first column (dropped by
# position) is passed through a StandardScaler fitted on the cleaned data.
scaler = StandardScaler()
features_to_scale = df_clean.iloc[:, 1:]
scaled_values = scaler.fit_transform(features_to_scale)

# Rebuild a DataFrame around the scaled array, reusing the feature column names
df_scaled = pd.DataFrame(scaled_values, columns=features_to_scale.columns)

# Report None/NaN counts per column of the scaled data, then preview it
missing_after_scaling = df_scaled.isnull().sum()
print("Missing values after scaling:\n", missing_after_scaling)
print(df_scaled.head())


In [None]:
from sklearn.preprocessing import StandardScaler

# This cell used to repeat the preceding scaling cell line for line (new
# StandardScaler, refit on df_clean, rebuild df_scaled). Refitting on the same
# data reproduces the same result, so the redundant work is dropped; `scaler`
# and `df_scaled` are the objects created by the preceding scaling cell.

# Check for any None or NaN values in the scaled dataset
print("Missing values after scaling:\n", df_scaled.isnull().sum())
print(df_scaled.head())


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram grid: one panel per scaled feature
df_scaled.hist(figsize=(15, 10), bins=30)
plt.show()

# Pairwise scatter matrix to eyeball relationships between features
sns.pairplot(data=df_scaled)
plt.show()

# Correlation matrix of the scaled features, drawn as an annotated heatmap
corr_matrix = df_scaled.corr()
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.show()


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Work on the real feature columns only. The clustering cells append their
# labels to df_scaled as 'Cluster_*' columns; filtering them out by name keeps
# labels from being treated as features and makes this cell safe to re-run.
feature_cols = [col for col in df_scaled.columns if not str(col).startswith('Cluster_')]
X_features = df_scaled[feature_cols]

# Create a linkage matrix
Z = linkage(X_features, method='ward')

# Plot the dendrogram
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Apply Hierarchical clustering with the chosen number of clusters (e.g., k=3).
# Ward linkage only works with Euclidean distance, which is the estimator's
# default, so no distance argument is passed: the old `affinity` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (passing it raises TypeError).
hierarchical = AgglomerativeClustering(n_clusters=3, linkage='ward')
clusters_hierarchical = hierarchical.fit_predict(X_features)

# Evaluate the clustering quality using the silhouette score, computed on
# exactly the features that were clustered. (Slicing df_scaled by position
# after adding the label column, e.g. iloc[:, :-2], silently drops a feature.)
silhouette_avg_hierarchical = silhouette_score(X_features, clusters_hierarchical)
print(f'Silhouette Score for Hierarchical Clustering: {silhouette_avg_hierarchical}')

# Add the cluster labels to the dataset (used by the plotting cell)
df_scaled['Cluster_Hierarchical'] = clusters_hierarchical


In [None]:
from sklearn.cluster import DBSCAN

# Cluster on the real feature columns only. By this point df_scaled may already
# carry 'Cluster_*' label columns from earlier clustering cells; feeding those
# to DBSCAN would let previous labels act as a feature.
feature_cols = [col for col in df_scaled.columns if not str(col).startswith('Cluster_')]
X_features = df_scaled[feature_cols]

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters_dbscan = dbscan.fit_predict(X_features)

# Add the cluster labels to the dataset (used by the plotting cell)
df_scaled['Cluster_DBSCAN'] = clusters_dbscan

# Evaluate the clustering quality using the silhouette score.
# Note: For DBSCAN, points labelled -1 are noise and are excluded from the
# silhouette score calculation.
not_noise = clusters_dbscan != -1
n_clusters_dbscan = len(set(clusters_dbscan[not_noise]))

if n_clusters_dbscan >= 2:
    silhouette_avg_dbscan = silhouette_score(X_features.loc[not_noise], clusters_dbscan[not_noise])
    print(f'Silhouette Score for DBSCAN: {silhouette_avg_dbscan}')
else:
    # silhouette_score needs at least two distinct labels and raises otherwise;
    # report the situation instead of crashing the notebook.
    silhouette_avg_dbscan = float('nan')
    print(f'Silhouette Score for DBSCAN: undefined ({n_clusters_dbscan} cluster(s) found after excluding noise)')


In [None]:
# Scatter of Balance against Bonus_miles, coloured by hierarchical cluster label
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='Balance',
    y='Bonus_miles',
    hue='Cluster_Hierarchical',
    palette='viridis',
    data=df_scaled,
).set_title('Hierarchical Clustering')
plt.show()


In [None]:
# Scatter of Balance against Bonus_miles, coloured by DBSCAN cluster label
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='Balance',
    y='Bonus_miles',
    hue='Cluster_DBSCAN',
    palette='viridis',
    data=df_scaled,
).set_title('DBSCAN Clustering')
plt.show()
