# Weekly Assessment-4

## Step 1: Import and Load Data 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.stats import chi2_contingency
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

# Suppress warnings
# warnings.filterwarnings('ignore')

# Step 1: Data Preparation and Loading
def load_and_prepare_data(filepath):
    """
    Read the CSV file at ``filepath`` into a DataFrame.

    When the table contains a 'Month' column, that column is converted to a
    pandas categorical before the frame is returned. All other columns are
    left exactly as ``pd.read_csv`` produced them.
    """
    frame = pd.read_csv(filepath)

    has_month = 'Month' in frame.columns
    if has_month:
        # Months are labels, not free text: store them as a categorical.
        month_values = frame['Month']
        frame['Month'] = pd.Categorical(month_values)

    return frame

## Step 2: Data Preprocessing 

In [2]:
def preprocess_data(df):
    """
    Build the standardized feature matrix used by clustering and PCA.

    Every numeric column of ``df`` except 'SexID' is treated as a feature.
    Returns a tuple ``(X_scaled, numeric_columns, df)`` where ``X_scaled`` is
    the StandardScaler-transformed feature array, ``numeric_columns`` is the
    list of feature names in column order, and ``df`` is the input frame.

    Raises ValueError when fewer than two feature columns remain.
    """
    candidate_columns = df.select_dtypes(include=[np.number]).columns.tolist()

    # 'SexID' is numeric but is deliberately left out of the feature set.
    numeric_columns = [name for name in candidate_columns if name != 'SexID']

    if len(numeric_columns) < 2:
        raise ValueError("Need at least 2 numeric columns for analysis")

    # Plain numpy array in, standardized numpy array out.
    feature_matrix = df[numeric_columns].to_numpy()
    X_scaled = StandardScaler().fit_transform(feature_matrix)

    return X_scaled, numeric_columns, df

## Step 3: Perform K-means Clustering 

In [3]:
def perform_kmeans_clustering(X_scaled, df, n_clusters=3):
    """
    Cluster the standardized features with K-means and compare the clusters
    with the 'Personality' column of ``df``.

    Parameters:
        X_scaled: array-like feature matrix, one row per row of ``df``.
        df: DataFrame providing the 'Personality' column.
        n_clusters: number of K-means clusters (default 3).

    Returns a tuple ``(cluster_labels, alignment_metrics, chi2, p_value)``:
        cluster_labels: cluster index assigned to each row.
        alignment_metrics: dict keyed 'Cluster i' whose values are the
            normalized value counts of 'Personality' inside that cluster.
        chi2, p_value: statistic and p-value of a chi-square test on the
            Personality x cluster contingency table.

    Raises KeyError if ``df`` has no 'Personality' column.
    """
    X_scaled = np.asarray(X_scaled)

    # Fixed random_state keeps the cluster assignment reproducible.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)

    # The mask is a plain numpy boolean array, so row selection is positional.
    alignment_metrics = {
        f'Cluster {i}': df.loc[cluster_labels == i, 'Personality'].value_counts(normalize=True)
        for i in range(n_clusters)
    }

    # Only the statistic and p-value are reported; the degrees of freedom and
    # the expected-frequency table are discarded.
    contingency_table = pd.crosstab(df['Personality'], cluster_labels)
    chi2, p_value, _, _ = chi2_contingency(contingency_table)

    return cluster_labels, alignment_metrics, chi2, p_value

## Step 4: Perform PCA 

In [4]:
def perform_pca(X_scaled):
    """
    Run a PCA with default settings on the standardized feature matrix.

    Returns a tuple ``(pca, pca_results, explained_variance,
    cumulative_variance)``: the fitted PCA object, the data projected onto
    the principal components, the per-component explained variance ratio,
    and its running sum.
    """
    # Anything that is not already an ndarray is converted first.
    data = X_scaled if isinstance(X_scaled, np.ndarray) else np.array(X_scaled)

    pca = PCA()
    pca_results = pca.fit_transform(data)

    explained_variance = pca.explained_variance_ratio_
    return pca, pca_results, explained_variance, np.cumsum(explained_variance)

## Step 5: Create Cluster Plot

In [5]:
def create_cluster_plot(X_scaled, cluster_labels, df, numeric_columns,
                        plot_path='cluster_plot.png'):
    """
    Save a two-panel scatter plot comparing K-means clusters with the
    original personality groups.

    Both panels plot the first two standardized feature columns of
    ``X_scaled`` (not principal components); the axis labels come from
    ``numeric_columns[0]`` and ``numeric_columns[1]``.

    Parameters:
        X_scaled: array-like feature matrix with at least two columns.
        cluster_labels: cluster index per row, used to color the left panel.
        df: DataFrame providing the 'Personality' column for the right panel.
        numeric_columns: feature names matching the columns of ``X_scaled``.
        plot_path: where the PNG is written (default 'cluster_plot.png').

    Returns the path the image was saved to.
    """
    X_scaled = np.asarray(X_scaled)
    x_values = X_scaled[:, 0]
    y_values = X_scaled[:, 1]
    x_label = f'{numeric_columns[0]} (Standardized)'
    y_label = f'{numeric_columns[1]} (Standardized)'

    # Integer codes drive the colormap; the class names label the colorbar.
    personality_encoder = LabelEncoder()
    personality_codes = personality_encoder.fit_transform(df['Personality'])

    fig, (cluster_ax, personality_ax) = plt.subplots(1, 2, figsize=(16, 8))
    try:
        # Left panel - K-means cluster assignment
        scatter = cluster_ax.scatter(x_values, y_values,
                                     c=cluster_labels,
                                     cmap='viridis',
                                     alpha=0.7)
        cluster_ax.set_title('K-Means Clustering Visualization', fontsize=12)
        cluster_ax.set_xlabel(x_label, fontsize=10)
        cluster_ax.set_ylabel(y_label, fontsize=10)
        fig.colorbar(scatter, ax=cluster_ax, label='Cluster')

        # Right panel - original personality groups
        scatter = personality_ax.scatter(x_values, y_values,
                                         c=personality_codes,
                                         cmap='Set1',
                                         alpha=0.7)
        personality_ax.set_title('Original Personality Groups', fontsize=12)
        personality_ax.set_xlabel(x_label, fontsize=10)
        personality_ax.set_ylabel(y_label, fontsize=10)
        colorbar = fig.colorbar(scatter, ax=personality_ax,
                                ticks=range(len(personality_encoder.classes_)),
                                label='Personality')
        # Show the personality names instead of their bare integer codes.
        colorbar.set_ticklabels([str(name) for name in personality_encoder.classes_])

        fig.tight_layout()
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    finally:
        # Close this specific figure even when plotting or saving fails.
        plt.close(fig)
    return plot_path

## Step 6: Create PCA Plot

In [6]:
def create_pca_plot(pca_results, explained_variance, cluster_labels,
                    plot_path='pca_plot.png'):
    """
    Save a two-panel PCA figure.

    The left panel is a scatter of the first two principal components
    colored by cluster; the right panel is a bar chart of the explained
    variance of every component, in percent.

    Parameters:
        pca_results: array-like of projected data with at least two columns.
        explained_variance: per-component explained variance ratios.
        cluster_labels: cluster index per row, used to color the scatter.
        plot_path: where the PNG is written (default 'pca_plot.png').

    Returns the path the image was saved to.
    """
    pca_results = np.asarray(pca_results)
    # Coerce to an array so that `* 100` scales the values; on a plain list
    # it would repeat the list instead.
    explained_variance = np.asarray(explained_variance, dtype=float)

    fig, (scatter_ax, variance_ax) = plt.subplots(1, 2, figsize=(16, 8))
    try:
        # First two principal components scatter plot
        scatter = scatter_ax.scatter(pca_results[:, 0], pca_results[:, 1],
                                     c=cluster_labels,
                                     cmap='viridis',
                                     alpha=0.7)
        scatter_ax.set_title('PCA: First Two Principal Components', fontsize=12)
        scatter_ax.set_xlabel('First Principal Component (PC1)', fontsize=10)
        scatter_ax.set_ylabel('Second Principal Component (PC2)', fontsize=10)
        fig.colorbar(scatter, ax=scatter_ax, label='Cluster')

        # Explained variance bar plot, components numbered from 1
        variance_ax.bar(range(1, len(explained_variance) + 1),
                        explained_variance * 100,
                        alpha=0.7)
        variance_ax.set_title('Variance Explained by Principal Components', fontsize=12)
        variance_ax.set_xlabel('Principal Components', fontsize=10)
        variance_ax.set_ylabel('Explained Variance (%)', fontsize=10)

        fig.tight_layout()
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    finally:
        # Close this specific figure even when plotting or saving fails.
        plt.close(fig)
    return plot_path

## Step 7: Generate PDF Report and Run the Pipeline

In [7]:
def create_comprehensive_pdf(df, X_scaled, cluster_labels, alignment_metrics, chi2, p_value, 
                           pca_results, explained_variance, cumulative_variance, 
                           numeric_columns):
    """
    Write the three-page report 'Cardinal_Personalities_Analysis.pdf'.

    Page 1 holds the title and the cluster figure, page 2 the PCA figure,
    and page 3 the per-cluster personality proportions, the chi-square test
    result and the explained variance of the first three components. Both
    figures are (re)generated by calling ``create_cluster_plot`` and
    ``create_pca_plot``.

    Parameters:
        df, X_scaled, cluster_labels, numeric_columns: passed through to
            ``create_cluster_plot``.
        alignment_metrics: mapping of cluster name to a mapping of
            personality -> proportion, written to the results page.
        chi2, p_value: chi-square statistic and p-value to report.
        pca_results, explained_variance: passed to ``create_pca_plot``;
            the first three entries of ``explained_variance`` are also
            written to the results page.
        cumulative_variance: accepted for interface compatibility; it is
            not written to the report.

    Returns None; prints the output path when the file has been saved.
    """
    # Local import: only this function needs the TrueType error type.
    from reportlab.pdfbase.ttfonts import TTFError

    # Prefer Arial, but fall back to the built-in Helvetica when the font
    # file is not installed so the report can still be produced.
    try:
        pdfmetrics.registerFont(TTFont('Arial', 'Arial.ttf'))
    except TTFError:
        font_name = 'Helvetica'
        # The built-in font's encoding has no Greek chi, so spell it out.
        chi2_label = "Chi-square statistic"
    else:
        font_name = 'Arial'
        chi2_label = "χ² statistic"

    pdf_path = 'Cardinal_Personalities_Analysis.pdf'
    c = canvas.Canvas(pdf_path, pagesize=A4)
    width, height = A4

    def draw_figure(image_path, captions):
        """Draw one figure in the upper half of the page with its captions."""
        # Keep the figure's aspect ratio instead of stretching it into the
        # box, and anchor it to the bottom edge so it sits above the captions.
        c.drawImage(image_path, inch, height/2, width-2*inch, height/3,
                    preserveAspectRatio=True, anchor='s')
        # showPage() resets the canvas font, so set it for every figure.
        c.setFont(font_name, 10)
        for offset, caption in zip((20, 40), captions):
            c.drawString(inch, height/2 - offset, caption)

    # Title
    c.setFont(font_name, 16)
    c.drawString(inch, height - inch, "Cardinal Personalities Analysis Report")

    # Cluster Plot
    cluster_plot_path = create_cluster_plot(X_scaled, cluster_labels, df, numeric_columns)
    draw_figure(cluster_plot_path, (
        "Figure 1: Cluster Analysis Results",
        "Left: K-means clustering visualization  Right: Original personality groups",
    ))

    # PCA Plot on new page
    c.showPage()
    pca_plot_path = create_pca_plot(pca_results, explained_variance, cluster_labels)
    draw_figure(pca_plot_path, (
        "Figure 2: Principal Component Analysis",
        "Left: First two principal components  Right: Explained variance by components",
    ))

    # Results and interpretation on new page
    c.showPage()
    c.setFont(font_name, 14)
    c.drawString(inch, height - inch, "Analysis Results")

    c.setFont(font_name, 10)
    y_position = height - 2*inch

    def write_line(indent, text, drop):
        """Write one text line, then move the cursor down by ``drop`` points."""
        nonlocal y_position
        # Continue on a fresh page rather than writing below the bottom margin.
        if y_position < inch:
            c.showPage()
            c.setFont(font_name, 10)
            y_position = height - inch
        c.drawString(inch + indent, y_position, text)
        y_position -= drop

    # Write clustering results
    write_line(0, "Clustering Results:", 20)
    for cluster, personalities in alignment_metrics.items():
        write_line(20, f"{cluster}:", 15)
        for personality, proportion in personalities.items():
            write_line(40, f"{personality}: {proportion:.2%}", 15)
        # Extra gap between clusters
        y_position -= 10

    # Write statistical test results
    write_line(0, "Chi-square test results:", 15)
    write_line(20, f"{chi2_label}: {chi2:.2f}", 15)
    write_line(20, f"p-value: {p_value:.4f}", 30)

    # Write PCA results (first three components only)
    write_line(0, "PCA Results:", 15)
    for i, var in enumerate(explained_variance[:3], 1):
        write_line(20, f"PC{i} explained variance: {var:.2%}", 15)

    c.save()
    print(f"PDF report saved to {pdf_path}")

def main(filepath='week-4/download-6.csv'):
    """
    Run the whole analysis: load, preprocess, cluster, PCA, plots, PDF.

    Parameters:
        filepath: CSV file to analyse. Defaults to the assessment data set,
            so calling ``main()`` with no argument behaves as before.

    Progress is printed step by step. Any exception is reported with its
    message and traceback instead of propagating, so the function always
    returns None.
    """
    import traceback

    try:
        # Load and prepare data
        print("Loading data...")
        df = load_and_prepare_data(filepath)

        print("Preprocessing data...")
        X_scaled, numeric_columns, df = preprocess_data(df)

        print(f"Data shape: {X_scaled.shape}")
        print(f"Numeric columns: {numeric_columns}")

        # Perform clustering
        print("Performing clustering...")
        cluster_labels, alignment_metrics, chi2, p_value = perform_kmeans_clustering(X_scaled, df)

        # Perform PCA
        print("Performing PCA...")
        pca, pca_results, explained_variance, cumulative_variance = perform_pca(X_scaled)

        # Create visualizations up front, so the PNG files exist even if
        # building the PDF fails afterwards.
        print("Creating visualizations...")
        create_cluster_plot(X_scaled, cluster_labels, df, numeric_columns)
        create_pca_plot(pca_results, explained_variance, cluster_labels)

        # Create PDF report
        print("Creating PDF report...")
        create_comprehensive_pdf(df, X_scaled, cluster_labels, alignment_metrics, chi2, p_value,
                                 pca_results, explained_variance, cumulative_variance,
                                 numeric_columns)

        print("Analysis completed successfully!")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback.
        print(f"An error occurred: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

Loading data...
Preprocessing data...
Data shape: (67, 11)
Numeric columns: ['Weight', 'WingL', 'TarsusL', 'CORT1', 'CORT2', 'CORT3', 'Exploration', 'Neophobia', 'Neophilia', 'Aggression', 'Boldness']
Performing clustering...
Performing PCA...
Creating visualizations...
Creating PDF report...
PDF report saved to Cardinal_Personalities_Analysis.pdf
Analysis completed successfully!
