# Sector Analysis Pipeline Documentation

## Overview
This codebase implements a comprehensive sector analysis pipeline that processes market cap and revenue data, calculates various sector indices, performs clustering analysis, and builds a neural network classifier for sector categorization.


IMPORTING LIBRARIES

In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import os
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min
import warnings
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

## Stage 1: Data Processing and Index Calculation

### Core Functions

#### `calculate_yoy_growth(df)`
- **Purpose**: Calculates Year-over-Year (YoY) growth for Market Cap
- **Input**: DataFrame with MarketCap column
- **Output**: DataFrame with added YoY_Growth column
- **Key Operations**:
  - Calculates percentage change over 4 periods (quarterly data)
  - Removes rows with NaN growth values


In [2]:
def calculate_yoy_growth(df):
    """Calculate the Year-over-Year (YoY) growth for Market Cap.

    Adds a ``YoY_Growth`` column holding the percentage change of
    ``MarketCap`` relative to the value four rows earlier (one year of
    quarterly data), then drops the rows where that change is undefined.

    When the frame has a ``Ticker`` column the change is computed within each
    ticker, so one company's first quarters are never compared against the
    last quarters of the company listed before it.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, a
            ``Ticker`` column. Rows are assumed to be in chronological order
            for each ticker. The frame is modified in place.

    Returns:
        The same DataFrame, with ``YoY_Growth`` added and the rows without a
        defined growth value removed.
    """
    if 'Ticker' in df.columns:
        # Group first so the 4-row lag never crosses a ticker boundary.
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']

    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100
    df.dropna(subset=['YoY_Growth'], inplace=True)
    return df

#### `apply_log10_transformation(df)`
- **Purpose**: Applies logarithmic transformation to YoY growth
- **Input**: DataFrame with YoY_Growth column
- **Output**: DataFrame with added Log_YoY_Growth column
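- **Note**: Adds 100 to the percentage growth before taking log10, so that any decline smaller than 100% stays inside the logarithm's domain (a growth of -100% or less is still undefined)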



In [3]:
def apply_log10_transformation(df):
    """Add a ``Log_YoY_Growth`` column: log10 of ``YoY_Growth + 100``.

    The +100 shift keeps the logarithm's argument positive for every growth
    value above -100. The new column is written onto ``df`` itself, and that
    same frame is returned.
    """
    shifted_growth = df['YoY_Growth'] + 100
    df['Log_YoY_Growth'] = np.log10(shifted_growth)
    return df

#### `calculate_sector_leader_and_rank(df, sector)`
- **Purpose**: Identifies sector leaders and rankings based on performance
- **Input**: 
  - DataFrame with sector data
  - Sector name
- **Output**: 
  - Sector leader ticker
  - Sorted companies by performance
  - Total overperformance metric
- **Key Operations**:
  - Calculates sector average performance
  - Counts times each company outperforms sector average
  - Ranks companies by overperformance


In [4]:
def calculate_sector_leader_and_rank(df, sector):
    """Calculate the leader and performance ranking for a sector.

    For every date the sector average of ``YoY_Growth`` is computed; each
    company is then scored by how many of its rows beat the average of the
    same date. Companies are ranked by that count, highest first (ties keep
    the order in which tickers first appear in ``df``).

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sector: Sector name, used only in the printed report.

    Returns:
        A tuple ``(sector_leader, sorted_companies, total_overperformance)``:
        the top-ranked ticker, the list of ``(ticker, count)`` pairs in
        descending order, and the sum of the squared counts. For an empty
        frame there is nothing to rank and ``(None, [], 0)`` is returned.
    """
    if df.empty:
        # Without this guard the leader lookup below raises IndexError.
        print(f"No data available to rank companies in {sector} sector.")
        return None, [], 0

    sector_avg = df.groupby('Date')['YoY_Growth'].mean()
    # Row-wise flag: did this company beat the sector average on this date?
    beats_sector = df['YoY_Growth'] > df['Date'].map(sector_avg)

    overperformance_counts = {
        ticker: beats_sector[df['Ticker'] == ticker].sum()
        for ticker in df['Ticker'].unique()
    }

    sorted_companies = sorted(
        overperformance_counts.items(), key=lambda item: item[1], reverse=True
    )
    sector_leader = sorted_companies[0][0]

    # Squared counts are the un-normalised weights of the weighted index.
    total_overperformance = sum(count ** 2 for _, count in sorted_companies)

    print(f"Leader for {sector} sector: {sector_leader}")
    print(f"Descending order of companies by overperformance in {sector} sector:")
    for company, count in sorted_companies:
        print(f"{company}: {count} times overperformed")

    return sector_leader, sorted_companies, total_overperformance

#### `calculate_sector_index(df, sorted_companies, total_overperformance)`
- **Purpose**: Creates weighted sector index based on company performance
- **Input**: 
  - Sector DataFrame
  - Sorted companies list
  - Total overperformance score
- **Output**: DataFrame with Date and Sector_Index columns
- **Key Operations**:
  - Calculates fractional contributions
  - Applies weights to company performance
  - Aggregates weighted performances

In [5]:

def calculate_sector_index(df, sorted_companies, total_overperformance):
    """Calculate the sector index based on fractional contribution of stocks.

    Each company's weight is its squared overperformance count divided by
    ``total_overperformance``. The index for a date is the sum of
    ``YoY_Growth * weight`` over the companies that have a row on that date.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sorted_companies: Iterable of ``(ticker, overperformance_count)``
            pairs; tickers missing from it do not contribute.
        total_overperformance: Sum of the squared counts. When it is zero
            (no company ever beat the sector average) the weights would be
            undefined, so every listed company is weighted equally instead.

    Returns:
        DataFrame with ``Date`` and ``Sector_Index`` columns, one row per
        date, sorted by date. Empty (but with both columns) when there is
        nothing to aggregate.
    """
    empty_index = pd.DataFrame(columns=['Date', 'Sector_Index'])
    sorted_companies = list(sorted_companies)
    if df.empty or not sorted_companies:
        return empty_index

    if total_overperformance:
        weights = {
            company: count ** 2 / total_overperformance
            for company, count in sorted_companies
        }
    else:
        # Avoid 0/0: fall back to an equal-weight index.
        equal_weight = 1.0 / len(sorted_companies)
        weights = {company: equal_weight for company, _ in sorted_companies}

    ranked_rows = df[df['Ticker'].isin(list(weights))]
    if ranked_rows.empty:
        return empty_index

    weighted_growth = ranked_rows['YoY_Growth'] * ranked_rows['Ticker'].map(weights)
    # Summing per date skips missing values, so a date only reflects the
    # companies that actually have data for it.
    sector_index = (
        weighted_growth.groupby(ranked_rows['Date'])
        .sum()
        .rename('Sector_Index')
        .reset_index()
    )
    return sector_index[['Date', 'Sector_Index']]

def calculate_simple_average_index(df):
    """Return the per-date mean of ``YoY_Growth`` as ``Simple_Avg_Index``.

    The result has one row per distinct ``Date`` and two columns: ``Date``
    and ``Simple_Avg_Index``.
    """
    per_date_mean = df.groupby('Date', as_index=False)['YoY_Growth'].mean()
    return per_date_mean.rename(columns={'YoY_Growth': 'Simple_Avg_Index'})

def plot_sector_index(sector_index, simple_avg_index, df, sector):
    """Plot both sector indices on top of the individual stocks' YoY growth.

    Every ticker in ``df`` is drawn as a dashed line; the weighted index
    (blue) and the simple-average index (red) are drawn as thicker lines.
    Nothing is drawn when ``sector_index`` is empty.
    """
    if sector_index.empty:
        print(f"No data to plot for {sector}.")
        return

    plt.figure(figsize=(12, 8))

    # Dashed background lines: one per contributing stock.
    for symbol in df['Ticker'].unique():
        stock_rows = df[df['Ticker'] == symbol]
        plt.plot(
            stock_rows['Date'],
            stock_rows['YoY_Growth'],
            label=f"{symbol} YoY Growth",
            linestyle='--',
        )

    # Foreground lines: weighted index first, then the simple average.
    index_lines = (
        (sector_index, 'Sector_Index', f'{sector} Weighted Sector Index', 'blue'),
        (simple_avg_index, 'Simple_Avg_Index', f'{sector} Simple Average Index', 'red'),
    )
    for frame, column, legend_label, line_color in index_lines:
        plt.plot(frame['Date'], frame[column], label=legend_label, color=line_color, linewidth=2)

    plt.title(f'Sector Index for {sector} Sector (Weighted vs Simple Average)')
    plt.xlabel('Date')
    plt.ylabel('YoY Growth / Index Value')
    plt.legend(loc='best')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

EXECUTION OF STAGE-1

In [None]:
def main():
    """Build, save and plot the sector indices for every sector CSV.

    Reads each ``*.csv`` in ``sector_mkt_cap_results``, computes the YoY
    growth, the overperformance-weighted sector index and the simple-average
    index, writes both to ``sector_wise_index/<sector>_sector_index.csv`` and
    plots them. Sectors whose file is too short to yield any YoY growth value
    are skipped.
    """
    # Path to the directory where sector CSV files are saved
    input_dir = "sector_mkt_cap_results"
    output_dir = "sector_wise_index"  # Directory to save sector indices

    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that sectors are processed in a reproducible order.
    for sector_file in sorted(os.listdir(input_dir)):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        file_path = os.path.join(input_dir, sector_file)
        df = pd.read_csv(file_path, parse_dates=['Date'])

        df = calculate_yoy_growth(df)
        if df.empty:
            # Ranking an empty frame would fail when picking the leader.
            print(f"Skipping {sector}: not enough data to compute YoY growth.")
            continue
        df = apply_log10_transformation(df)

        # Leader, ranking and the normaliser for the index weights
        sector_leader, sorted_companies, total_overperformance = calculate_sector_leader_and_rank(df, sector)

        sector_weighted_index = calculate_sector_index(df, sorted_companies, total_overperformance)
        simple_avg_index = calculate_simple_average_index(df)

        # Keep both indices side by side in the saved file
        combined_index = pd.merge(sector_weighted_index, simple_avg_index, on='Date', how='outer')

        output_file_path = os.path.join(output_dir, f"{sector}_sector_index.csv")
        combined_index.to_csv(output_file_path, index=False)
        print(f"Saved sector index for {sector} to {output_file_path}")

        plot_sector_index(sector_weighted_index, simple_avg_index, df, sector)


if __name__ == "__main__":
    main()

## Stage 2: Data Merging and Growth Analysis

### Key Functions

#### `merge_sector_data(mkt_cap_dir, revenue_dir, output_dir)`
- **Purpose**: Merges market cap and revenue data for sectors
- **Input**: 
  - Directory paths for market cap and revenue data
  - Output directory path
- **Output**: Dictionary of merged sector DataFrames
- **Key Operations**:
  - Identifies common sectors between datasets
  - Aligns quarterly market cap with revenue data
  - Handles date formatting and matching


In [7]:
def merge_sector_data(mkt_cap_dir, revenue_dir, output_dir="merged_sector_data"):
    """
    Merge market cap and revenue data for sectors where both datasets are available.

    For every sector that has both a ``<sector>_mkt_cap_quarter_end.csv`` and a
    ``<sector>_revenue.csv`` file, each market-cap row is paired with a revenue
    row of the same ticker: the row of the same year and quarter if there is
    one, otherwise the last listed row of the same year. Market-cap rows with
    no revenue for their year are dropped.

    The revenue ``year`` and ``quarter`` columns are derived from ``date``
    when the file does not already provide them.

    Args:
        mkt_cap_dir: Directory with the market-cap CSVs (``Date``, ``Ticker``,
            ``MarketCap`` columns).
        revenue_dir: Directory with the revenue CSVs (``date``, ``ticker``,
            ``revenue`` columns; ``revenue_yoy_growth`` and ``company_name``
            are used when present).
        output_dir: Directory where ``<sector>_merged_data.csv`` is written;
            created if missing.

    Returns:
        Dict mapping sector name to its merged DataFrame, sorted by ``Date``
        and ``Ticker``. Sectors without any matching row are omitted.
    """
    os.makedirs(output_dir, exist_ok=True)

    merged_sectors = {}

    # Sector names are the file-name prefixes in each directory
    mkt_cap_sectors = {f.split('_mkt_cap')[0] for f in os.listdir(mkt_cap_dir) if f.endswith('.csv')}
    revenue_sectors = {f.split('_revenue')[0] for f in os.listdir(revenue_dir) if f.endswith('.csv')}
    common_sectors = mkt_cap_sectors & revenue_sectors

    for sector in sorted(common_sectors):
        print(f"Processing sector: {sector}")

        mkt_cap_path = os.path.join(mkt_cap_dir, f"{sector}_mkt_cap_quarter_end.csv")
        mkt_cap_df = pd.read_csv(mkt_cap_path)

        revenue_path = os.path.join(revenue_dir, f"{sector}_revenue.csv")
        revenue_df = pd.read_csv(revenue_path)

        try:
            # Keep only the date part of "YYYY-MM-DD <time...>" strings
            mkt_cap_df['Date'] = pd.to_datetime(mkt_cap_df['Date'].str.split(' ').str[0])
        except AttributeError:
            # Column is not string-typed, so there is nothing to split
            mkt_cap_df['Date'] = pd.to_datetime(mkt_cap_df['Date'])

        revenue_df['date'] = pd.to_datetime(revenue_df['date'])

        mkt_cap_df['year'] = mkt_cap_df['Date'].dt.year
        mkt_cap_df['quarter'] = mkt_cap_df['Date'].dt.quarter

        # The matching below needs these on the revenue side as well; derive
        # them from the date unless the file already ships its own columns.
        if 'year' not in revenue_df.columns:
            revenue_df['year'] = revenue_df['date'].dt.year
        if 'quarter' not in revenue_df.columns:
            revenue_df['quarter'] = revenue_df['date'].dt.quarter

        # Tickers are compared case-insensitively
        mkt_cap_df['Ticker'] = mkt_cap_df['Ticker'].str.upper()
        revenue_df['ticker'] = revenue_df['ticker'].str.upper()

        common_tickers = set(mkt_cap_df['Ticker']).intersection(set(revenue_df['ticker']))
        print(f"Found {len(common_tickers)} common tickers for {sector}")

        merged_data = []

        for ticker in common_tickers:
            ticker_mkt_cap = mkt_cap_df[mkt_cap_df['Ticker'] == ticker]
            ticker_revenue = revenue_df[revenue_df['ticker'] == ticker]
            company_name = ticker_revenue.iloc[0].get('company_name', ticker)

            for _, mkt_cap_row in ticker_mkt_cap.iterrows():
                same_year = ticker_revenue[ticker_revenue['year'] == mkt_cap_row['year']]
                same_quarter = same_year[same_year['quarter'] == mkt_cap_row['quarter']]

                if not same_quarter.empty:
                    revenue_row = same_quarter.iloc[0]
                elif not same_year.empty:
                    # No figure for this quarter: use the year's last listed row
                    revenue_row = same_year.iloc[-1]
                else:
                    continue

                merged_data.append({
                    'Date': mkt_cap_row['Date'],
                    'Year': mkt_cap_row['year'],
                    'Quarter': mkt_cap_row['quarter'],
                    'Ticker': ticker,
                    'MarketCap': mkt_cap_row['MarketCap'],
                    'Revenue': revenue_row['revenue'],
                    'Revenue_YoY_Growth': revenue_row.get('revenue_yoy_growth', np.nan),
                    'Company_Name': company_name,
                })

        if merged_data:
            merged_df = pd.DataFrame(merged_data)
            merged_df = merged_df.sort_values(['Date', 'Ticker'])
            merged_sectors[sector] = merged_df

            output_file_path = os.path.join(output_dir, f"{sector}_merged_data.csv")
            merged_df.to_csv(output_file_path, index=False)
            print(f"Successfully merged data for {sector} and saved to {output_file_path}")
        else:
            print(f"No matching data found for {sector}")

    return merged_sectors

#### `calculate_growth_indicator(value, mean, std_dev)`
- **Purpose**: Determines growth status based on statistical thresholds
- **Input**: 
  - Value to evaluate
  - Mean of the distribution
  - Standard deviation of the distribution
- **Output**: Growth indicator (-1, 0, or 1)
- **Thresholds**: Uses 0.07 standard deviations from mean



In [8]:
def calculate_growth_indicator(value, mean, std_dev, threshold=0.07):
    """
    Calculate the growth indicator based on sector-specific mean and standard deviation.

    Args:
        value: The growth value to classify.
        mean: Mean of the reference distribution.
        std_dev: Standard deviation of the reference distribution.
        threshold: Half-width of the neutral band, in standard deviations.
            Defaults to 0.07.

    Returns:
        1 if ``value`` is more than ``threshold`` standard deviations above
        the mean, -1 if it is more than ``threshold`` standard deviations
        below it, and 0 if it lies in between or if any of ``value``,
        ``mean`` or ``std_dev`` is missing (NaN).
    """
    if pd.isna(value) or pd.isna(mean) or pd.isna(std_dev):
        return 0

    band = threshold * std_dev
    if value > mean + band:
        return 1
    if value < mean - band:
        return -1
    return 0

## Stage 3: Beta Analysis and Covariance

### Core Function

#### `calculate_beta_covariance(df, period_months)`
- **Purpose**: Calculates beta covariance over specified time periods
- **Input**: 
  - Sector DataFrame
  - Analysis period in months
- **Output**: Average absolute covariance value
- **Key Operations**:
  - Calculates returns and market weights
  - Computes rolling betas
  - Handles missing values and time series alignment

In [9]:
def calculate_beta_covariance(df, period_months):
    """
    Calculate the average absolute covariance between companies' rolling betas.

    Steps: per-ticker returns from ``MarketCap``; a value-weighted market
    return per date; a rolling beta (covariance with the market return over
    the market return's variance) per ticker; then the covariance matrix of
    the tickers' beta series, averaged in absolute value off the diagonal.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``MarketCap`` columns.
            It is copied, not modified.
        period_months: Controls the rolling statistics: the window is
            ``period_months * 3`` rows and at least
            ``max(2, period_months - 1)`` observations are required.
            NOTE(review): the ranking code in this file passes 2 for its
            "6-month" metric and 16 for its "4-year" one, i.e. a number of
            quarters rather than months -- confirm the intended unit before
            relying on the window length.

    Returns:
        The average absolute off-diagonal covariance as a float, or 0 when
        no beta can be computed, too few observations remain, or any error
        occurs (the error is printed, not raised).
    """
    try:
        df = df.copy()

        # Ensure data is sorted by date
        df = df.sort_values(['Date', 'Ticker'])

        # Calculate returns for each company
        df['Returns'] = df.groupby('Ticker')['MarketCap'].pct_change()

        # Value-weighted market return: each company's return weighted by its
        # share of the total market cap on that date
        df['Market_Value'] = df.groupby('Date')['MarketCap'].transform('sum')
        df['Market_Weight'] = df['MarketCap'] / df['Market_Value']
        df['Market_Returns'] = df.groupby('Date')['Returns'].transform(
            lambda x: (x * df.loc[x.index, 'Market_Weight']).sum()
        )

        min_periods = max(2, period_months - 1)  # At least 2 observations for a covariance
        rolling_window = period_months * 3  # Window length in rows (see NOTE in the docstring)

        betas_by_date = []

        for ticker in df['Ticker'].unique():
            ticker_data = df[df['Ticker'] == ticker].copy()

            if len(ticker_data) >= min_periods:
                rolling_cov = (
                    ticker_data['Returns']
                    .rolling(window=rolling_window, min_periods=min_periods)
                    .cov(ticker_data['Market_Returns'])
                )

                rolling_market_var = (
                    ticker_data['Market_Returns']
                    .rolling(window=rolling_window, min_periods=min_periods)
                    .var()
                )

                # Zero variance becomes NaN so the division cannot blow up
                ticker_data['Beta'] = rolling_cov / rolling_market_var.replace(0, np.nan)

                betas_by_date.append(ticker_data[['Date', 'Ticker', 'Beta']].dropna())

        if not betas_by_date:
            return 0

        all_betas = pd.concat(betas_by_date)

        # Pivot to a (companies x dates) matrix of betas
        beta_matrix = all_betas.pivot_table(
            index='Ticker',
            columns='Date',
            values='Beta',
            aggfunc='first'
        )

        # Keep only companies with a beta on at least 50% of the dates
        min_observations = beta_matrix.shape[1] * 0.5
        beta_matrix = beta_matrix[beta_matrix.count(axis=1) >= min_observations]

        if beta_matrix.empty:
            return 0

        # Fill the remaining gaps along each company's timeline. ffill/bfill
        # replace fillna(method=...), which newer pandas no longer accepts.
        beta_matrix = beta_matrix.ffill(axis=1).bfill(axis=1)

        # Transposed, the columns are tickers, so this is the covariance
        # between companies' beta series (a ticker x ticker matrix)
        cov_matrix = beta_matrix.T.cov()

        # Average absolute covariance, excluding the diagonal
        mask = ~np.eye(cov_matrix.shape[0], dtype=bool)
        avg_cov = np.abs(cov_matrix.where(mask)).mean().mean()

        return float(avg_cov) if not np.isnan(avg_cov) else 0

    except Exception as e:
        # Best effort: a failing sector contributes 0 instead of aborting the run
        print(f"Error in beta covariance calculation: {e}")
        return 0
    

## Stage 4: Sector Rankings and Visualization

### Key Functions

#### `calculate_sector_rankings(merged_sectors, output_file)`
- **Purpose**: Generates comprehensive sector rankings
- **Input**: Dictionary of merged sector data
- **Output**: DataFrame with sector rankings
- **Metrics**:
  - Market Cap Growth Score
  - Revenue Growth Score
  - Weighted-Simple Variance
  - 6-month Beta Covariance
  - 4-year Beta Covariance

In [10]:
def calculate_sector_rankings(merged_sectors, output_file='sector_rankings.csv'):
    """Calculate and rank sectors based on the five specified parameters.

    For every sector the following are computed:
      1. mean market-cap growth indicator (YoY change over 4 rows per ticker),
      2. mean revenue growth indicator,
      3. absolute gap between the cap-weighted and the simple average of the
         market-cap YoY change,
      4. beta covariance from ``calculate_beta_covariance(df, 2)``,
      5. beta covariance from ``calculate_beta_covariance(df, 16)``.

    Args:
        merged_sectors: Dict mapping sector name to a DataFrame with ``Date``,
            ``Ticker``, ``MarketCap`` and ``Revenue_YoY_Growth`` columns.
        output_file: Path of the CSV the rankings are written to.

    Returns:
        DataFrame with one row per successfully processed sector. A sector
        that raises is reported on stdout and skipped. The columns are
        present even when no sector could be processed.
    """
    ranking_columns = [
        'Sector', 'MktCap_Growth_Score', 'Revenue_Growth_Score',
        'Weighted_Simple_Variance', 'Beta_6M_Covariance', 'Beta_4Y_Covariance',
        'Number_of_Companies', 'Date_Range',
    ]
    rankings = []

    for sector, df in merged_sectors.items():
        print(f"Processing sector: {sector}")
        try:
            # sort_values returns a new frame, so the caller's data is untouched
            df = df.sort_values('Date')

            # 1. Market Cap YoY Growth Indicator
            df['MktCap_YoY_Change'] = df.groupby('Ticker')['MarketCap'].pct_change(periods=4) * 100
            mkt_cap_mean = df['MktCap_YoY_Change'].mean()
            mkt_cap_std = df['MktCap_YoY_Change'].std()
            df['MktCap_Growth_Indicator'] = df['MktCap_YoY_Change'].apply(
                lambda x: calculate_growth_indicator(x, mkt_cap_mean, mkt_cap_std)
            )

            # 2. Revenue YoY Growth Indicator
            revenue_mean = df['Revenue_YoY_Growth'].mean()
            revenue_std = df['Revenue_YoY_Growth'].std()
            df['Revenue_Growth_Indicator'] = df['Revenue_YoY_Growth'].apply(
                lambda x: calculate_growth_indicator(x, revenue_mean, revenue_std)
            )

            # 3. Gap between the cap-weighted and the simple average
            df['Weighted_MktCap_Change'] = (
                df['MktCap_YoY_Change'] *
                df['MarketCap'] /
                df.groupby('Date')['MarketCap'].transform('sum')
            )

            # min_count=1 leaves dates without any YoY value (e.g. the first
            # four quarters) as NaN instead of 0, so they do not drag the
            # weighted average towards zero; the simple mean skips them too.
            weighted_avg = df.groupby('Date')['Weighted_MktCap_Change'].sum(min_count=1).mean()
            simple_avg = df['MktCap_YoY_Change'].mean()
            variance_avg = abs(weighted_avg - simple_avg)

            # 4 & 5. Beta covariances
            print(f"Calculating 6-month beta covariance for {sector}")
            beta_6m_cov = calculate_beta_covariance(df, 2)

            print(f"Calculating 4-year beta covariance for {sector}")
            beta_4y_cov = calculate_beta_covariance(df, 16)

            rankings.append({
                'Sector': sector,
                'MktCap_Growth_Score': df['MktCap_Growth_Indicator'].mean(),
                'Revenue_Growth_Score': df['Revenue_Growth_Indicator'].mean(),
                'Weighted_Simple_Variance': variance_avg,
                'Beta_6M_Covariance': beta_6m_cov,
                'Beta_4Y_Covariance': beta_4y_cov,
                'Number_of_Companies': len(df['Ticker'].unique()),
                'Date_Range': f"{df['Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')}"
            })

            print(f"Successfully processed {sector}")

        except Exception as e:
            # Best effort: one bad sector must not abort the whole ranking
            print(f"Error processing sector {sector}: {e}")

    # Explicit columns keep the header intact when nothing was processed
    rankings_df = pd.DataFrame(rankings, columns=ranking_columns)
    rankings_df.to_csv(output_file, index=False)
    return rankings_df

#### `plot_covariance_heatmap(cov_matrix, title)`
- **Purpose**: Visualizes sector covariances
- **Input**: 
  - Covariance matrix
  - Plot title
- **Output**: Heatmap visualization
- **Features**: Uses seaborn for enhanced visualization



In [11]:
def plot_covariance_heatmap(cov_matrix, title='Sector Covariance Heatmap'):
    """
    Draw an annotated seaborn heatmap of ``cov_matrix`` and show it.

    Cell values are printed with two decimals on a "coolwarm" colour map;
    ``title`` is used as the figure title.
    """
    heatmap_style = {
        'annot': True,
        'cmap': "coolwarm",
        'fmt': ".2f",
        'cbar_kws': {'label': 'Covariance'},
    }

    plt.figure(figsize=(12, 8))
    sns.heatmap(cov_matrix, **heatmap_style)
    plt.title(title)
    # Slanted column labels, upright row labels
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

EXECUTION


In [None]:

def main():
    """Merge the sector data, compute the rankings and print them per metric.

    For each of the five ranking metrics the sectors are listed from the
    highest to the lowest value, together with their company count and the
    date range covered.
    """
    merged_sectors = merge_sector_data("sector_mkt_cap_results", "sector_revenue_results")
    rankings_df = calculate_sector_rankings(merged_sectors)

    parameters = (
        'MktCap_Growth_Score',
        'Revenue_Growth_Score',
        'Weighted_Simple_Variance',
        'Beta_6M_Covariance',
        'Beta_4Y_Covariance',
    )

    for param in parameters:
        print(f"\nRanking of sectors based on {param}:")
        ordered = rankings_df.sort_values(param, ascending=False)
        report_rows = zip(
            ordered['Sector'],
            ordered['Number_of_Companies'],
            ordered[param],
            ordered['Date_Range'],
        )
        for sector_name, company_count, score, date_range in report_rows:
            print(f"{sector_name} ({company_count} companies): {score:.4f}")
            print(f"Date Range: {date_range}")


if __name__ == "__main__":
    main()

## Stage 5: Machine Learning Models

#### Performing Hierarchical Clustering

In [None]:
# Step 1: Load the sector rankings produced by the previous stage
df = pd.read_csv('sector_rankings.csv')

# Step 2: Standardise every numeric column on its own ('Sector' is only a label)
scaler = StandardScaler()
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])
X = df.drop('Sector', axis=1)
print(df.shape)
X_scaled = pd.DataFrame(
    {
        name: scaler.fit_transform(X[name].values.reshape(-1, 1)).flatten()
        for name in X.columns
    },
    index=X.index,
)

# Step 3: Complete-linkage hierarchical clustering on the scaled features
Z = linkage(X_scaled, method='complete')

# Step 4: Draw the dendrogram, one leaf per sector
plt.figure(figsize=(12, 20))
dendrogram(Z, labels=df['Sector'].values, leaf_rotation=90, leaf_font_size=10)
plt.title('Hierarchical Clustering of Sectors')
plt.xlabel('Sectors')
plt.ylabel('Euclidean Distance')
plt.tight_layout()
plt.show()


#### Seeded K-means: the clusters are initialized with one seed sector per cluster (the sector with the minimum distance in that cluster), and iterating from these seeds gives the expected cluster label for each sector

In [None]:
warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Separate 'Sector' column for labeling
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Define seed sectors and calculate initial centroids
seed_sectors = {
    0: "Consumer Staples Merchandise Retail",
    1: "Health Care Services",
    2: "Insurance Brokers",
    3: "Movies & Entertainment",
    4: "Broadcasting"
}

# Each seed sector's row becomes the starting centroid of its cluster
# (raises ValueError if a seed sector is missing from the rankings file)
sector_list = list(sectors)
seed_indices = [sector_list.index(seed) for seed in seed_sectors.values()]
initial_centroids = X_scaled[seed_indices]

# Step 3: Iterate assignment/update steps until the centroids stop moving
num_clusters = len(seed_sectors)
centroids = initial_centroids
tolerance = 1e-4  # Convergence threshold
max_iterations = 100  # Safety limit on iterations

for iteration in range(max_iterations):
    # Step 3a: Assign each point to the nearest centroid
    labels, _ = pairwise_distances_argmin_min(X_scaled, centroids)

    # Step 3b: New centroid = mean of the cluster's points. A cluster that
    # lost all its points keeps its previous centroid; averaging an empty
    # selection would give NaN and break the next distance computation.
    new_centroids = np.array([
        X_scaled[labels == k].mean(axis=0) if np.any(labels == k) else centroids[k]
        for k in range(num_clusters)
    ])

    # Step 3c: Check for convergence (largest centroid displacement)
    centroid_shift = np.linalg.norm(new_centroids - centroids, axis=1).max()
    print(f"Iteration {iteration + 1}, centroid shift: {centroid_shift:.6f}")

    if centroid_shift < tolerance:
        print("Convergence reached.")
        break

    centroids = new_centroids

# Final labels: re-assign against the final centroids so that labels and
# plotted centroids agree even when the iteration limit was hit
final_labels, _ = pairwise_distances_argmin_min(X_scaled, centroids)

# Step 4: Reduce dimensions for plotting using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
centroids_pca = pca.transform(centroids)

# Step 5: Plot the final clusters
plt.figure(figsize=(10, 7))
for cluster in range(num_clusters):
    cluster_points = X_pca[final_labels == cluster]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster}")

# Plot centroids
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], s=200, c='black', marker='X', label='Centroids')

# Add labels and title
plt.title("Final Clusters after KMeans Convergence")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.grid(True)
plt.show()


#### Model Architecture
```python
Sequential([
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(num_clusters, activation='softmax')
])
```

In [None]:
# Step 1: Load the sector rankings and drop the non-feature columns
df = pd.read_csv('sector_rankings.csv').drop(columns=['Number_of_Companies', 'Date_Range'])

# Sector names are kept aside as labels; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Generate cluster labels with KMeans, seeded with one sector per cluster
num_clusters = 5
# Alternative seed set tried earlier: "Internet Services & Infrastructure",
# "Oil & Gas Exploration & Production", "Interactive Media & Services",
# "Broadcasting", "Rail Transportation"
seed_sectors = {
    0: "Consumer Staples Merchandise Retail",
    1: "Health Care Services",
    2: "Insurance Brokers",
    3: "Movies & Entertainment",
    4: "Broadcasting"
}

# The seed sectors' rows are the initial centroids
sector_names = list(sectors)
seed_indices = [sector_names.index(seed_name) for seed_name in seed_sectors.values()]
initial_centroids = X_scaled[seed_indices]

# A single KMeans run from the given centroids
kmeans = KMeans(n_clusters=num_clusters, init=initial_centroids, n_init=1)
kmeans.fit(X_scaled)
labels = kmeans.labels_  # Cluster ids are the training targets

# Step 3: Prepare data for the neural network
# One-hot encode the cluster ids
y = to_categorical(labels, num_clusters)

# Hold out 40% of the sectors for testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.4, random_state=42)

# Step 4: Build the neural network model (128-64-32 ReLU layers, softmax output)
model = Sequential()
model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_clusters, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 5: Train the model, validating on 30% of the training split
history = model.fit(X_train, y_train, epochs=100, batch_size=8, validation_split=0.3, verbose=1)

# Step 6: Evaluate the model on the held-out sectors
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Predicted cluster = class with the highest softmax probability
predictions = model.predict(X_test)
predicted_clusters = np.argmax(predictions, axis=1)

# Print true vs predicted cluster for every test sample
true_clusters = np.argmax(y_test, axis=1)
for i, (true_cluster, predicted_cluster) in enumerate(zip(true_clusters, predicted_clusters)):
    print(f"{i}.True cluster: {true_cluster}, Predicted cluster: {predicted_cluster}")


## Save the model

In [16]:
# Write the trained classifier to a .keras file in the working directory.
model.save("sector_classification_model.keras")