In [None]:
# Load one sector's quarter-end market-cap file and display it for inspection.
d = pd.read_csv(
    'sector_mkt_cap_results/Aerospace & Defense_mkt_cap_quarter_end.csv'
)
d

In [None]:

def calculate_yoy_growth(df):
    """Add a ``YoY_Growth`` column (percent) and drop rows where it is undefined.

    Growth is the percentage change of ``MarketCap`` against the value four
    rows earlier. When the frame has a ``Ticker`` column the change is computed
    within each ticker, so a company's first four rows are never compared
    against another company's rows.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.
            It is modified in place.

    Returns:
        The same DataFrame, with ``YoY_Growth`` added and NaN-growth rows removed.
    """
    # NOTE(review): periods=4 assumes each ticker's rows are consecutive
    # quarters in chronological order -- confirm against the CSV layout.
    if 'Ticker' in df.columns:
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']
    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100
    df.dropna(subset=['YoY_Growth'], inplace=True)  # first four rows per series have no base value
    return df

def apply_log10_transformation(df):
    """Add a ``Log_YoY_Growth`` column holding log10(YoY_Growth + 100).

    ``YoY_Growth`` is offset by 100 before the logarithm is taken, so any
    growth above -100 has a finite logarithm. The frame is modified in place
    and returned.
    """
    shifted_growth = df['YoY_Growth'] + 100
    df['Log_YoY_Growth'] = np.log10(shifted_growth)
    return df

def plot_yoy_growth_with_log(df, sector):
    """Draw two side-by-side line charts for a sector, one line per ticker.

    The left panel shows ``YoY_Growth`` and the right panel shows
    ``Log_YoY_Growth``, both plotted against ``Date``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 6))
    tickers = df['Ticker'].unique()

    # (axis, column to plot, panel title, y-axis label)
    panels = (
        (axs[0], 'YoY_Growth', f'Percentage YoY Growth for {sector} Sector', 'YoY Growth (%)'),
        (axs[1], 'Log_YoY_Growth', f'Log10 YoY Growth for {sector} Sector', 'Log10 YoY Growth'),
    )

    for ax, column, title, y_label in panels:
        for ticker in tickers:
            rows = df[df['Ticker'] == ticker]
            ax.plot(rows['Date'], rows[column], label=ticker)

        ax.set_title(title)
        ax.set_xlabel('Date')
        ax.set_ylabel(y_label)
        ax.legend(loc='best')
        ax.grid(True)

    plt.tight_layout()  # keep the two panels from overlapping
    plt.show()

def main():
    """Plot raw and log10 YoY growth for every CSV in ``sector_mkt_cap_results``."""
    input_dir = "sector_mkt_cap_results"

    for sector_file in os.listdir(input_dir):
        if not sector_file.endswith(".csv"):
            continue

        # The sector name is the file name without its fixed suffix.
        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])

        # Derive the growth columns, then chart them.
        df = apply_log10_transformation(calculate_yoy_growth(df))
        plot_yoy_growth_with_log(df, sector)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

def calculate_yoy_growth(df):
    """Add a ``YoY_Growth`` column (percent) and drop rows where it is undefined.

    Growth is the percentage change of ``MarketCap`` against the value four
    rows earlier. When the frame has a ``Ticker`` column the change is computed
    within each ticker, so a company's first four rows are never compared
    against another company's rows.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.
            It is modified in place.

    Returns:
        The same DataFrame, with ``YoY_Growth`` added and NaN-growth rows removed.
    """
    # NOTE(review): periods=4 assumes each ticker's rows are consecutive
    # quarters in chronological order -- confirm against the CSV layout.
    if 'Ticker' in df.columns:
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']
    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100
    df.dropna(subset=['YoY_Growth'], inplace=True)  # first four rows per series have no base value
    return df

def apply_log10_transformation(df):
    """Store log10(YoY_Growth + 100) in a new ``Log_YoY_Growth`` column.

    The +100 offset is applied to the percentage growth before taking the
    logarithm. The input frame is updated in place and also returned.
    """
    offset_growth = df['YoY_Growth'].add(100)
    df['Log_YoY_Growth'] = np.log10(offset_growth)
    return df

def calculate_sector_leader_and_rank(df, sector):
    """Rank a sector's companies by how often they beat the sector average.

    For each date the mean ``YoY_Growth`` over all rows is the sector average.
    A company scores one point for every row whose growth is strictly above
    that date's average. The leader and the full ranking are printed.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sector: Sector name, used only in the printed messages.

    Returns:
        ``(sector_leader, sorted_companies)`` where ``sorted_companies`` is a
        list of ``(ticker, count)`` pairs in descending count order (ties keep
        first-appearance order). For an empty frame returns ``(None, [])``.
    """
    if df.empty:
        # Nothing to rank; indexing the first entry below would raise IndexError.
        print(f"No YoY growth data available for {sector} sector.")
        return None, []

    # Sector average per date, aligned back onto every row.
    sector_avg = df.groupby('Date')['YoY_Growth'].mean()
    beats_average = df['YoY_Growth'] > df['Date'].map(sector_avg)

    overperformance_counts = {
        ticker: beats_average[df['Ticker'] == ticker].sum()
        for ticker in df['Ticker'].unique()
    }

    sorted_companies = sorted(overperformance_counts.items(), key=lambda x: x[1], reverse=True)
    sector_leader = sorted_companies[0][0]  # highest overperformance count

    print(f"Leader for {sector} sector: {sector_leader}")
    print(f"Descending order of companies by overperformance in {sector} sector:")
    for company, count in sorted_companies:
        print(f"{company}: {count} times overperformed")
    print("\n" + "="*50 + "\n")

    return sector_leader, sorted_companies

def plot_yoy_growth_with_log(df, sector):
    """Show percentage and log10 YoY growth for a sector in two adjacent panels.

    Each ticker in ``df`` gets one line per panel, plotted against ``Date``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 6))
    ticker_list = df['Ticker'].unique()

    def draw_panel(ax, column, title, y_label):
        # One line per ticker, followed by the shared axis decoration.
        for ticker in ticker_list:
            subset = df[df['Ticker'] == ticker]
            ax.plot(subset['Date'], subset[column], label=ticker)
        ax.set_title(title)
        ax.set_xlabel('Date')
        ax.set_ylabel(y_label)
        ax.legend(loc='best')
        ax.grid(True)

    draw_panel(axs[0], 'YoY_Growth',
               f'Percentage YoY Growth for {sector} Sector', 'YoY Growth (%)')
    draw_panel(axs[1], 'Log_YoY_Growth',
               f'Log10 YoY Growth for {sector} Sector', 'Log10 YoY Growth')

    plt.tight_layout()  # avoid overlap between the panels
    plt.show()

def main():
    """Rank companies and plot YoY growth for every CSV in ``sector_mkt_cap_results``."""
    input_dir = "sector_mkt_cap_results"

    for sector_file in os.listdir(input_dir):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])

        # Growth columns first; ranking and plotting both read them.
        df = apply_log10_transformation(calculate_yoy_growth(df))

        # The ranking is printed by the callee; the returned values are not used further here.
        sector_leader, sorted_companies = calculate_sector_leader_and_rank(df, sector)

        plot_yoy_growth_with_log(df, sector)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

def calculate_yoy_growth(df):
    """Add a ``YoY_Growth`` column (percent) and drop rows where it is undefined.

    Growth is the percentage change of ``MarketCap`` against the value four
    rows earlier. When the frame has a ``Ticker`` column the change is computed
    within each ticker, so a company's first four rows are never compared
    against another company's rows.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.
            It is modified in place.

    Returns:
        The same DataFrame, with ``YoY_Growth`` added and NaN-growth rows removed.
    """
    # NOTE(review): periods=4 assumes each ticker's rows are consecutive
    # quarters in chronological order -- confirm against the CSV layout.
    if 'Ticker' in df.columns:
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']
    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100
    df.dropna(subset=['YoY_Growth'], inplace=True)  # first four rows per series have no base value
    return df

def calculate_sector_leader_and_rank(df):
    """Return ``(ticker, count)`` pairs ordered by how often each ticker beat the sector mean.

    The sector mean is the per-date average of ``YoY_Growth``; a ticker's count
    is the number of its rows strictly above that mean. The list is in
    descending count order.
    """
    mean_by_date = df.groupby('Date')['YoY_Growth'].mean()

    # Per-row flag: did this row's growth exceed its date's sector mean?
    above_mean = df['YoY_Growth'] > df['Date'].map(mean_by_date)

    counts = {
        ticker: above_mean[df['Ticker'] == ticker].sum()
        for ticker in df['Ticker'].unique()
    }

    ranking = list(counts.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

def calculate_weighted_average_growth(df, sorted_companies):
    """Compute a rank-weighted average of YoY growth per date.

    The company ranked first in ``sorted_companies`` gets weight ``n``, the
    next ``n - 1``, down to 1 for the last. For every date the weighted growths
    of the companies present are summed and divided by the total weight
    ``n * (n + 1) / 2`` (companies missing on a date contribute nothing).

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sorted_companies: Sequence of ``(ticker, count)`` pairs, best first.

    Returns:
        DataFrame sorted by ``Date`` with ``Date``,
        ``Final_Weighted_YoY_Growth`` and one ``Weighted_YoY_Growth_<ticker>``
        column per weighted company. When there is nothing to weight, an empty
        frame with the first two columns is returned.
    """
    ranked = [company for company, _ in sorted_companies]
    n_companies = len(ranked)
    total_weight = n_companies * (n_companies + 1) // 2  # n + (n - 1) + ... + 1
    company_weights = dict(zip(ranked, range(n_companies, 0, -1)))

    print(f"Company Weights: {company_weights}")  # Debugging: Print weights

    for ticker in df['Ticker'].unique():
        if ticker not in company_weights:
            print(f"Warning: No weight found for {ticker}, skipping.")

    weighted = df.loc[
        df['Ticker'].isin(list(company_weights)),
        ['Date', 'Ticker', 'YoY_Growth'],
    ].copy()
    if weighted.empty:
        # Previously this path failed with a KeyError on 'Date'.
        return pd.DataFrame(columns=['Date', 'Final_Weighted_YoY_Growth'])

    weighted['Weighted_YoY_Growth'] = (
        weighted['YoY_Growth'] * weighted['Ticker'].map(company_weights)
    )

    # One column per company, one row per date; replaces the chain of outer merges.
    per_company = weighted.pivot_table(
        index='Date', columns='Ticker', values='Weighted_YoY_Growth', aggfunc='sum'
    )
    per_company.columns = [f'Weighted_YoY_Growth_{ticker}' for ticker in per_company.columns]

    # sum(axis=1) skips missing companies; normalise by the full weight total.
    final_growth = per_company.sum(axis=1) / total_weight
    per_company.insert(0, 'Final_Weighted_YoY_Growth', final_growth)
    weighted_yoy_growth = per_company.reset_index()

    # Debugging: print the first few rows to ensure data exists
    print("First few rows of weighted YoY growth (after calculation):")
    print(weighted_yoy_growth.head())

    return weighted_yoy_growth



def plot_weighted_index(weighted_yoy_growth, sector, sorted_companies):
    """Plot a sector's weighted YoY growth index, plus per-company series when available.

    A company line is drawn only if ``weighted_yoy_growth`` contains a column
    named ``Weighted_YoY_Growth_<ticker>`` for it; companies without such a
    column are silently skipped. Nothing is drawn for an empty frame.
    """
    if weighted_yoy_growth.empty:
        print(f"No data to plot for {sector}.")
        return

    dates = weighted_yoy_growth['Date']
    available_columns = weighted_yoy_growth.columns

    plt.figure(figsize=(12, 8))

    # Sector-level index as the solid reference line.
    plt.plot(
        dates,
        weighted_yoy_growth['Final_Weighted_YoY_Growth'],
        label=f'{sector} Sector Index',
        color='blue',
        linewidth=2,
    )

    # Dashed line for each company that has its own column.
    for company, _ in sorted_companies:
        column_name = f'Weighted_YoY_Growth_{company}'
        if column_name not in available_columns:
            continue
        plt.plot(
            dates,
            weighted_yoy_growth[column_name],
            label=f'{company} YoY Growth',
            linestyle='--',
            alpha=0.8,
        )

    plt.title(f'Weighted YoY Growth Index and Company Growth for {sector} Sector')
    plt.xlabel('Date')
    plt.ylabel('YoY Growth (%)')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.show()



def main():
    """Build and plot a rank-weighted YoY growth index for every sector CSV."""
    input_dir = "sector_mkt_cap_results"

    for sector_file in os.listdir(input_dir):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])
        df = calculate_yoy_growth(df)

        # The ranking drives the weights; the weights drive the index that is plotted.
        sorted_companies = calculate_sector_leader_and_rank(df)
        weighted_yoy_growth = calculate_weighted_average_growth(df, sorted_companies)
        plot_weighted_index(weighted_yoy_growth, sector, sorted_companies)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

def calculate_yoy_growth(df):
    """Add a ``YoY_Growth`` column (percent) and drop rows where it is undefined.

    Growth is the percentage change of ``MarketCap`` against the value four
    rows earlier. When the frame has a ``Ticker`` column the change is computed
    within each ticker, so a company's first four rows are never compared
    against another company's rows.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.
            It is modified in place.

    Returns:
        The same DataFrame, with ``YoY_Growth`` added and NaN-growth rows removed.
    """
    # NOTE(review): periods=4 assumes each ticker's rows are consecutive
    # quarters in chronological order -- confirm against the CSV layout.
    if 'Ticker' in df.columns:
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']
    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100
    df.dropna(subset=['YoY_Growth'], inplace=True)  # first four rows per series have no base value
    return df

def apply_log10_transformation(df):
    """Add ``Log_YoY_Growth`` = log10(``YoY_Growth`` + 100) to ``df`` and return it.

    The percentage growth is offset by 100 before the logarithm is taken.
    The frame is modified in place.
    """
    growth_plus_100 = 100 + df['YoY_Growth']
    df['Log_YoY_Growth'] = np.log10(growth_plus_100)
    return df

def calculate_sector_leader_and_rank(df, sector):
    """Rank a sector's companies by how often they beat the sector average.

    For each date the mean ``YoY_Growth`` over all rows is the sector average.
    A company scores one point for every row whose growth is strictly above
    that date's average. The leader and the ranking are printed.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sector: Sector name, used only in the printed messages.

    Returns:
        ``(sector_leader, sorted_companies, total_overperformance)`` where
        ``sorted_companies`` is a list of ``(ticker, count)`` pairs in
        descending count order and ``total_overperformance`` is the sum of the
        counts. For an empty frame returns ``(None, [], 0)``.
    """
    if df.empty:
        # Nothing to rank; indexing the first entry below would raise IndexError.
        print(f"No YoY growth data available for {sector} sector.")
        return None, [], 0

    # Sector average per date, aligned back onto every row.
    sector_avg = df.groupby('Date')['YoY_Growth'].mean()
    beats_average = df['YoY_Growth'] > df['Date'].map(sector_avg)

    overperformance_counts = {
        ticker: beats_average[df['Ticker'] == ticker].sum()
        for ticker in df['Ticker'].unique()
    }

    sorted_companies = sorted(overperformance_counts.items(), key=lambda x: x[1], reverse=True)
    sector_leader = sorted_companies[0][0]
    total_overperformance = sum(count for _, count in sorted_companies)

    # Print sector leaders and overperformance counts
    print(f"Leader for {sector} sector: {sector_leader}")
    print(f"Descending order of companies by overperformance in {sector} sector:")
    for company, count in sorted_companies:
        print(f"{company}: {count} times overperformed")

    return sector_leader, sorted_companies, total_overperformance

def calculate_sector_index(df, sorted_companies, total_overperformance):
    """Compute a sector index as a weighted sum of company YoY growth per date.

    Each company's weight is ``count / total_overperformance``. If the total
    is zero (no company ever beat the average) every company gets an equal
    weight instead, so the index does not collapse to zero.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sorted_companies: Sequence of ``(ticker, count)`` pairs.
        total_overperformance: Sum of the counts, used to normalise weights.

    Returns:
        DataFrame with ``Date`` and ``Sector_Index`` columns, sorted by date.
        Empty (with those columns) when there is no data or no companies.
    """
    counts = dict(sorted_companies)
    if df.empty or not counts:
        # Previously this path failed with a KeyError on 'Date'.
        return pd.DataFrame(columns=['Date', 'Sector_Index'])

    if total_overperformance:
        fractional_contribution = {
            company: count / total_overperformance for company, count in counts.items()
        }
    else:
        # Zero total would give NaN weights; fall back to equal weighting.
        fractional_contribution = {company: 1 / len(counts) for company in counts}

    # Tickers without a weight are left out, as before.
    rows = df.loc[
        df['Ticker'].isin(list(fractional_contribution)),
        ['Date', 'Ticker', 'YoY_Growth'],
    ]
    weighted_growth = rows['YoY_Growth'] * rows['Ticker'].map(fractional_contribution)

    # Sum across all companies for each date
    sector_index = (
        weighted_growth.groupby(rows['Date']).sum().rename('Sector_Index').reset_index()
    )

    return sector_index[['Date', 'Sector_Index']]

def plot_sector_index(sector_index, sector):
    """Draw the ``Sector_Index`` series against ``Date`` for one sector.

    Prints a message and draws nothing when ``sector_index`` is empty.
    """
    if sector_index.empty:
        print(f"No data to plot for {sector}.")
        return

    dates = sector_index['Date']
    index_values = sector_index['Sector_Index']

    plt.figure(figsize=(10, 6))
    plt.plot(
        dates,
        index_values,
        label=f'{sector} Sector Index',
        color='blue',
        linewidth=2,
    )
    plt.title(f'Sector Index for {sector} Sector')
    plt.xlabel('Date')
    plt.ylabel('Index Value')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def main():
    """Compute and plot an overperformance-weighted index for every sector CSV."""
    input_dir = "sector_mkt_cap_results"

    for sector_file in os.listdir(input_dir):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])

        # Growth columns (percentage and log10).
        df = apply_log10_transformation(calculate_yoy_growth(df))

        # Ranking and its total feed the fractional weights of the index.
        ranking_result = calculate_sector_leader_and_rank(df, sector)
        sector_leader, sorted_companies, total_overperformance = ranking_result

        sector_index = calculate_sector_index(df, sorted_companies, total_overperformance)
        plot_sector_index(sector_index, sector)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

def calculate_yoy_growth(df):
    """Add a ``YoY_Growth`` column (percent) and drop rows where it is undefined.

    Growth is the percentage change of ``MarketCap`` against the value four
    rows earlier. When the frame has a ``Ticker`` column the change is computed
    within each ticker, so a company's first four rows are never compared
    against another company's rows.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.
            It is modified in place.

    Returns:
        The same DataFrame, with ``YoY_Growth`` added and NaN-growth rows removed.
    """
    # NOTE(review): periods=4 assumes each ticker's rows are consecutive
    # quarters in chronological order -- confirm against the CSV layout.
    if 'Ticker' in df.columns:
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']
    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100
    df.dropna(subset=['YoY_Growth'], inplace=True)  # first four rows per series have no base value
    return df

def apply_log10_transformation(df):
    """Write log10(``YoY_Growth`` + 100) into a ``Log_YoY_Growth`` column.

    The offset of 100 is added to the percentage growth before the logarithm.
    ``df`` is changed in place and returned for convenience.
    """
    log_input = df['YoY_Growth'] + 100
    log_growth = np.log10(log_input)
    df['Log_YoY_Growth'] = log_growth
    return df

def calculate_sector_leader_and_rank(df, sector):
    """Rank a sector's companies by how often they beat the sector average.

    For each date the mean ``YoY_Growth`` over all rows is the sector average.
    A company scores one point for every row whose growth is strictly above
    that date's average. The leader and the ranking are printed.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sector: Sector name, used only in the printed messages.

    Returns:
        ``(sector_leader, sorted_companies, total_overperformance)`` where
        ``sorted_companies`` is a list of ``(ticker, count)`` pairs in
        descending count order and ``total_overperformance`` is the sum of the
        *squared* counts. For an empty frame returns ``(None, [], 0)``.
    """
    if df.empty:
        # Nothing to rank; indexing the first entry below would raise IndexError.
        print(f"No YoY growth data available for {sector} sector.")
        return None, [], 0

    # Sector average per date, aligned back onto every row.
    sector_avg = df.groupby('Date')['YoY_Growth'].mean()
    beats_average = df['YoY_Growth'] > df['Date'].map(sector_avg)

    overperformance_counts = {
        ticker: beats_average[df['Ticker'] == ticker].sum()
        for ticker in df['Ticker'].unique()
    }

    sorted_companies = sorted(overperformance_counts.items(), key=lambda x: x[1], reverse=True)
    sector_leader = sorted_companies[0][0]

    # Squared counts: this total normalises squared-count weights downstream.
    total_overperformance = sum(count**2 for _, count in sorted_companies)

    print(f"Leader for {sector} sector: {sector_leader}")
    print(f"Descending order of companies by overperformance in {sector} sector:")
    for company, count in sorted_companies:
        print(f"{company}: {count} times overperformed")

    return sector_leader, sorted_companies, total_overperformance

def calculate_sector_index(df, sorted_companies, total_overperformance):
    """Compute a sector index as a weighted sum of company YoY growth per date.

    Each company's weight is ``count**2 / total_overperformance``. If the
    total is zero (no company ever beat the average) every company gets an
    equal weight instead, so the index does not collapse to zero.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sorted_companies: Sequence of ``(ticker, count)`` pairs.
        total_overperformance: Normaliser for the squared counts.

    Returns:
        DataFrame with ``Date`` and ``Sector_Index`` columns, sorted by date.
        Empty (with those columns) when there is no data or no companies.
    """
    squared_counts = {company: count**2 for company, count in sorted_companies}
    if df.empty or not squared_counts:
        # Previously this path failed with a KeyError on 'Date'.
        return pd.DataFrame(columns=['Date', 'Sector_Index'])

    if total_overperformance:
        fractional_contribution = {
            company: squared / total_overperformance
            for company, squared in squared_counts.items()
        }
    else:
        # Zero total would give NaN weights; fall back to equal weighting.
        fractional_contribution = {
            company: 1 / len(squared_counts) for company in squared_counts
        }

    # Tickers without a weight are left out, as before.
    rows = df.loc[
        df['Ticker'].isin(list(fractional_contribution)),
        ['Date', 'Ticker', 'YoY_Growth'],
    ]
    weighted_growth = rows['YoY_Growth'] * rows['Ticker'].map(fractional_contribution)

    # Per-date sum over all companies.
    sector_index = (
        weighted_growth.groupby(rows['Date']).sum().rename('Sector_Index').reset_index()
    )

    return sector_index[['Date', 'Sector_Index']]

def calculate_simple_average_index(df):
    """Return the per-date mean of ``YoY_Growth`` as a ``Simple_Avg_Index`` column.

    The result has one row per distinct ``Date`` and the columns ``Date`` and
    ``Simple_Avg_Index``.
    """
    mean_by_date = df.groupby('Date')['YoY_Growth'].mean()
    return mean_by_date.rename('Simple_Avg_Index').reset_index()

def plot_sector_index(sector_index, simple_avg_index, df, sector):
    """Plot each ticker's YoY growth together with the weighted and simple-average indices.

    Ticker series are dashed; the weighted index is a solid blue line and the
    simple average a solid red line. Nothing is drawn when ``sector_index`` is
    empty.
    """
    if sector_index.empty:
        print(f"No data to plot for {sector}.")
        return

    plt.figure(figsize=(12, 8))

    # Individual companies first, so the two index lines are drawn on top.
    for ticker in df['Ticker'].unique():
        ticker_rows = df[df['Ticker'] == ticker]
        plt.plot(
            ticker_rows['Date'],
            ticker_rows['YoY_Growth'],
            label=f"{ticker} YoY Growth",
            linestyle='--',
        )

    # (frame, value column, legend label, colour) for the two index lines.
    index_lines = (
        (sector_index, 'Sector_Index', f'{sector} Weighted Sector Index', 'blue'),
        (simple_avg_index, 'Simple_Avg_Index', f'{sector} Simple Average Index', 'red'),
    )
    for frame, column, legend_label, colour in index_lines:
        plt.plot(frame['Date'], frame[column], label=legend_label, color=colour, linewidth=2)

    plt.title(f'Sector Index for {sector} Sector (Weighted vs Simple Average)')
    plt.xlabel('Date')
    plt.ylabel('YoY Growth / Index Value')
    plt.legend(loc='best')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def main():
    """Compute, save and plot weighted and simple-average indices for every sector CSV.

    Reads each ``*.csv`` in ``sector_mkt_cap_results``, writes
    ``<sector>_sector_index.csv`` into ``sector_wise_index`` and shows a plot
    per sector.
    """
    input_dir = "sector_mkt_cap_results"
    output_dir = "sector_wise_index"  # Directory to save sector indices

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for sector_file in os.listdir(input_dir):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        # Load the CSV file
        file_path = os.path.join(input_dir, sector_file)
        df = pd.read_csv(file_path, parse_dates=['Date'])

        # Calculate YoY growth and apply log10 transformation
        df = calculate_yoy_growth(df)
        df = apply_log10_transformation(df)

        # Ranking and normaliser used for the weighted index
        sector_leader, sorted_companies, total_overperformance = calculate_sector_leader_and_rank(df, sector)

        sector_weighted_index = calculate_sector_index(df, sorted_companies, total_overperformance)
        simple_avg_index = calculate_simple_average_index(df)

        # Both indices side by side, keyed on Date, for saving
        combined_index = pd.merge(sector_weighted_index, simple_avg_index, on='Date', how='outer')

        output_file_path = os.path.join(output_dir, f"{sector}_sector_index.csv")
        combined_index.to_csv(output_file_path, index=False)
        print(f"Saved sector index for {sector} to {output_file_path}")

        # Plot both indices alongside the individual stock series
        plot_sector_index(sector_weighted_index, simple_avg_index, df, sector)


if __name__ == "__main__":
    main()



In [None]:
import yfinance as yf
import pandas as pd

# Fetch data for a specific ticker
ticker = "AAPL"  # Example: Apple Inc.
stock = yf.Ticker(ticker)

# Quarterly financials, transposed so that line items such as
# 'Total Revenue' become columns.
quarterly_financials = stock.quarterly_financials.T

# Take an independent copy of the revenue column so later column assignments
# on revenue_data do not write into a slice of quarterly_financials.
revenue_data = quarterly_financials[['Total Revenue']].copy()

print(revenue_data)


In [None]:
import numpy as np

# Work on an independent copy: revenue_data may be a slice of a larger frame,
# and assigning columns to a slice triggers pandas' SettingWithCopyWarning.
revenue_data = revenue_data.copy()

# Convert revenue to numeric values; unparseable entries become NaN
revenue_data['Total Revenue'] = pd.to_numeric(revenue_data['Total Revenue'], errors='coerce')

# Sort by the index so shift(4) looks back four rows in order
# NOTE(review): assumes the index holds the reporting dates -- confirm.
revenue_data = revenue_data.sort_index()

# Revenue four rows (quarters) earlier is the base for YoY growth
prior_year_revenue = revenue_data['Total Revenue'].shift(4)
revenue_data['YoY Growth'] = (revenue_data['Total Revenue'] - prior_year_revenue) / prior_year_revenue * 100

# Drop rows with NaN values (including those without a four-row-earlier base)
revenue_data = revenue_data.dropna()

print(revenue_data)


In [None]:
import pandas as pd
import numpy as np
import os

def calculate_yoy_growth(df):
    """Add a ``YoY_Growth`` column (percent) and drop rows where it is undefined.

    Growth is the percentage change of ``MarketCap`` against the value four
    rows earlier. When the frame has a ``Ticker`` column the change is computed
    within each ticker, so a company's first four rows are never compared
    against another company's rows.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.
            It is modified in place.

    Returns:
        The same DataFrame, with ``YoY_Growth`` added and NaN-growth rows removed.
    """
    # NOTE(review): periods=4 assumes each ticker's rows are consecutive
    # quarters in chronological order -- confirm against the CSV layout.
    if 'Ticker' in df.columns:
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']
    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100
    df.dropna(subset=['YoY_Growth'], inplace=True)  # first four rows per series have no base value
    return df

def calculate_sector_index_variations(df, sorted_companies, total_overperformance):
    """Compare the weighted sector index with the simple average, date by date.

    Each company's weight is ``count**2 / total_overperformance``; if the
    total is zero, equal weights are used so the weighted index does not
    collapse to zero. The simple average is the per-date mean of
    ``YoY_Growth`` over all rows of ``df``.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sorted_companies: Sequence of ``(ticker, count)`` pairs.
        total_overperformance: Normaliser for the squared counts.

    Returns:
        DataFrame with ``Date``, ``Weighted_Index``, ``Simple_Avg_Index`` and
        ``Difference`` (absolute gap between the two indices). Empty (with
        those columns) when there is no data or no companies.
    """
    result_columns = ['Date', 'Weighted_Index', 'Simple_Avg_Index', 'Difference']

    squared_counts = {company: count**2 for company, count in sorted_companies}
    if df.empty or not squared_counts:
        # Previously this path failed with a KeyError on 'Date'.
        return pd.DataFrame(columns=result_columns)

    if total_overperformance:
        fractional_contribution = {
            company: squared / total_overperformance
            for company, squared in squared_counts.items()
        }
    else:
        # Zero total would give NaN weights; fall back to equal weighting.
        fractional_contribution = {
            company: 1 / len(squared_counts) for company in squared_counts
        }

    # Tickers without a weight are left out of the weighted index, as before.
    rows = df.loc[
        df['Ticker'].isin(list(fractional_contribution)),
        ['Date', 'Ticker', 'YoY_Growth'],
    ]
    weighted_growth = rows['YoY_Growth'] * rows['Ticker'].map(fractional_contribution)
    sector_index = (
        weighted_growth.groupby(rows['Date']).sum().rename('Weighted_Index').reset_index()
    )

    # Calculate simple average
    simple_avg_index = (
        df.groupby('Date')['YoY_Growth'].mean().rename('Simple_Avg_Index').reset_index()
    )
    sector_index = pd.merge(sector_index, simple_avg_index, on='Date', how='left')

    # Absolute difference between the weighted and simple averages
    sector_index['Difference'] = (
        sector_index['Weighted_Index'] - sector_index['Simple_Avg_Index']
    ).abs()

    return sector_index[result_columns]

def calculate_variance_and_positive_ratio(df):
    """Return the variance of ``YoY_Growth`` and its positive-to-negative count ratio.

    The ratio is the number of rows with growth above zero divided by the
    number with growth below zero; it is ``inf`` when no row is negative.
    """
    growth = df['YoY_Growth']
    n_positive = (growth > 0).sum()
    n_negative = (growth < 0).sum()

    # No negative rows: report an infinite ratio instead of dividing by zero.
    pos_to_neg_ratio = n_positive / n_negative if n_negative > 0 else float('inf')

    return growth.var(), pos_to_neg_ratio

def get_sector_rankings(input_dir, output_file='sector_rankings.csv'):
    """Compute per-sector summary metrics, save them to CSV and print rankings.

    For every ``*.csv`` in ``input_dir`` the rows dated 2019-01-01 (UTC) or
    later are kept, YoY growth is derived, and three metrics are computed:
    the mean absolute gap between the weighted and simple-average indices,
    the variance of YoY growth, and the positive-to-negative growth ratio.

    Args:
        input_dir: Directory containing the sector CSV files.
        output_file: Path of the CSV the metrics are written to.

    Returns:
        DataFrame with ``Sector``, ``Variation (Weighted vs Simple Avg)`` and
        ``Variance of YoY Growth`` columns (the same content that is saved).
    """
    rankings_variation = {}
    rankings_variance = {}
    rankings_positive_negative_ratio = {}

    for sector_file in os.listdir(input_dir):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        # Load the CSV file
        file_path = os.path.join(input_dir, sector_file)
        df = pd.read_csv(file_path, parse_dates=['Date'])

        # Convert 'Date' to UTC so it can be compared with a tz-aware timestamp
        df['Date'] = pd.to_datetime(df['Date'], utc=True)

        # Keep data from 2019 onwards; copy so later in-place edits do not
        # operate on a slice of the loaded frame.
        df = df[df['Date'] >= pd.Timestamp('2019-01-01', tz='UTC')].copy()

        # Calculate YoY growth
        df = calculate_yoy_growth(df)

        if df.empty:
            # Nothing left to rank; the steps below cannot handle empty data.
            print(f"No YoY growth data for {sector} sector, skipping.")
            continue

        # Calculate sector leader and fractional contributions
        sector_leader, sorted_companies, total_overperformance = calculate_sector_leader_and_rank(df, sector)

        # Calculate sector index variations
        sector_index_variations = calculate_sector_index_variations(df, sorted_companies, total_overperformance)

        # Calculate variance of YoY growth and positive/negative growth ratio
        variance, pos_to_neg_ratio = calculate_variance_and_positive_ratio(df)

        # Summarize the variation between weighted and simple averages
        avg_difference = sector_index_variations['Difference'].mean()

        # Save results for rankings
        rankings_variation[sector] = avg_difference
        rankings_variance[sector] = variance
        # Recorded for the printed ranking only; see the note on the CSV below.
        rankings_positive_negative_ratio[sector] = pos_to_neg_ratio

    # The ratio is kept out of the saved table, as in the original code.
    # NOTE(review): presumably because it can be inf -- confirm.
    rankings_df = pd.DataFrame({
        'Sector': list(rankings_variation.keys()),
        'Variation (Weighted vs Simple Avg)': list(rankings_variation.values()),
        'Variance of YoY Growth': list(rankings_variance.values()),
    })

    # Save the rankings DataFrame to CSV
    rankings_df.to_csv(output_file, index=False)
    print(f"\nRankings saved to {output_file}")

    print("\nRanking of sectors based on variation between simple and weighted averages:")
    for sector, value in sorted(rankings_variation.items(), key=lambda x: x[1], reverse=True):
        print(f"{sector}: {value}")

    print("\nRanking of sectors based on variance of YoY growth over time:")
    for sector, value in sorted(rankings_variance.items(), key=lambda x: x[1], reverse=True):
        print(f"{sector}: {value}")

    print("\nRanking of sectors based on ratio of positive to negative growth over time:")
    for sector, value in sorted(rankings_positive_negative_ratio.items(), key=lambda x: x[1], reverse=True):
        print(f"{sector}: {value}")

    return rankings_df

def main():
    """Run the sector ranking over the ``sector_mkt_cap_results`` directory."""
    get_sector_rankings("sector_mkt_cap_results")


if __name__ == "__main__":
    main()



In [None]:
# Load the saved sector rankings and display them.
rankings_df = pd.read_csv('sector_rankings.csv')
rankings_df

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Step 1: Load the per-sector metrics
df = pd.read_csv('sector_rankings.csv')

# Step 2: Standardise every numeric column on its own ('Sector' is the label column)
scaler = StandardScaler()
X = df.drop('Sector', axis=1)
X_scaled = pd.DataFrame(
    {
        name: scaler.fit_transform(X[name].values.reshape(-1, 1)).flatten()
        for name in X.columns
    },
    index=X.index,
)

# Step 3: Hierarchical clustering with Ward linkage
Z = linkage(X_scaled, method='ward')

# Step 4: Dendrogram labelled with the sector names
plt.figure(figsize=(12, 20))
dendrogram(
    Z,
    labels=df['Sector'].values,
    leaf_rotation=90,
    leaf_font_size=10,
)
plt.title('Hierarchical Clustering of Sectors')
plt.xlabel('Sectors')
plt.ylabel('Euclidean Distance')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Step 1: Load the per-sector metrics
df = pd.read_csv('sector_rankings.csv')

# Step 2: Standardise the numeric columns ('Sector' is the label column)
scaler = StandardScaler()
X = df.drop(columns='Sector')
X_scaled = scaler.fit_transform(X)

# Step 3: Fit K-means for k = 1..10 and record the inertia of each fit
k_values = range(1, 11)
inertia = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    # inertia_: sum of squared distances to the closest cluster center
    inertia.append(kmeans.inertia_)

# Step 4: Elbow plot of inertia against k
plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia, 'bo-', markersize=8)
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Read the CSV file
df = pd.read_csv('sector_rankings.csv')

# Step 2: Normalize the data (excluding the 'Sector' column)
scaler = StandardScaler()
X = df.drop('Sector', axis=1)  # Keep only numerical data for normalization
X_scaled = scaler.fit_transform(X)

# Step 3: Perform K-means clustering.
# `n_clusters` is the single source of truth for k: the model and the plot
# title both read it, so the figure can no longer claim a different k than
# the one that was actually fitted.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)  # Assign clusters to sectors

# Step 4: Show the resulting clusters
print(df[['Sector', 'Cluster']])

# Step 5: Visualize the clusters on the first two principal components
from sklearn.decomposition import PCA

pca = PCA(n_components=2)  # Reduce to 2D for visualization only
X_pca = pca.fit_transform(X_scaled)

# Scatter plot of the 2-D projection, coloured by assigned cluster
plt.figure(figsize=(10, 8))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['Cluster'], palette='Set1', s=100, legend='full')
plt.title(f'K-means Clustering of Sectors (k={n_clusters})')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the rankings; all columns other than 'Sector' are clustering features.
df = pd.read_csv('sector_rankings.csv')
X = df.drop(columns='Sector')

# Standardize the features so each contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Partition the sectors into 7 clusters and attach the label to each row.
kmeans = KMeans(n_clusters=7, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)
df['Cluster'] = cluster_labels

# Order the rows by cluster label and write them out.
df_sorted = df.sort_values('Cluster')
df_sorted.to_csv('sector_clusters_sorted.csv', index=False)

# Confirm where the result was written.
print("Cluster details saved to 'sector_clusters_sorted.csv'.")

In [None]:
def classify_growth(value):
    """Map a YoY growth percentage to a class label.

    Returns 1 when *value* is above 5, -1 when it is below -5, and 0 for
    everything else (including NaN, for which both comparisons are false).
    """
    if value > 5:
        return 1
    return -1 if value < -5 else 0

def calculate_yoy_growth(df, column):
    """Add YoY growth and its class label for *column*, dropping rows without growth.

    Adds ``{column}_YoY_Growth`` (percentage change against the row four
    periods earlier) and ``{column}_Growth_Class`` (via ``classify_growth``)
    to *df* in place, removes rows whose growth is NaN, and returns *df*.

    When *df* has a ``Ticker`` column the change is computed per ticker, so a
    company's value is never compared with a different company's row. Frames
    without a ``Ticker`` column are processed as a single series, as before.

    Rows are assumed to be in chronological order within each ticker, with one
    row per quarter; ``periods=4`` is only a year under that assumption.
    """
    growth_col = f'{column}_YoY_Growth'
    class_col = f'{column}_Growth_Class'

    # Sector files hold several tickers; an ungrouped pct_change would mix them.
    values = df.groupby('Ticker')[column] if 'Ticker' in df.columns else df[column]

    df[growth_col] = values.pct_change(periods=4) * 100  # YoY percentage change
    df[class_col] = df[growth_col].apply(classify_growth)
    df.dropna(subset=[growth_col], inplace=True)
    return df

def calculate_beta_growth_covariance(df, period):
    """Return the covariance between beta and its *period*-row percentage change.

    Side effect: stores the percentage change of ``df['Beta']`` over *period*
    rows in a new column ``Beta_Growth_{period}_M`` on *df*.
    """
    growth_col = f'Beta_Growth_{period}_M'
    beta = df['Beta']
    df[growth_col] = beta.pct_change(periods=period)
    return df[growth_col].cov(beta)

def calculate_sector_index_variations(df, sorted_companies, total_overperformance):
    """Compare a contribution-weighted sector index with the plain per-date average.

    *sorted_companies* is iterated as ``(company, count)`` pairs; each company's
    weight is ``count**2 / total_overperformance``. Tickers in *df* without a
    weight are ignored for the weighted index but still count toward the simple
    average. Returns a frame with ``Date``, ``Weighted_Index``,
    ``Simple_Avg_Index`` and their absolute ``Difference``.
    """
    weights = {}
    for company, count in sorted_companies:
        weights[company] = count**2 / total_overperformance

    combined = pd.DataFrame()
    for ticker in df['Ticker'].unique():
        if ticker not in weights:
            continue

        rows = df[df['Ticker'] == ticker].copy()
        rows['Weighted_YoY_Growth'] = rows['MarketCap_Growth_Class'] * weights[ticker]
        contribution = rows[['Date', 'Weighted_YoY_Growth']]

        if combined.empty:
            combined = contribution.copy()
            continue

        # Later tickers get a per-ticker suffix so every column keeps the
        # 'Weighted_YoY_Growth' prefix that the filter below relies on.
        combined = pd.merge(
            combined,
            contribution,
            on='Date',
            how='outer',
            suffixes=('', f'_{ticker}'),
        )

    # Weighted index: row-wise sum over every ticker's weighted column.
    weighted_columns = combined.filter(like='Weighted_YoY_Growth')
    combined['Weighted_Index'] = weighted_columns.sum(axis=1)

    # Simple index: unweighted mean growth class of all tickers on each date.
    per_date_mean = df.groupby('Date')['MarketCap_Growth_Class'].mean().reset_index()
    combined = pd.merge(combined, per_date_mean, on='Date', how='left')
    combined.rename(columns={'MarketCap_Growth_Class': 'Simple_Avg_Index'}, inplace=True)

    gap = combined['Weighted_Index'] - combined['Simple_Avg_Index']
    combined['Difference'] = gap.abs()

    return combined[['Date', 'Weighted_Index', 'Simple_Avg_Index', 'Difference']]

def calculate_variance_and_covariances(df):
    """Return the growth-class variance and the 6-month / 5-year beta-growth covariances.

    Returns a tuple ``(variance, beta_cov_6m, beta_cov_5y)`` where *variance*
    is the variance of ``df['MarketCap_Growth_Class']`` and the two covariances
    come from ``calculate_beta_growth_covariance``.

    The horizons are expressed in rows of *df*. The rows are treated as
    quarters (the 5-year horizon is 20 rows), so 6 months is 2 rows; the
    previous value of 6 rows measured 18 months instead.
    """
    quarters_6m = 2    # 6 months of quarterly rows
    quarters_5y = 20   # 5 years of quarterly rows

    # Variance of the classified YoY growth in market cap
    variance = df['MarketCap_Growth_Class'].var()

    beta_cov_6m = calculate_beta_growth_covariance(df, period=quarters_6m)
    beta_cov_5y = calculate_beta_growth_covariance(df, period=quarters_5y)

    return variance, beta_cov_6m, beta_cov_5y

def get_sector_rankings(input_dir, output_file='sector_rankings.csv'):
    """Compute four ranking metrics per sector CSV in *input_dir* and save them.

    For every ``.csv`` file in *input_dir* the data from 2019-01-01 (UTC)
    onward is loaded, YoY growth is added for ``MarketCap`` and ``Revenue``,
    and four metrics are derived: the mean gap between the weighted and the
    simple sector index, the variance of the growth class, and the 6-month and
    5-year beta-growth covariances. The metrics are written to *output_file*
    and each ranking is printed in descending order.

    Relies on ``calculate_sector_leader_and_rank``, which is expected to be
    defined elsewhere in the notebook.
    """
    # File-name suffixes that are not part of the sector name. The second one
    # is what merge_sector_data writes into "merged_sector_data".
    known_suffixes = ("_mkt_cap_quarter_end.csv", "_merged_data.csv")

    rankings_variation = {}
    rankings_variance = {}
    rankings_cov_6m_beta = {}
    rankings_cov_5y_beta = {}

    # Sorted so the output row order does not depend on directory listing order.
    for sector_file in sorted(os.listdir(input_dir)):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file
        for suffix in known_suffixes:
            if sector_file.endswith(suffix):
                sector = sector_file[:-len(suffix)]
                break
        print(f"Processing {sector} sector...")

        # Load the CSV file and normalise the dates to UTC
        file_path = os.path.join(input_dir, sector_file)
        df = pd.read_csv(file_path, parse_dates=['Date'])
        df['Date'] = pd.to_datetime(df['Date'], utc=True)

        # Keep 2019 onward. Copy the slice: calculate_yoy_growth adds columns
        # and drops rows in place, which must not happen on a view.
        df = df[df['Date'] >= pd.Timestamp('2019-01-01', tz='UTC')].copy()

        # Calculate YoY growth for MarketCap and Revenue
        df = calculate_yoy_growth(df, 'MarketCap')
        df = calculate_yoy_growth(df, 'Revenue')

        # Calculate sector leader and fractional contributions
        sector_leader, sorted_companies, total_overperformance = calculate_sector_leader_and_rank(df, sector)

        # Weighted vs. simple index, summarised as the mean absolute gap
        sector_index_variations = calculate_sector_index_variations(df, sorted_companies, total_overperformance)
        avg_difference = sector_index_variations['Difference'].mean()

        # Variance of YoY growth and beta covariances
        variance, beta_cov_6m, beta_cov_5y = calculate_variance_and_covariances(df)

        rankings_variation[sector] = avg_difference
        rankings_variance[sector] = variance
        rankings_cov_6m_beta[sector] = beta_cov_6m
        rankings_cov_5y_beta[sector] = beta_cov_5y

    # Create a DataFrame from the rankings (all four dicts share the same keys)
    rankings_df = pd.DataFrame({
        'Sector': list(rankings_variation.keys()),
        'Variation (Weighted vs Simple Avg)': list(rankings_variation.values()),
        'Variance of YoY Growth': list(rankings_variance.values()),
        'Covariance of 6M Beta Growth': list(rankings_cov_6m_beta.values()),
        'Covariance of 5Y Beta Growth': list(rankings_cov_5y_beta.values())
    })

    rankings_df.to_csv(output_file, index=False)
    print(f"\nRankings saved to {output_file}")

    # Print each ranking, highest value first
    printed_rankings = [
        ("variation between simple and weighted averages", rankings_variation),
        ("variance of YoY growth over time", rankings_variance),
        ("covariance of 6M Beta Growth", rankings_cov_6m_beta),
        ("covariance of 5Y Beta Growth", rankings_cov_5y_beta),
    ]
    for description, ranking in printed_rankings:
        print(f"\nRanking of sectors based on {description}:")
        for sector, value in sorted(ranking.items(), key=lambda x: x[1], reverse=True):
            print(f"{sector}: {value}")

def main():
    """Rank every sector found in the ``merged_sector_data`` directory."""
    get_sector_rankings("merged_sector_data")


if __name__ == "__main__":
    main()



In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

def _sector_names(directory, suffix):
    """Return the sector names of the files in *directory* whose name ends with *suffix*."""
    return {name[:-len(suffix)] for name in os.listdir(directory) if name.endswith(suffix)}


def _find_revenue_for_quarter(ticker_revenue, year, quarter):
    """Pick the revenue row for one market-cap observation.

    Prefers the first row matching both *year* and *quarter*; otherwise falls
    back to the last row of that *year*. Returns ``(revenue, yoy_growth)`` or
    ``None`` when the ticker has no revenue row for the year. The growth is
    NaN when the ``revenue_yoy_growth`` column is absent.
    """
    same_year = ticker_revenue[ticker_revenue['year'] == year]
    if same_year.empty:
        return None

    chosen = same_year.iloc[-1]
    # Yearly revenue files may carry no 'quarter' column at all.
    if 'quarter' in same_year.columns:
        same_quarter = same_year[same_year['quarter'] == quarter]
        if not same_quarter.empty:
            chosen = same_quarter.iloc[0]

    return chosen['revenue'], chosen.get('revenue_yoy_growth', np.nan)


def merge_sector_data(mkt_cap_dir, revenue_dir, output_dir="merged_sector_data"):
    """
    Merge market cap and revenue data for sectors where both datasets are available.

    A sector is processed when *mkt_cap_dir* holds
    ``<sector>_mkt_cap_quarter_end.csv`` and *revenue_dir* holds
    ``<sector>_revenue.csv``. Within a sector only tickers present in both
    files (compared upper-cased) are kept. Each market-cap row is paired with
    the revenue row of the same year and quarter, or failing that the last
    revenue row of the same year; rows with no revenue for the year are
    skipped.

    Each merged sector is written to ``<output_dir>/<sector>_merged_data.csv``
    (the directory is created if needed). Returns a dict mapping sector name
    to its merged DataFrame, sorted by ``Date`` then ``Ticker``.
    """
    mkt_cap_suffix = "_mkt_cap_quarter_end.csv"
    revenue_suffix = "_revenue.csv"

    os.makedirs(output_dir, exist_ok=True)

    merged_sectors = {}

    # Discover sectors with the exact suffixes that are opened below, so a
    # stray file such as "X_mkt_cap.csv" is ignored instead of causing a
    # FileNotFoundError later.
    common_sectors = (
        _sector_names(mkt_cap_dir, mkt_cap_suffix)
        & _sector_names(revenue_dir, revenue_suffix)
    )

    # Sorted for a reproducible processing (and output) order.
    for sector in sorted(common_sectors):
        print(f"Processing sector: {sector}")

        mkt_cap_path = os.path.join(mkt_cap_dir, f"{sector}{mkt_cap_suffix}")
        mkt_cap_df = pd.read_csv(mkt_cap_path)

        revenue_path = os.path.join(revenue_dir, f"{sector}{revenue_suffix}")
        revenue_df = pd.read_csv(revenue_path)

        # Clean and convert dates
        try:
            # Keep only the part before the first space (drops any time/offset)
            mkt_cap_df['Date'] = pd.to_datetime(mkt_cap_df['Date'].str.split(' ').str[0])
        except AttributeError:
            # Column is not string-typed, so .str is unavailable
            mkt_cap_df['Date'] = pd.to_datetime(mkt_cap_df['Date'])

        revenue_df['date'] = pd.to_datetime(revenue_df['date'])
        if 'year' not in revenue_df.columns:
            # Fall back to the calendar year of the parsed date
            revenue_df['year'] = pd.DatetimeIndex(revenue_df['date']).year

        # Extract year and quarter
        mkt_cap_df['year'] = pd.DatetimeIndex(mkt_cap_df['Date']).year
        mkt_cap_df['quarter'] = pd.DatetimeIndex(mkt_cap_df['Date']).quarter

        # Compare tickers case-insensitively
        mkt_cap_df['Ticker'] = mkt_cap_df['Ticker'].str.upper()
        revenue_df['ticker'] = revenue_df['ticker'].str.upper()

        common_tickers = set(mkt_cap_df['Ticker']).intersection(set(revenue_df['ticker']))

        print(f"Found {len(common_tickers)} common tickers for {sector}")

        merged_data = []
        for ticker in sorted(common_tickers):
            ticker_mkt_cap = mkt_cap_df[mkt_cap_df['Ticker'] == ticker]
            ticker_revenue = revenue_df[revenue_df['ticker'] == ticker]
            # Same for every row of this ticker, so look it up once
            company_name = ticker_revenue.iloc[0].get('company_name', ticker)

            for _, mkt_cap_row in ticker_mkt_cap.iterrows():
                match = _find_revenue_for_quarter(
                    ticker_revenue, mkt_cap_row['year'], mkt_cap_row['quarter']
                )
                if match is None:
                    continue
                revenue_value, revenue_growth = match

                merged_data.append({
                    'Date': mkt_cap_row['Date'],
                    'Year': mkt_cap_row['year'],
                    'Quarter': mkt_cap_row['quarter'],
                    'Ticker': ticker,
                    'MarketCap': mkt_cap_row['MarketCap'],
                    'Revenue': revenue_value,
                    'Revenue_YoY_Growth': revenue_growth,
                    'Company_Name': company_name
                })

        if merged_data:
            merged_df = pd.DataFrame(merged_data)
            merged_df = merged_df.sort_values(['Date', 'Ticker'])
            merged_sectors[sector] = merged_df

            # Save merged data to a CSV file
            output_file_path = os.path.join(output_dir, f"{sector}_merged_data.csv")
            merged_df.to_csv(output_file_path, index=False)
            print(f"Successfully merged data for {sector} and saved to {output_file_path}")
        else:
            print(f"No matching data found for {sector}")

    return merged_sectors

def calculate_growth_indicator(value):
    """Turn a growth percentage into an indicator.

    Missing values (as detected by ``pd.isna``) map to 0. Otherwise returns
    1 above 5, -1 below -5, and 0 for anything from -5 to 5 inclusive.
    """
    if pd.isna(value):
        return 0
    if value > 5:
        return 1
    return -1 if value < -5 else 0

def calculate_beta_covariance(df, period_months):
    """
    Return the mean absolute covariance between companies' rolling betas.

    *df* needs ``Date``, ``Ticker`` and ``MarketCap`` columns and is not
    modified. Per-ticker returns are the period-over-period change in market
    cap; the market return on a date is the market-cap-weighted sum of those
    returns. Each ticker's beta is its rolling covariance with the market
    return divided by the rolling variance of the market return. The betas are
    arranged as a ticker x date matrix, tickers observed on fewer than half of
    the dates are dropped, gaps are filled along the date axis, and the
    average absolute off-diagonal entry of the ticker-by-ticker covariance
    matrix is returned.

    Returns 0 when no beta can be computed, when the result is NaN, or when
    any error occurs (the error is printed; this is a deliberate best-effort).

    NOTE(review): the rolling window is ``period_months * 3`` rows although the
    inline comment speaks of converting months to quarters (which would be a
    division). Callers in this notebook pass 2 and 16; the intended unit
    should be confirmed before changing it.
    """
    try:
        # sort_values returns a new frame, so the caller's df stays untouched
        df = df.sort_values(['Date', 'Ticker'])

        # Calculate returns for each company
        df['Returns'] = df.groupby('Ticker')['MarketCap'].pct_change()

        # Value-weighted market return per date
        df['Market_Value'] = df.groupby('Date')['MarketCap'].transform('sum')
        df['Market_Weight'] = df['MarketCap'] / df['Market_Value']
        weighted_returns = df['Returns'] * df['Market_Weight']
        # NaN returns are skipped by the sum; a date with none valid yields 0
        df['Market_Returns'] = weighted_returns.groupby(df['Date']).transform('sum')

        # Set minimum periods for rolling calculations
        min_periods = max(2, period_months - 1)  # Ensure at least 2 periods for correlation
        rolling_window = period_months * 3  # see NOTE(review) in the docstring

        betas_by_date = []

        for ticker in df['Ticker'].unique():
            ticker_data = df[df['Ticker'] == ticker].copy()

            if len(ticker_data) < min_periods:
                continue

            rolling_cov = (
                ticker_data['Returns']
                .rolling(window=rolling_window, min_periods=min_periods)
                .cov(ticker_data['Market_Returns'])
            )
            rolling_market_var = (
                ticker_data['Market_Returns']
                .rolling(window=rolling_window, min_periods=min_periods)
                .var()
            )

            # Zero variance becomes NaN so the division cannot produce inf
            ticker_data['Beta'] = rolling_cov / rolling_market_var.replace(0, np.nan)

            betas_by_date.append(ticker_data[['Date', 'Ticker', 'Beta']].dropna())

        if not betas_by_date:
            return 0

        # Combine all beta calculations
        all_betas = pd.concat(betas_by_date)

        # Pivot to a matrix of betas (companies x dates)
        beta_matrix = all_betas.pivot_table(
            index='Ticker',
            columns='Date',
            values='Beta',
            aggfunc='first'
        )

        # Remove companies with too many missing values
        min_observations = beta_matrix.shape[1] * 0.5  # Require at least 50% of dates
        beta_matrix = beta_matrix[beta_matrix.count(axis=1) >= min_observations]

        if beta_matrix.empty:
            return 0

        # Fill remaining gaps along the date axis: forward first, then backward.
        # (ffill/bfill replace the deprecated fillna(method=...) spelling.)
        beta_matrix = beta_matrix.ffill(axis=1).bfill(axis=1)

        # After the transpose the columns are tickers, so this is the
        # ticker-by-ticker covariance of their beta series over the dates.
        cov_matrix = beta_matrix.T.cov()

        # Average absolute covariance, excluding the diagonal (own variances)
        mask = ~np.eye(cov_matrix.shape[0], dtype=bool)
        avg_cov = np.abs(cov_matrix.where(mask)).mean().mean()

        return float(avg_cov) if not np.isnan(avg_cov) else 0

    except Exception as e:
        print(f"Error in beta covariance calculation: {e}")
        return 0
    
def plot_covariance_heatmap(cov_matrix, title='Sector Covariance Heatmap'):
    """
    Draw *cov_matrix* as an annotated heatmap titled *title* and show it.
    """
    heatmap_options = {
        'annot': True,
        'cmap': "coolwarm",
        'fmt': ".2f",
        'cbar_kws': {'label': 'Covariance'},
    }

    plt.figure(figsize=(12, 8))
    sns.heatmap(cov_matrix, **heatmap_options)
    plt.title(title)
    # Slant the x labels; keep the y labels horizontal.
    plt.yticks(rotation=0)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

def calculate_sector_rankings(merged_sectors, output_file='sector_rankings.csv'):
    """Calculate the five ranking parameters per sector and save them to CSV.

    *merged_sectors* maps sector name to a merged DataFrame with ``Date``,
    ``Ticker``, ``MarketCap`` and ``Revenue_YoY_Growth`` columns. For each
    sector this computes the mean market-cap and revenue growth indicators,
    the absolute gap between the cap-weighted and the simple average YoY
    market-cap change, and the two beta covariances (2 and 16 periods).

    A sector that raises during processing is reported and skipped. The
    result is written to *output_file* and returned; its columns are present
    even when no sector could be processed.
    """
    columns = [
        'Sector', 'MktCap_Growth_Score', 'Revenue_Growth_Score',
        'Weighted_Simple_Variance', 'Beta_6M_Covariance', 'Beta_4Y_Covariance',
        'Number_of_Companies', 'Date_Range',
    ]
    rankings = []

    for sector, df in merged_sectors.items():
        print(f"Processing sector: {sector}")
        try:
            # Chronological order is required by the per-ticker pct_change;
            # sort_values also gives us a copy, so the input is not modified.
            df = df.sort_values('Date')

            # 1. Market Cap YoY Growth Indicator
            df['MktCap_YoY_Change'] = df.groupby('Ticker')['MarketCap'].pct_change(periods=4) * 100
            df['MktCap_Growth_Indicator'] = df['MktCap_YoY_Change'].apply(calculate_growth_indicator)

            # 2. Revenue YoY Growth Indicator
            df['Revenue_Growth_Indicator'] = df['Revenue_YoY_Growth'].apply(calculate_growth_indicator)

            # 3. Variance between weighted and simple average
            df['Weighted_MktCap_Change'] = (
                df['MktCap_YoY_Change'] *
                df['MarketCap'] /
                df.groupby('Date')['MarketCap'].transform('sum')
            )

            # Average only over non-NaN values. min_count=1 keeps a date whose
            # changes are all NaN (e.g. the first year) as NaN; a plain sum()
            # would turn it into 0 and drag the weighted average toward zero.
            weighted_avg = df.groupby('Date')['Weighted_MktCap_Change'].sum(min_count=1).mean()
            simple_avg = df['MktCap_YoY_Change'].mean()
            variance_avg = abs(weighted_avg - simple_avg)

            # 4 & 5. Beta covariances
            print(f"Calculating 6-month beta covariance for {sector}")
            beta_6m_cov = calculate_beta_covariance(df, 2)

            # 16 quarterly periods = 4 years, matching the column name below
            print(f"Calculating 4-year beta covariance for {sector}")
            beta_4y_cov = calculate_beta_covariance(df, 16)

            rankings.append({
                'Sector': sector,
                'MktCap_Growth_Score': df['MktCap_Growth_Indicator'].mean(),
                'Revenue_Growth_Score': df['Revenue_Growth_Indicator'].mean(),
                'Weighted_Simple_Variance': variance_avg,
                'Beta_6M_Covariance': beta_6m_cov,
                'Beta_4Y_Covariance': beta_4y_cov,
                'Number_of_Companies': df['Ticker'].nunique(),
                'Date_Range': f"{df['Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')}"
            })

            print(f"Successfully processed {sector}")

        except Exception as e:
            # Best effort: one bad sector must not abort the whole ranking
            print(f"Error processing sector {sector}: {e}")
            continue

    # Explicit columns keep the header even if every sector failed
    rankings_df = pd.DataFrame(rankings, columns=columns)
    rankings_df.to_csv(output_file, index=False)
    return rankings_df

def main():
    """Merge the sector inputs, compute the rankings and print one list per parameter."""
    merged_sectors = merge_sector_data("sector_mkt_cap_results", "sector_revenue_results")
    rankings_df = calculate_sector_rankings(merged_sectors)

    parameters = [
        'MktCap_Growth_Score',
        'Revenue_Growth_Score',
        'Weighted_Simple_Variance',
        'Beta_6M_Covariance',
        'Beta_4Y_Covariance',
    ]

    # One ranking per parameter, highest value first
    for param in parameters:
        print(f"\nRanking of sectors based on {param}:")
        ordered = rankings_df.sort_values(param, ascending=False)
        for _, row in ordered.iterrows():
            sector = row['Sector']
            company_count = row['Number_of_Companies']
            score = row[param]
            print(f"{sector} ({company_count} companies): {score:.4f}")
            print(f"Date Range: {row['Date_Range']}")


if __name__ == "__main__":
    main()

In [18]:
# Reload the saved sector rankings into `df` for the cells that follow.
df = pd.read_csv('sector_rankings.csv')

In [None]:
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Load the rankings, then discard the two descriptive columns that are not features.
df = pd.read_csv('sector_rankings.csv')
scaler = StandardScaler()
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])

# Everything except the 'Sector' label is a numeric feature.
X = df.drop(columns='Sector')
print(df.shape)


def _standardize_column(col):
    """Return one feature column rescaled by ``scaler`` as a flat 1-D array."""
    return scaler.fit_transform(col.values.reshape(-1, 1)).flatten()


# Standardize each column independently.
X_scaled = X.apply(_standardize_column, axis=0)

# Ward linkage over the standardized features.
Z = linkage(X_scaled, method='ward')

# Dendrogram with one leaf per sector.
fig, ax = plt.subplots(figsize=(12, 20))
dendrogram(
    Z,
    labels=df['Sector'].values,
    leaf_rotation=90,
    leaf_font_size=10,
    ax=ax,
)
ax.set_title('Hierarchical Clustering of Sectors')
ax.set_xlabel('Sectors')
ax.set_ylabel('Euclidean Distance')
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from minisom import MiniSom
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import KLDivergence
import tensorflow as tf
from collections import Counter

# Seed NumPy and TensorFlow so repeated executions start from the same state
np.random.seed(42)
tf.random.set_seed(42)

# Load the rankings and select the five numeric ranking parameters as features
data = pd.read_csv('sector_rankings.csv')
feature_columns = [
    'MktCap_Growth_Score',
    'Revenue_Growth_Score',
    'Weighted_Simple_Variance',
    'Beta_6M_Covariance',
    'Beta_4Y_Covariance',
]
params = data[feature_columns]

# Standardize the features
scaler = StandardScaler()
params_scaled = scaler.fit_transform(params)

# Accumulator for per-run results, and the cluster counts to evaluate (3..7)
cluster_results = []
clusters_range = list(range(3, 8))

# Function to run Autoencoder + K-means
def kmeans_clustering(n_clusters, params_scaled):
    """Cluster *params_scaled* with K-means on a 3-unit autoencoder encoding.

    Trains a one-hidden-layer autoencoder on *params_scaled* (50 epochs),
    encodes the data with the trained encoder, and runs K-means with
    *n_clusters* clusters on the encoding.

    Returns the array of cluster labels (one per row), or ``None`` if
    anything raises; the error is printed.
    """
    try:
        # Define the autoencoder
        input_dim = params_scaled.shape[1]
        encoding_dim = 3
        input_layer = Input(shape=(input_dim,))
        encoder = Dense(encoding_dim, activation="relu")(input_layer)
        # Linear output: the inputs are standardized and therefore contain
        # negative values, which a sigmoid (range 0..1) can never reconstruct.
        decoder = Dense(input_dim, activation="linear")(encoder)
        autoencoder = Model(inputs=input_layer, outputs=decoder)
        autoencoder.compile(optimizer="adam", loss="mse")
        autoencoder.fit(params_scaled, params_scaled, epochs=50, batch_size=16, shuffle=True)

        # Extract encoder and apply K-means clustering
        encoder_model = Model(inputs=input_layer, outputs=encoder)
        encoded_data = encoder_model.predict(params_scaled)

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(encoded_data)

        return clusters
    except Exception as e:
        print(f"Error in KMeans clustering with {n_clusters} clusters: {e}")
        return None

# Function to run Self-Organizing Map (SOM)
def som_clustering(n_clusters, params_scaled):
    """Cluster *params_scaled* with a 1 x *n_clusters* self-organizing map.

    Each neuron of the map stands for one cluster, so the number of distinct
    labels is at most *n_clusters*. (A fixed 10x10 map ignored *n_clusters*
    and could hand out up to 100 different labels.)

    Returns a list with one integer cluster label per row, or ``None`` if
    anything raises; the error is printed.
    """
    try:
        som_shape = (n_clusters, 1)  # one neuron per requested cluster
        som = MiniSom(x=som_shape[0], y=som_shape[1], input_len=params_scaled.shape[1],
                      sigma=1.0, learning_rate=0.5)
        som.random_weights_init(params_scaled)
        som.train_random(params_scaled, 100)

        # The cluster label is the flat index of the winning neuron
        clusters = []
        for sample in params_scaled:
            x, y = som.winner(sample)
            clusters.append(int(x * som_shape[1] + y))

        return clusters
    except Exception as e:
        print(f"Error in SOM clustering with {n_clusters} clusters: {e}")
        return None

# Function to run Deep Embedded Clustering (DEC)
def dec_clustering(n_clusters, params_scaled):
    """Cluster *params_scaled* with a DEC-style self-training loop.

    An autoencoder is pre-trained on *params_scaled*; a softmax layer with
    *n_clusters* units is then attached to the encoder and trained for 100
    steps with a KL-divergence loss against a sharpened version of its own
    soft assignments.

    Returns the array of hard cluster labels (argmax of the final soft
    assignments), or ``None`` if anything raises; the error is printed.
    """
    try:
        # Pre-train the autoencoder that provides the embedding
        input_dim = params_scaled.shape[1]
        encoding_dim = 3
        input_layer = Input(shape=(input_dim,))
        encoder = Dense(encoding_dim, activation='relu')(input_layer)
        # Linear output: standardized inputs include negative values
        decoder = Dense(input_dim, activation='linear')(encoder)
        autoencoder = Model(inputs=input_layer, outputs=decoder)
        autoencoder.compile(optimizer='adam', loss='mse')
        autoencoder.fit(params_scaled, params_scaled, epochs=50, batch_size=16, shuffle=True)

        # Soft cluster assignment head on top of the (shared) encoder layer
        clustering_layer = Dense(n_clusters, activation='softmax', name='clustering')(encoder)
        dec_model = Model(inputs=input_layer, outputs=clustering_layer)
        dec_model.compile(optimizer=Adam(learning_rate=0.001), loss=KLDivergence())

        # DEC training loop
        n_epochs = 100
        for epoch in range(n_epochs):
            soft_assignments = dec_model.predict(params_scaled)

            # DEC target distribution: square the assignments, normalise by
            # cluster frequency, then renormalise each row. Training against
            # the raw predictions would give a KL loss of exactly 0 and no
            # gradient, i.e. the model would never change.
            sharpened = soft_assignments ** 2 / soft_assignments.sum(axis=0)
            target_distribution = sharpened / sharpened.sum(axis=1, keepdims=True)

            loss = dec_model.train_on_batch(params_scaled, target_distribution)

            if epoch % 10 == 0:
                print(f'Epoch {epoch}/{n_epochs}, Loss: {loss}')

        final_clusters = np.argmax(dec_model.predict(params_scaled), axis=1)
        return final_clusters
    except Exception as e:
        print(f"Error in DEC clustering with {n_clusters} clusters: {e}")
        return None

# Number of repeated runs. Every message, loop and percentage below derives
# from this value, so changing it cannot leave stale "50"s or out-of-range
# indexing behind.
N_RUNS = 3

# method key -> (legend label, plot marker, clustering function)
CLUSTERING_METHODS = {
    'kmeans': ('Autoencoder + K-Means', 'o', kmeans_clustering),
    'som': ('SOM', 's', som_clustering),
    'dec': ('DEC', '^', dec_clustering),
}


def canonical_labels(labels):
    """Relabel clusters by order of first appearance.

    Two identical partitions whose cluster ids are merely permuted map to the
    same tuple, so they are counted as the same configuration.
    """
    mapping = {}
    return tuple(mapping.setdefault(int(label), len(mapping)) for label in labels)


# Collect clustering results for every run, method and cluster count
for run in range(N_RUNS):
    print(f"Running iteration {run+1}/{N_RUNS}")

    # Keyed by (method, n_clusters) so results for different cluster counts
    # do not overwrite each other.
    run_results = {}

    for n_clusters in clusters_range:
        print(f"Evaluating for {n_clusters} clusters:")

        for method, (_, _, cluster_fn) in CLUSTERING_METHODS.items():
            labels = cluster_fn(n_clusters, params_scaled)
            if labels is not None:
                run_results[(method, n_clusters)] = canonical_labels(labels)

    cluster_results.append(run_results)

# Count the frequency of each unique cluster configuration
cluster_frequencies = Counter(
    labels for result in cluster_results for labels in result.values()
)

if cluster_frequencies:
    # Find the most frequent configuration(s)
    max_frequency = max(cluster_frequencies.values())
    winning_clusters = [clusters for clusters, freq in cluster_frequencies.items() if freq == max_frequency]

    # Share of all recorded configurations taken by the winner
    total_configurations = sum(cluster_frequencies.values())
    winning_percentage = (max_frequency / total_configurations) * 100

    print(f"Winning clustering configuration occurred {max_frequency} times, i.e., {winning_percentage:.2f}% of the time.")
    print(f"Winner configurations: {winning_clusters}")
else:
    print("No clustering configuration was produced; nothing to rank.")

# Plot the average silhouette score per method as a function of the cluster count
n_samples = len(params_scaled)
plt.figure(figsize=(10, 6))
for method, (legend_label, marker, _) in CLUSTERING_METHODS.items():
    avg_scores = []
    for n_clusters in clusters_range:
        scores = []
        for result in cluster_results:
            labels = result.get((method, n_clusters))
            # silhouette_score needs between 2 and n_samples - 1 distinct labels
            if labels is not None and 2 <= len(set(labels)) <= n_samples - 1:
                scores.append(silhouette_score(params_scaled, labels))
        # NaN leaves a gap in the line instead of plotting a misleading 0
        avg_scores.append(np.mean(scores) if scores else np.nan)

    plt.plot(clusters_range, avg_scores, label=legend_label, marker=marker)

plt.xlabel('Number of Clusters')
plt.ylabel('Average Silhouette Score')
plt.title(f'Average Silhouette Scores for Different Clustering Algorithms ({N_RUNS} Runs)')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.metrics import silhouette_score

# Optimal cluster count found in each run
optimal_clusters = []

# Number of runs
num_runs = 20

n_samples = len(params_scaled)

for i in range(num_runs):
    silhouette_scores = {}

    for n_clusters in clusters_range:
        # kmeans_clustering returns cluster labels (or None on failure), not a
        # score, so the silhouette score has to be computed from the labels.
        labels = kmeans_clustering(n_clusters, params_scaled)
        if labels is None:
            continue
        # silhouette_score needs between 2 and n_samples - 1 distinct labels
        if 2 <= len(set(labels)) <= n_samples - 1:
            silhouette_scores[n_clusters] = silhouette_score(params_scaled, labels)

    # Cluster count with the highest silhouette score for this run;
    # a run in which nothing could be scored contributes no vote.
    if silhouette_scores:
        optimal_cluster = max(silhouette_scores, key=silhouette_scores.get)
        optimal_clusters.append(optimal_cluster)

# Count the frequency of each optimal cluster count
optimal_cluster_counts = Counter(optimal_clusters)

# One bar per candidate cluster count, so bars line up with the integer ticks
plt.figure(figsize=(10, 6))
plt.bar(
    list(clusters_range),
    [optimal_cluster_counts.get(k, 0) for k in clusters_range],
    edgecolor='black',
    alpha=0.7,
)
plt.xlabel('Optimal Number of Clusters')
plt.ylabel('Frequency')
plt.title('Distribution of Optimal Cluster Counts (Autoencoder + K-Means)')
plt.xticks(clusters_range)
plt.grid(axis='y')
plt.show()


In [None]:
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.metrics import silhouette_score

# Optimal cluster count found in each run
optimal_clusters = []

# Number of runs
num_runs = 50

n_samples = len(params_scaled)

for i in range(num_runs):
    silhouette_scores = {}

    for n_clusters in clusters_range:
        # kmeans_clustering returns cluster labels (or None on failure), not a
        # score, so the silhouette score has to be computed from the labels.
        labels = kmeans_clustering(n_clusters, params_scaled)
        if labels is None:
            continue
        # silhouette_score needs between 2 and n_samples - 1 distinct labels
        if 2 <= len(set(labels)) <= n_samples - 1:
            silhouette_scores[n_clusters] = silhouette_score(params_scaled, labels)

    # Cluster count with the highest silhouette score for this run;
    # a run in which nothing could be scored contributes no vote.
    if silhouette_scores:
        optimal_cluster = max(silhouette_scores, key=silhouette_scores.get)
        optimal_clusters.append(optimal_cluster)

# Count the frequency of each optimal cluster count
optimal_cluster_counts = Counter(optimal_clusters)

# One bar per candidate cluster count, so bars line up with the integer ticks
plt.figure(figsize=(10, 6))
plt.bar(
    list(clusters_range),
    [optimal_cluster_counts.get(k, 0) for k in clusters_range],
    edgecolor='black',
    alpha=0.7,
)
plt.xlabel('Optimal Number of Clusters')
plt.ylabel('Frequency')
plt.title('Distribution of Optimal Cluster Counts (Autoencoder + K-Means)')
plt.xticks(clusters_range)
plt.grid(axis='y')
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import numpy as np

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Keep the sector names for labelling; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Build the autoencoder
input_dim = X_scaled.shape[1]
encoding_dim = 5  # Adjust based on desired compression level

# Encoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)

# Decoder
decoded = Dense(input_dim, activation='linear')(encoded)

# Autoencoder model
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train the autoencoder to reconstruct its own input
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=8, shuffle=True, validation_split=0.2)

# Encoder model to get the compressed representation
encoder = Model(inputs=input_layer, outputs=encoded)

# Step 3: Run clustering multiple times and record clusters
num_iterations = 1000
num_clusters = 5  # Set desired number of clusters

# The encoder is not retrained inside the loop, so the encoded features are
# the same on every iteration: compute them once instead of 1000 times.
X_encoded = encoder.predict(X_scaled)

all_clusterings = []
for i in range(num_iterations):
    # Only the KMeans seed changes between iterations
    kmeans = KMeans(n_clusters=num_clusters, random_state=i)
    clusters = kmeans.fit_predict(X_encoded)
    all_clusterings.append(clusters)

# One row per sector, one column per iteration
cluster_df = pd.DataFrame(all_clusterings).T
cluster_df.columns = [f'Iteration_{i+1}' for i in range(num_iterations)]
cluster_df.index = sectors

# Step 4: Analyze consistency of clusters across iterations
# Two sectors are a consistent pair when they share a label in every single
# iteration. Labels are only compared within an iteration, so the arbitrary
# numbering of KMeans clusters between runs does not matter here.
label_matrix = cluster_df.to_numpy()
consistent_pairs = []
for i in range(len(sectors)):
    for j in range(i + 1, len(sectors)):
        if np.all(label_matrix[i] == label_matrix[j]):
            consistent_pairs.append((sectors[i], sectors[j]))

# A sector is "consistent" if it takes part in at least one consistent pair
consistent_sectors = {sector for pair in consistent_pairs for sector in pair}
inconsistent_sectors = set(sectors) - consistent_sectors

# Step 5: Output results (sorted so the listing is stable between runs)
print("Sectors that were consistently clustered together in all iterations:")
if consistent_sectors:
    print(", ".join(sorted(consistent_sectors)))
else:
    print("No sectors were consistently clustered together every time.")

print("\nNumber of sectors that were consistent:", len(consistent_sectors))
print("Number of sectors that were not consistent:", len(inconsistent_sectors))
print("Inconsistent sectors:", ", ".join(sorted(inconsistent_sectors)))


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
import warnings

warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Keep the sector names for labelling; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Define the fuzzy clustering parameters
num_clusters = 5  # Define the number of clusters you want
num_iterations = 10
membership_results = []

# Step 3: Perform Fuzzy C-Means clustering multiple times
for _ in range(num_iterations):
    fcm = FCM(n_clusters=num_clusters, m=2)
    fcm.fit(X_scaled)
    membership_results.append(fcm.u)  # Membership matrix for each iteration

# Step 4: Calculate the average membership matrix
# NOTE(review): columns are averaged by index without matching clusters
# between runs. If FCM's initialisation differs from run to run, column k can
# describe a different cluster each time - confirm, or align columns first.
avg_membership = np.mean(np.array(membership_results), axis=0)

# Step 5: Analyze consistent clusters
# For each sector, record the cluster with the highest average membership
sector_clusters = {}
for idx, sector in enumerate(sectors):
    max_cluster = np.argmax(avg_membership[idx])
    max_membership = avg_membership[idx][max_cluster]
    sector_clusters[sector] = (max_cluster, max_membership)

# Split rows by whether the strongest average membership exceeds 0.8.
# Working with row positions avoids re-searching the sector list for every
# printed row and keeps each row paired with its own membership values.
peak_cluster = np.argmax(avg_membership, axis=1)
peak_membership = avg_membership.max(axis=1)
is_consistent = peak_membership > 0.8
consistent_rows = np.flatnonzero(is_consistent)
inconsistent_rows = np.flatnonzero(~is_consistent)
consistent_sectors = [sectors[row] for row in consistent_rows]
inconsistent_sectors = [sectors[row] for row in inconsistent_rows]

# Step 6: Output results
print("Sectors with high membership consistency in a single cluster:")
for row in consistent_rows:
    print(f"{sectors[row]} - Cluster: {peak_cluster[row]}, Membership: {peak_membership[row]:.2f}")

print("\nSectors with mixed memberships across clusters (inconsistent):")
for row in inconsistent_rows:
    memberships = avg_membership[row]
    print(f"{sectors[row]} - Cluster memberships: {[f'{m:.2f}' for m in memberships]}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances_argmin_min
import warnings

warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Keep the sector names for labelling; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Define seed sectors and calculate initial centroids
seed_sectors = {
    0: "Internet Services & Infrastructure",
    1: "Oil & Gas Exploration & Production",
    2: "Interactive Media & Services",
    3: "Broadcasting",
    4: "Rail Transportation"
}

# Each seed sector's row becomes the starting centroid of its cluster
# (list.index raises ValueError if a seed name is missing from the data)
sector_names = list(sectors)
seed_indices = [sector_names.index(seed) for seed in seed_sectors.values()]
initial_centroids = X_scaled[seed_indices]

# Step 3: Iteratively perform KMeans until convergence
num_clusters = len(seed_sectors)  # One cluster per seed
centroids = initial_centroids.copy()
tolerance = 1e-4  # Convergence threshold
max_iterations = 100  # Safety limit on iterations

for iteration in range(max_iterations):
    # Step 3a: Assign each point to the nearest centroid
    labels, _ = pairwise_distances_argmin_min(X_scaled, centroids)

    # Step 3b: Move each centroid to the mean of its members. A cluster that
    # lost all its points keeps its previous centroid: the mean of an empty
    # selection is NaN, which would poison every later distance computation.
    new_centroids = centroids.copy()
    for k in range(num_clusters):
        members = X_scaled[labels == k]
        if len(members) > 0:
            new_centroids[k] = members.mean(axis=0)

    # Step 3c: Check for convergence (largest movement of any centroid)
    centroid_shift = np.linalg.norm(new_centroids - centroids, axis=1).max()
    print(f"Iteration {iteration + 1}, centroid shift: {centroid_shift:.6f}")

    centroids = new_centroids
    if centroid_shift < tolerance:
        print("Convergence reached.")
        break

# Final labels, recomputed against the final centroids so the plotted points
# and centroids agree even when the loop stops at max_iterations
final_labels, _ = pairwise_distances_argmin_min(X_scaled, centroids)
labels = final_labels

# Step 4: Reduce dimensions for plotting using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
centroids_pca = pca.transform(centroids)

# Step 5: Plot the final clusters
plt.figure(figsize=(10, 7))
for cluster in range(num_clusters):
    # Plot points in each cluster
    cluster_points = X_pca[final_labels == cluster]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster}")

# Plot centroids
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], s=200, c='black', marker='X', label='Centroids')

# Add labels and title
plt.title("Final Clusters after KMeans Convergence")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Step 1: Read the sector table and keep only the feature columns
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])

# Sector names are kept aside; the remaining columns are the features
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Scale the features with StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Derive cluster labels from a KMeans run seeded with chosen sectors
num_clusters = 5
seed_sectors = {
    0: "Internet Services & Infrastructure",
    1: "Oil & Gas Exploration & Production",
    2: "Interactive Media & Services",
    3: "Broadcasting",
    4: "Rail Transportation"
}

# Row position of every seed sector, in seed order
seed_indices = []
for seed in seed_sectors.values():
    seed_indices.append(list(sectors).index(seed))
initial_centroids = X_scaled[seed_indices]

# A single KMeans initialisation starting from the seed rows
kmeans = KMeans(n_clusters=num_clusters, init=initial_centroids, n_init=1)
kmeans.fit(X_scaled)
labels = kmeans.labels_  # These become the classifier's targets

# Step 3: One-hot encode the targets and split into train/test
y = to_categorical(labels, num_classes=num_clusters)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.33, random_state=42
)

# Step 4: Feed-forward classifier: 64 -> 32 -> softmax over the clusters
model = Sequential([
    Dense(64, input_shape=(X_train.shape[1],), activation='relu'),
    Dense(32, activation='relu'),
    Dense(num_clusters, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 5: Fit, holding out 20% of the training split for validation
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=8,
    validation_split=0.2,
    verbose=1,
)

# Step 6: Score the model on the held-out test split
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Most probable cluster for each test row
predictions = model.predict(X_test)
predicted_clusters = np.argmax(predictions, axis=1)

# Print true vs. predicted cluster for every test row
true_clusters = np.argmax(y_test, axis=1)
for true_cluster, predicted_cluster in zip(true_clusters, predicted_clusters):
    print(f"True cluster: {true_cluster}, Predicted cluster: {predicted_cluster}")


In [20]:
# Write `model` to "sector_classification_model.keras" (path is relative to the working directory)
model.save("sector_classification_model.keras")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
import warnings

warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Keep the sector names for labelling; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Define the fuzzy clustering parameters
num_clusters = 5  # Define the number of clusters you want
num_iterations = 10
membership_results = []

# Step 3: Perform Fuzzy C-Means clustering multiple times
for _ in range(num_iterations):
    fcm = FCM(n_clusters=num_clusters, m=2)
    fcm.fit(X_scaled)
    membership_results.append(fcm.u)  # Membership matrix for each iteration

# Step 4: Calculate the average membership matrix
# NOTE(review): columns are averaged by index without matching clusters
# between runs - confirm that FCM numbers its clusters the same way each run.
avg_membership = np.mean(np.array(membership_results), axis=0)

# Display names for the cluster indices.
# NOTE(review): FCM is not seeded with these sectors in this cell, so which
# index ends up with which name looks arbitrary - verify before relying on it.
cluster_labels = {
    0: "Internet Service and Infrastructure",
    1: "Oil & Gas Exploration & Production",
    2: "Interactive Media & Services",
    3: "Broadcasting",
    4: "Rail Transportation"
}

# Step 5: Analyze consistent clusters
# For each sector, record the cluster with the highest average membership
sector_clusters = {}
for idx, sector in enumerate(sectors):
    max_cluster = np.argmax(avg_membership[idx])
    max_membership = avg_membership[idx][max_cluster]
    sector_clusters[sector] = (max_cluster, max_membership)

# Split rows by whether the strongest average membership exceeds 0.8.
# Working with row positions avoids re-searching the sector list for every
# printed row and keeps each row paired with its own membership values.
peak_cluster = np.argmax(avg_membership, axis=1)
peak_membership = avg_membership.max(axis=1)
is_consistent = peak_membership > 0.8
consistent_rows = np.flatnonzero(is_consistent)
inconsistent_rows = np.flatnonzero(~is_consistent)
consistent_sectors = [sectors[row] for row in consistent_rows]
inconsistent_sectors = [sectors[row] for row in inconsistent_rows]

# Step 6: Output results with cluster labels
print("Sectors with high membership consistency in a single cluster:")
for row in consistent_rows:
    cluster = peak_cluster[row]
    cluster_name = cluster_labels[cluster]
    print(f"{sectors[row]} - Cluster: {cluster_name} ({cluster}), Membership: {peak_membership[row]:.2f}")

print("\nSectors with mixed memberships across clusters (inconsistent):")
for row in inconsistent_rows:
    membership_list = [f'{m:.2f}' for m in avg_membership[row]]
    print(f"{sectors[row]} - Cluster memberships: {membership_list}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
import warnings

warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Keep the sector names for labelling; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Define the fuzzy clustering parameters
num_clusters = 5  # Define the number of clusters you want
num_iterations = 10
membership_results = []

# Step 3: Perform Fuzzy C-Means clustering multiple times
for _ in range(num_iterations):
    fcm = FCM(n_clusters=num_clusters, m=2)
    fcm.fit(X_scaled)
    membership_results.append(fcm.u)  # Membership matrix for each iteration

# Step 4: Calculate the average membership matrix
# NOTE(review): cluster indices are compared across runs without matching
# them - confirm that FCM numbers its clusters the same way on every run.
avg_membership = np.mean(np.array(membership_results), axis=0)

# Step 5: Track sector consistency across clusters
# For each sector, take its strongest cluster in every run, then measure how
# often the most frequent of those clusters occurs.
hard_labels_per_run = [np.argmax(membership, axis=1) for membership in membership_results]
row_assignments = []  # (most common cluster, consistency ratio) per row
consistent_clusters = {}
for sector_idx, sector in enumerate(sectors):
    sector_memberships = [run_labels[sector_idx] for run_labels in hard_labels_per_run]
    most_common_cluster = max(set(sector_memberships), key=sector_memberships.count)
    consistency_ratio = sector_memberships.count(most_common_cluster) / num_iterations
    row_assignments.append((most_common_cluster, consistency_ratio))
    consistent_clusters[sector] = (most_common_cluster, consistency_ratio)

# Display names for the cluster indices.
# NOTE(review): FCM is not seeded with these sectors in this cell, so which
# index ends up with which name looks arbitrary - verify before relying on it.
cluster_labels = {
    0: "Internet Service and Infrastructure",
    1: "Oil & Gas Exploration & Production",
    2: "Interactive Media & Services",
    3: "Broadcasting",
    4: "Rail Transportation"
}

# Split rows by whether the sector landed in one cluster in more than 80% of
# runs. Row positions are kept so each printed row uses its own data instead
# of re-searching the sector list by name.
consistent_rows = [row for row, (_, ratio) in enumerate(row_assignments) if ratio > 0.8]
inconsistent_rows = [row for row, (_, ratio) in enumerate(row_assignments) if not ratio > 0.8]
consistent_sectors = [sectors[row] for row in consistent_rows]
inconsistent_sectors = [sectors[row] for row in inconsistent_rows]

# Step 6: Output results with cluster labels
print("Sectors with high membership consistency in a single cluster:")
for row in consistent_rows:
    cluster, ratio = row_assignments[row]
    cluster_name = cluster_labels[cluster]
    print(f"{sectors[row]} - Cluster: {cluster_name} ({cluster}), Consistency: {ratio:.2f}")

print("\nSectors with mixed memberships across clusters (inconsistent):")
for row in inconsistent_rows:
    membership_list = [f'{m:.2f}' for m in avg_membership[row]]
    print(f"{sectors[row]} - Cluster memberships: {membership_list}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
import warnings
from scipy.optimize import linear_sum_assignment

warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Keep the sector names for labelling; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Define the fuzzy clustering parameters
num_clusters = 5  # Define the number of clusters you want
num_iterations = 10
membership_results = []

# Step 3: Perform Fuzzy C-Means clustering and align clusters based on composition similarity
for i in range(num_iterations):
    fcm = FCM(n_clusters=num_clusters, m=2)
    fcm.fit(X_scaled)
    membership_matrix = fcm.u

    if i == 0:
        # The first run defines the reference cluster numbering
        reference_matrix = membership_matrix.copy()
    else:
        # Entry (r, c) of reference.T @ current measures how strongly
        # reference cluster r and current cluster c share members. Negating it
        # lets linear_sum_assignment (a minimiser) find the best matching.
        cost_matrix = -np.dot(reference_matrix.T, membership_matrix)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        # Reorder columns so column r lines up with reference cluster r
        membership_matrix = membership_matrix[:, col_ind]

    membership_results.append(membership_matrix)  # Store aligned membership matrix

# Step 4: Calculate the average membership matrix
avg_membership = np.mean(np.array(membership_results), axis=0)

# Step 5: Analyze consistent clusters
# For each sector, record the cluster with the highest average membership
sector_clusters = {}
for idx, sector in enumerate(sectors):
    max_cluster = np.argmax(avg_membership[idx])
    max_membership = avg_membership[idx][max_cluster]
    sector_clusters[sector] = (max_cluster, max_membership)

# Split rows by whether the strongest average membership exceeds 0.8.
# Working with row positions avoids re-searching the sector list for every
# printed row and keeps each row paired with its own membership values.
peak_cluster = np.argmax(avg_membership, axis=1)
peak_membership = avg_membership.max(axis=1)
is_consistent = peak_membership > 0.8
consistent_rows = np.flatnonzero(is_consistent)
inconsistent_rows = np.flatnonzero(~is_consistent)
consistent_sectors = [sectors[row] for row in consistent_rows]
inconsistent_sectors = [sectors[row] for row in inconsistent_rows]

# Step 6: Output results
print("Sectors with high membership consistency in a single cluster:")
for row in consistent_rows:
    print(f"{sectors[row]} - Cluster: {peak_cluster[row]}, Membership: {peak_membership[row]:.2f}")

print("\nSectors with mixed memberships across clusters (inconsistent):")
for row in inconsistent_rows:
    memberships = avg_membership[row]
    print(f"{sectors[row]} - Cluster memberships: {[f'{m:.2f}' for m in memberships]}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
import warnings

warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Keep the sector names for labelling; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Define the fuzzy clustering parameters
num_clusters = 2  # Define the number of clusters you want
num_iterations = 10
membership_results = []

# Define your seeding clusters (e.g., manually assigning sectors to clusters)
initial_seeds = {
    "Tech": 0,  # Example: Place "Tech" sector in cluster 0
    "Finance": 1,  # Place "Finance" sector in cluster 1
    # Add more sectors as needed
}

# Resolve the seeds to row positions once, before the clustering loop.
# A seed whose name is not in the data is reported and skipped instead of
# aborting the cell with a bare "x is not in list" ValueError; a cluster
# index outside 0..num_clusters-1 is a configuration error and is raised.
seed_rows = {}
for sector, cluster in initial_seeds.items():
    if not 0 <= cluster < num_clusters:
        raise ValueError(
            f"Seed '{sector}' targets cluster {cluster}, but only {num_clusters} clusters exist"
        )
    matches = np.flatnonzero(sectors == sector)
    if matches.size == 0:
        print(f"Skipping seed '{sector}': no such value in the 'Sector' column")
        continue
    seed_rows[int(matches[0])] = cluster

# Step 3: Perform Fuzzy C-Means clustering multiple times
for _ in range(num_iterations):
    fcm = FCM(n_clusters=num_clusters, m=2)
    fcm.fit(X_scaled)

    # Overwrite the seeded rows of the fitted membership matrix. This happens
    # after fit(), so it pins those rows but does not steer the clustering.
    for sector_idx, cluster in seed_rows.items():
        fcm.u[sector_idx] = 0  # Clear all memberships for the row
        fcm.u[sector_idx][cluster] = 1  # Full membership in the chosen cluster

    membership_results.append(fcm.u)  # Membership matrix for each iteration

# Step 4: Calculate the average membership matrix
# NOTE(review): columns are averaged by index without matching clusters
# between runs - confirm that FCM numbers its clusters the same way each run.
avg_membership = np.mean(np.array(membership_results), axis=0)

# Step 5: Analyze consistent clusters
# For each sector, record the cluster with the highest average membership
sector_clusters = {}
for idx, sector in enumerate(sectors):
    max_cluster = np.argmax(avg_membership[idx])
    max_membership = avg_membership[idx][max_cluster]
    sector_clusters[sector] = (max_cluster, max_membership)

# Split rows by whether the strongest average membership exceeds 0.8.
# Working with row positions avoids re-searching the sector list for every
# printed row and keeps each row paired with its own membership values.
peak_cluster = np.argmax(avg_membership, axis=1)
peak_membership = avg_membership.max(axis=1)
is_consistent = peak_membership > 0.8
consistent_rows = np.flatnonzero(is_consistent)
inconsistent_rows = np.flatnonzero(~is_consistent)
consistent_sectors = [sectors[row] for row in consistent_rows]
inconsistent_sectors = [sectors[row] for row in inconsistent_rows]

# Step 6: Output results
print("Sectors with high membership consistency in a single cluster:")
for row in consistent_rows:
    print(f"{sectors[row]} - Cluster: {peak_cluster[row]}, Membership: {peak_membership[row]:.2f}")

print("\nSectors with mixed memberships across clusters (inconsistent):")
for row in inconsistent_rows:
    memberships = avg_membership[row]
    print(f"{sectors[row]} - Cluster memberships: {[f'{m:.2f}' for m in memberships]}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
import warnings

warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

def get_consistent_clusters(df, num_clusters=2, num_iterations=10, membership_threshold=0.8):
    """
    Run Fuzzy C-Means several times and report rows with a high average membership.

    The 'Sector' column is dropped, the remaining columns are standardized,
    and FCM is fitted ``num_iterations`` times. The membership matrices of all
    runs are averaged element-wise, and every row whose average membership in
    a cluster reaches ``membership_threshold`` is listed under that cluster.

    Parameters:
    df (pandas.DataFrame): Input table; must contain a 'Sector' column.
    num_clusters (int): Number of clusters passed to FCM.
    num_iterations (int): Number of FCM fits to average over.
    membership_threshold (float): Minimum average membership (inclusive) for a row to be listed.

    Returns:
    dict: Maps a cluster index to the list of positional row indices that meet
    the threshold for it. Clusters with no qualifying rows are omitted.
    """
    # Standardize every column except the sector name
    features = df.drop('Sector', axis=1)
    standardized = StandardScaler().fit_transform(features)

    # Collect one membership matrix per FCM fit
    membership_runs = []
    for _ in range(num_iterations):
        model = FCM(n_clusters=num_clusters, m=2)
        model.fit(standardized)
        membership_runs.append(model.u)

    # Element-wise mean over the runs
    mean_membership = np.mean(np.array(membership_runs), axis=0)

    # Keep, per cluster column, the rows at or above the threshold
    rows_by_cluster = {}
    for cluster_idx in range(num_clusters):
        qualifies = mean_membership[:, cluster_idx] >= membership_threshold
        row_positions = np.flatnonzero(qualifies).tolist()
        if row_positions:
            rows_by_cluster[cluster_idx] = row_positions

    return rows_by_cluster

# Read the sector table and drop the columns that are not features
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])

# Row positions with a high average membership, grouped by cluster index
consistent_clusters = get_consistent_clusters(
    df,
    num_clusters=2,
    num_iterations=10,
    membership_threshold=0.8,
)

# List the sector names behind each cluster's row positions
print("Consistent Clusters:")
for cluster_idx, members in consistent_clusters.items():
    member_names = df['Sector'].iloc[members]
    print(f"Cluster {cluster_idx}: {', '.join(member_names)}")

# Every sector whose row position was not listed under any cluster
print("\nInconsistent Sectors:")
all_members = []
for members in consistent_clusters.values():
    all_members.extend(members)
inconsistent_sectors = [
    sector
    for idx, sector in enumerate(df['Sector'])
    if idx not in all_members
]
print(', '.join(inconsistent_sectors))

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
from collections import defaultdict
import warnings

warnings.filterwarnings("ignore")  # Keep the cell output free of library warnings

# Step 1: Read the sector table and keep only the feature columns
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])

# Sector names are kept aside; the remaining columns are the features
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Scale the features with StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Fuzzy clustering settings
num_clusters = 6
num_iterations = 10
cluster_assignments = []  # One {cluster label -> set of row positions} mapping per run

# Step 3: Fit FCM repeatedly and record which rows share a predicted label
for _ in range(num_iterations):
    fcm = FCM(n_clusters=num_clusters, m=2)
    fcm.fit(X_scaled)

    # Group row positions (one row per sector) by predicted label
    labels = fcm.predict(X_scaled)
    clusters = defaultdict(set)
    for row, label in enumerate(labels):
        clusters[label].add(row)
    cluster_assignments.append(clusters)

# Step 4: Calculate cluster similarity across iterations
def jaccard_similarity(set1, set2):
    """Return |set1 & set2| / |set1 | set2|, or 0 when the union is empty."""
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Two empty sets: avoid dividing by zero
        return 0
    shared = len(set1.intersection(set2))
    return shared / union_size

# Compare every cluster of run i with every cluster of each later run j and
# keep the pairs whose member sets overlap strongly (Jaccard > 0.8).
# Each entry is (run i, cluster in run i, run j, cluster in run j, similarity).
consistent_clusters = []
for i in range(num_iterations):
    for j in range(i + 1, num_iterations):
        for cluster1, members1 in cluster_assignments[i].items():
            for cluster2, members2 in cluster_assignments[j].items():
                similarity = jaccard_similarity(members1, members2)
                if similarity > 0.8:  # Threshold for consistency
                    consistent_clusters.append((i, cluster1, j, cluster2, similarity))

# Step 5: Assign sectors to consistent clusters
# NOTE(review): the collected labels come from different runs; whether label k
# means the same cluster in every run is not established here - confirm.
sector_clusters = {}
for i, sector in enumerate(sectors):
    cluster_memberships = []
    for run_a, cluster_a, run_b, cluster_b, _ in consistent_clusters:
        if i in cluster_assignments[run_a][cluster_a]:
            cluster_memberships.append(cluster_a)
        # cluster_b belongs to run_b, so it must be looked up in run_b's
        # assignments. Looking it up under run_a tests the wrong cluster and,
        # on a defaultdict, silently inserts an empty set for a missing key.
        if i in cluster_assignments[run_b][cluster_b]:
            cluster_memberships.append(cluster_b)

    # Keep the label seen most often; sectors in no consistent cluster are left out
    if cluster_memberships:
        most_frequent_cluster = max(set(cluster_memberships), key=cluster_memberships.count)
        sector_clusters[sector] = most_frequent_cluster

# Step 6: Output results
print("Sectors and their assigned consistent clusters:")
for sector, cluster in sector_clusters.items():
    print(f"{sector} - Cluster: {cluster}")

# (Optional) Further analysis of consistent_clusters to identify 
# the specific companies driving the consistent groupings

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
from sklearn.decomposition import PCA
from collections import defaultdict

# Step 1: Load and preprocess the data
df = pd.read_csv('sector_rankings.csv')
df = df.drop(columns=['Number_of_Companies', 'Date_Range'])  # Not used as features

# Keep the sector names for labelling; everything else is a feature
sectors = df['Sector'].values
X = df.drop('Sector', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Define the fuzzy clustering parameters
num_clusters = 6  # Define the number of clusters you want
num_iterations = 1

# Step 3: Perform Fuzzy C-Means clustering multiple times
sector_pairs = defaultdict(int)  # (row a, row b) -> number of runs in which they share a cluster
# `cluster_assignments` is deliberately NOT reset here: Step 6 indexes it with
# the run numbers stored in `consistent_clusters`, and both come from the
# earlier Jaccard-consistency cell. Resetting it to [] made Step 6 fail.

for _ in range(num_iterations):
    fcm = FCM(n_clusters=num_clusters, m=2)
    fcm.fit(X_scaled)

    # Group row positions by predicted label
    labels = fcm.predict(X_scaled)
    clusters = defaultdict(set)
    for i, label in enumerate(labels):
        clusters[label].add(i)

    # Count every ordered pair of distinct rows that share a cluster
    for members in clusters.values():
        for sector1 in members:
            for sector2 in members:
                if sector1 != sector2:
                    sector_pairs[(sector1, sector2)] += 1

# Step 4: Create the cluster strength matrix
num_sectors = len(sectors)
strength_matrix = np.zeros((num_sectors, num_sectors))
for (sector1, sector2), count in sector_pairs.items():
    strength_matrix[sector1, sector2] = count / num_iterations  # Fraction of runs sharing a cluster

# Step 5: Plot the heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(strength_matrix, cmap="YlGnBu", xticklabels=sectors, yticklabels=sectors)
plt.title("Cluster Strength Heatmap for Sectors")
plt.xlabel("Sectors")
plt.ylabel("Sectors")
plt.show()

# Step 6: Perform PCA for visualization of segregated sectors
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Rebuild the sector -> most frequent consistent cluster mapping from
# `consistent_clusters` and `cluster_assignments` (both from the earlier cell)
sector_clusters = {}

for i, sector in enumerate(sectors):
    # Collect cluster1 itself; the original collected a leftover loop
    # variable that held a set of row positions, which is unhashable.
    cluster_memberships = [
        cluster1
        for iteration, cluster1, _, _, _ in consistent_clusters
        if i in cluster_assignments[iteration][cluster1]
    ]
    if cluster_memberships:
        most_frequent_cluster = max(set(cluster_memberships), key=cluster_memberships.count)
        sector_clusters[sector] = most_frequent_cluster

# Plot the segregated sectors based on PCA results
plt.figure(figsize=(12, 8))
for cluster in sorted(set(sector_clusters.values())):
    # .get() leaves out sectors that were never part of a consistent cluster
    # instead of raising KeyError for them
    member_rows = [i for i, sec in enumerate(sectors) if sector_clusters.get(sec) == cluster]
    cluster_points = X_pca[member_rows]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster}")

plt.title("Segregated Sectors Based on Consistent Clustering")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Hard-coded cluster memberships for a hand-picked list of sectors
# (one list entry per sector, one key per cluster).
data = {
    'Sectors': [
        'Homebuilding',
        'Packaged Foods & Meats',
        'Movies & Entertainment',
        'Health Care Supplies',
        'Oil & Gas Equipment & Services',
        'Semiconductors',
        'Automobile Manufacturers',
        'Consumer Finance',
        'Health Care REITs',
        'Aerospace & Defense',
        'Systems Software',
        'Apparel, Accessories & Luxury Goods',
    ],
    'Cluster 1': [0.19, 0.18, 0.20, 0.18, 0.19, 0.19, 0.20, 0.16, 0.27, 0.20, 0.19, 0.19],
    'Cluster 2': [0.11, 0.17, 0.21, 0.11, 0.31, 0.20, 0.19, 0.11, 0.19, 0.39, 0.17, 0.23],
    'Cluster 3': [0.21, 0.22, 0.20, 0.22, 0.27, 0.21, 0.20, 0.22, 0.05, 0.30, 0.22, 0.22],
    'Cluster 4': [0.31, 0.16, 0.18, 0.26, 0.14, 0.19, 0.20, 0.16, 0.28, 0.10, 0.21, 0.17],
    'Cluster 5': [0.18, 0.27, 0.21, 0.24, 0.09, 0.20, 0.20, 0.34, 0.21, 0.01, 0.21, 0.19],
}

# Sector names become the row index so seaborn uses them as y tick labels
df = pd.DataFrame(data).set_index('Sectors')

# Annotated heatmap: rows are sectors, columns are clusters
plt.figure(figsize=(12, 8))
sns.heatmap(
    df,
    annot=True,
    cmap='YlGnBu',
    linewidths=0.5,
    fmt=".2f",
    cbar_kws={'label': 'Membership Strength'},
)
plt.title("Sector Membership Strengths Across Clusters")
plt.xlabel("Clusters")
plt.ylabel("Sectors")
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Create a DataFrame with the average membership values
# `avg_membership` and `sectors` come from whichever clustering cell ran last.
# The column labels are sized from the matrix itself: the global `num_clusters`
# is reassigned by several cells and may not match this matrix's width, which
# would make the DataFrame constructor fail on a shape mismatch.
membership_cluster_count = avg_membership.shape[1]
membership_df = pd.DataFrame(
    avg_membership,
    columns=[f"Cluster {i+1}" for i in range(membership_cluster_count)],
    index=sectors,
)

# Step 2: Create the heatmap (rows are sectors, columns are clusters)
plt.figure(figsize=(12, 8))
sns.heatmap(membership_df, annot=True, cmap="YlGnBu", cbar=True, fmt=".2f", linewidths=0.5)
plt.title('Fuzzy C-Means Membership Heatmap')
plt.xlabel('Clusters')
plt.ylabel('Sectors')
plt.tight_layout()
plt.show()
