In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt


In [None]:
# Load one sector's quarter-end market-cap CSV for a quick visual check; the
# bare name on the last line makes the notebook display the frame.
d=pd.read_csv('sector_mkt_cap_results/Aerospace & Defense_mkt_cap_quarter_end.csv')
d

In [None]:

def calculate_yoy_growth(df):
    """Add a per-company Year-over-Year (YoY) market-cap growth column.

    The growth is the percentage change of ``MarketCap`` against the row four
    positions earlier (rows are assumed to be quarterly and chronologically
    ordered within each company -- TODO confirm against the CSV layout).
    When a ``Ticker`` column is present the change is computed inside each
    ticker, so one company's market cap is never compared with another's;
    without that column the whole frame is treated as a single series.

    The frame is modified in place: a ``YoY_Growth`` column (in percent) is
    added and rows where it is NaN (e.g. the first four rows of every series)
    are dropped.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    if 'Ticker' in df.columns:
        # Group first: a plain pct_change over the stacked frame would pair a
        # company's rows with rows that belong to a different company.
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']

    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100  # YoY percentage change
    df.dropna(subset=['YoY_Growth'], inplace=True)  # Drop rows with NaN YoY growth
    return df

def apply_log10_transformation(df):
    """Add a ``Log_YoY_Growth`` column: log10 of the YoY growth shifted by 100.

    ``YoY_Growth`` is read as a percentage and 100 is added before the
    logarithm is taken, i.e. the stored value is log10(100 + growth %).
    The frame is modified in place and returned.
    """
    shifted_growth = df['YoY_Growth'] + 100
    df['Log_YoY_Growth'] = np.log10(shifted_growth)
    return df

def plot_yoy_growth_with_log(df, sector):
    """Show two side-by-side line charts of a sector's YoY growth.

    The left panel draws ``YoY_Growth`` and the right panel draws
    ``Log_YoY_Growth``; each panel has one line per distinct ``Ticker``
    plotted against ``Date``.

    Args:
        df: DataFrame with ``Ticker``, ``Date``, ``YoY_Growth`` and
            ``Log_YoY_Growth`` columns.
        sector: Sector name used in the panel titles.
    """
    tickers = df['Ticker'].unique()

    fig, axs = plt.subplots(1, 2, figsize=(15, 6))

    # (column to draw, panel title, y-axis label) for the left and right panel.
    panels = (
        ('YoY_Growth', f'Percentage YoY Growth for {sector} Sector', 'YoY Growth (%)'),
        ('Log_YoY_Growth', f'Log10 YoY Growth for {sector} Sector', 'Log10 YoY Growth'),
    )

    for ax, (column, title, y_label) in zip(axs, panels):
        for ticker in tickers:
            rows = df[df['Ticker'] == ticker]
            ax.plot(rows['Date'], rows[column], label=ticker)

        ax.set_title(title)
        ax.set_xlabel('Date')
        ax.set_ylabel(y_label)
        ax.legend(loc='best')
        ax.grid(True)

    plt.tight_layout()  # keep the two panels from overlapping
    plt.show()

def main():
    """Plot percentage and log10 YoY growth for every sector CSV in the input folder."""
    input_dir = "sector_mkt_cap_results"

    for sector_file in os.listdir(input_dir):
        # Only the CSV exports are processed; anything else is ignored.
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])

        # Derive the growth columns, then chart them.
        df = calculate_yoy_growth(df)
        df = apply_log10_transformation(df)
        plot_yoy_growth_with_log(df, sector)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

def calculate_yoy_growth(df):
    """Add a per-company Year-over-Year (YoY) market-cap growth column.

    The growth is the percentage change of ``MarketCap`` against the row four
    positions earlier (rows are assumed to be quarterly and chronologically
    ordered within each company -- TODO confirm against the CSV layout).
    When a ``Ticker`` column is present the change is computed inside each
    ticker, so one company's market cap is never compared with another's;
    without that column the whole frame is treated as a single series.

    The frame is modified in place: a ``YoY_Growth`` column (in percent) is
    added and rows where it is NaN (e.g. the first four rows of every series)
    are dropped.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    if 'Ticker' in df.columns:
        # Group first: a plain pct_change over the stacked frame would pair a
        # company's rows with rows that belong to a different company.
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']

    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100  # YoY percentage change
    df.dropna(subset=['YoY_Growth'], inplace=True)  # Drop rows with NaN YoY growth
    return df

def apply_log10_transformation(df):
    """Add a ``Log_YoY_Growth`` column: log10 of the YoY growth shifted by 100.

    ``YoY_Growth`` is read as a percentage and 100 is added before the
    logarithm is taken, i.e. the stored value is log10(100 + growth %).
    The frame is modified in place and returned.
    """
    shifted_growth = df['YoY_Growth'] + 100
    df['Log_YoY_Growth'] = np.log10(shifted_growth)
    return df

def calculate_sector_leader_and_rank(df, sector):
    """Rank a sector's companies by how often they beat the sector average.

    For every ``Date`` the mean ``YoY_Growth`` over all rows is used as the
    sector average.  A company scores one point for each of its rows whose
    growth is strictly above the average of that row's date.  The leader and
    the full ranking are printed.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sector: Sector name, used only in the printed report.

    Returns:
        ``(sector_leader, sorted_companies)``.  ``sorted_companies`` is a list
        of ``(ticker, count)`` pairs in descending count order (ties keep the
        order in which tickers first appear in ``df``); ``sector_leader`` is
        the first ticker of that list, or ``None`` when ``df`` contains no
        companies.
    """
    # Average YoY growth of the sector for each date.
    sector_avg = df.groupby('Date')['YoY_Growth'].mean()

    overperformance_counts = {}
    for ticker in df['Ticker'].unique():
        company_data = df[df['Ticker'] == ticker].set_index('Date')
        # Index alignment on Date pairs every row with the average of its own date.
        company_data['Sector_Avg'] = sector_avg
        beats_sector = company_data['YoY_Growth'] > company_data['Sector_Avg']
        overperformance_counts[ticker] = beats_sector.sum()

    sorted_companies = sorted(overperformance_counts.items(), key=lambda item: item[1], reverse=True)

    # Without any company there is no leader; report it instead of indexing
    # into an empty list.
    if not sorted_companies:
        print(f"No companies with YoY growth data in {sector} sector.")
        print("\n" + "="*50 + "\n")
        return None, sorted_companies

    sector_leader = sorted_companies[0][0]

    print(f"Leader for {sector} sector: {sector_leader}")
    print(f"Descending order of companies by overperformance in {sector} sector:")
    for company, count in sorted_companies:
        print(f"{company}: {count} times overperformed")
    print("\n" + "="*50 + "\n")

    return sector_leader, sorted_companies

def plot_yoy_growth_with_log(df, sector):
    """Show two side-by-side line charts of a sector's YoY growth.

    The left panel draws ``YoY_Growth`` and the right panel draws
    ``Log_YoY_Growth``; each panel has one line per distinct ``Ticker``
    plotted against ``Date``.

    Args:
        df: DataFrame with ``Ticker``, ``Date``, ``YoY_Growth`` and
            ``Log_YoY_Growth`` columns.
        sector: Sector name used in the panel titles.
    """
    tickers = df['Ticker'].unique()

    fig, axs = plt.subplots(1, 2, figsize=(15, 6))

    # (column to draw, panel title, y-axis label) for the left and right panel.
    panels = (
        ('YoY_Growth', f'Percentage YoY Growth for {sector} Sector', 'YoY Growth (%)'),
        ('Log_YoY_Growth', f'Log10 YoY Growth for {sector} Sector', 'Log10 YoY Growth'),
    )

    for ax, (column, title, y_label) in zip(axs, panels):
        for ticker in tickers:
            rows = df[df['Ticker'] == ticker]
            ax.plot(rows['Date'], rows[column], label=ticker)

        ax.set_title(title)
        ax.set_xlabel('Date')
        ax.set_ylabel(y_label)
        ax.legend(loc='best')
        ax.grid(True)

    plt.tight_layout()  # keep the two panels from overlapping
    plt.show()

def main():
    """Rank the companies of every sector CSV and chart their YoY growth."""
    input_dir = "sector_mkt_cap_results"

    for sector_file in os.listdir(input_dir):
        # Only the CSV exports are processed; anything else is ignored.
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])

        # Growth columns first: both the ranking and the charts read them.
        df = calculate_yoy_growth(df)
        df = apply_log10_transformation(df)

        # The ranking is printed by the helper; its return values are not used further here.
        sector_leader, sorted_companies = calculate_sector_leader_and_rank(df, sector)

        plot_yoy_growth_with_log(df, sector)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

def calculate_yoy_growth(df):
    """Add a per-company Year-over-Year (YoY) market-cap growth column.

    The growth is the percentage change of ``MarketCap`` against the row four
    positions earlier (rows are assumed to be quarterly and chronologically
    ordered within each company -- TODO confirm against the CSV layout).
    When a ``Ticker`` column is present the change is computed inside each
    ticker, so one company's market cap is never compared with another's;
    without that column the whole frame is treated as a single series.

    The frame is modified in place: a ``YoY_Growth`` column (in percent) is
    added and rows where it is NaN (e.g. the first four rows of every series)
    are dropped.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    if 'Ticker' in df.columns:
        # Group first: a plain pct_change over the stacked frame would pair a
        # company's rows with rows that belong to a different company.
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']

    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100  # YoY percentage change
    df.dropna(subset=['YoY_Growth'], inplace=True)  # Drop rows with NaN YoY growth
    return df

def calculate_sector_leader_and_rank(df):
    """Rank companies by how often their YoY growth beats the sector average.

    The sector average is the mean ``YoY_Growth`` per ``Date``.  Each company
    gets one point per row whose growth is strictly above the average of that
    row's date.

    Returns:
        List of ``(ticker, count)`` pairs sorted by count, highest first; ties
        keep the order in which tickers first appear in ``df``.
    """
    per_date_mean = df.groupby('Date')['YoY_Growth'].mean()

    tally = {}
    for symbol in df['Ticker'].unique():
        rows = df[df['Ticker'] == symbol].set_index('Date')
        # Aligns on the Date index, so each row sees its own date's average.
        rows['Sector_Avg'] = per_date_mean
        beats_sector = rows['YoY_Growth'] > rows['Sector_Avg']
        tally[symbol] = beats_sector.sum()

    ranking = list(tally.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

def calculate_weighted_average_growth(df, sorted_companies):
    """Combine company YoY growth into one rank-weighted series per date.

    The first entry of ``sorted_companies`` gets weight ``n`` (the number of
    entries), the next ``n - 1`` and so on down to 1.  For every date the
    weighted growths of all companies are summed and divided by the total
    weight ``n + (n - 1) + ... + 1``; a company without a row for a date
    contributes nothing to that date's sum.

    Args:
        df: DataFrame with ``Ticker``, ``Date`` and ``YoY_Growth`` columns.
        sorted_companies: Sequence of ``(ticker, score)`` pairs, best first.
            Only the order is used; tickers of ``df`` that are not listed are
            skipped with a printed warning.

    Returns:
        DataFrame with columns ``Date`` and ``Final_Weighted_YoY_Growth``.
        It is empty (same two columns) when no company could be weighted,
        which is the case callers test for with ``.empty``.
    """
    n_companies = len(sorted_companies)
    total_weight = sum(range(1, n_companies + 1))  # e.g. 3 + 2 + 1
    company_weights = {
        entry[0]: weight
        for entry, weight in zip(sorted_companies, range(n_companies, 0, -1))
    }

    print(f"Company Weights: {company_weights}")  # Debugging: Print weights

    # One 'Weighted_YoY_Growth*' column per company, outer-joined on Date.
    weighted_yoy_growth = None

    for ticker in df['Ticker'].unique():
        if ticker not in company_weights:
            print(f"Warning: No weight found for {ticker}, skipping.")
            continue

        company_data = df[df['Ticker'] == ticker].copy()
        company_data['Weight'] = company_weights[ticker]
        company_data['Weighted_YoY_Growth'] = company_data['YoY_Growth'] * company_data['Weight']
        contribution = company_data[['Date', 'Weighted_YoY_Growth']]

        if weighted_yoy_growth is None:
            weighted_yoy_growth = contribution.copy()
        else:
            # The suffix keeps every company's column distinct after the join.
            weighted_yoy_growth = pd.merge(
                weighted_yoy_growth,
                contribution,
                on='Date',
                how='outer',
                suffixes=('', f'_{ticker}')
            )

    if weighted_yoy_growth is None:
        # Nothing was accumulated (no rows or no matching weights): hand back
        # an empty result instead of failing on the missing 'Date' column.
        empty_result = df[['Date']].iloc[0:0].copy()
        empty_result['Final_Weighted_YoY_Growth'] = pd.Series(dtype='float64')
        return empty_result

    # Sum across all companies for each date (missing values count as 0).
    weighted_yoy_growth['Weighted_YoY_Growth_Total'] = weighted_yoy_growth.filter(like='Weighted_YoY_Growth').sum(axis=1)

    # Normalize by the total weight
    weighted_yoy_growth['Final_Weighted_YoY_Growth'] = weighted_yoy_growth['Weighted_YoY_Growth_Total'] / total_weight

    # Debugging: print the first few rows to ensure data exists
    print("First few rows of weighted YoY growth (after calculation):")
    print(weighted_yoy_growth.head())

    return weighted_yoy_growth[['Date', 'Final_Weighted_YoY_Growth']]



def plot_weighted_index(weighted_yoy_growth, sector, sorted_companies):
    """Chart the sector's weighted YoY growth index, plus per-company lines when available.

    A dashed line is added for a company only if the frame carries a column
    named ``Weighted_YoY_Growth_<ticker>`` for it; companies without such a
    column are silently left out.

    Args:
        weighted_yoy_growth: DataFrame with ``Date`` and
            ``Final_Weighted_YoY_Growth`` columns.  An empty frame only
            prints a notice.
        sector: Sector name used in labels and the title.
        sorted_companies: Sequence of ``(ticker, score)`` pairs; only the
            tickers are used.
    """
    if weighted_yoy_growth.empty:
        print(f"No data to plot for {sector}.")
        return

    plt.figure(figsize=(12, 8))

    # Solid blue line: the sector-level index.
    plt.plot(
        weighted_yoy_growth['Date'],
        weighted_yoy_growth['Final_Weighted_YoY_Growth'],
        label=f'{sector} Sector Index',
        color='blue',
        linewidth=2,
    )

    # Dashed lines: one per company that has its own column in the frame.
    for company, _ in sorted_companies:
        company_col = f'Weighted_YoY_Growth_{company}'
        if company_col not in weighted_yoy_growth.columns:
            continue
        plt.plot(
            weighted_yoy_growth['Date'],
            weighted_yoy_growth[company_col],
            label=f'{company} YoY Growth',
            linestyle='--',
            alpha=0.8,
        )

    plt.title(f'Weighted YoY Growth Index and Company Growth for {sector} Sector')
    plt.xlabel('Date')
    plt.ylabel('YoY Growth (%)')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.show()



def main():
    """Build and chart the rank-weighted YoY growth index of every sector CSV."""
    input_dir = "sector_mkt_cap_results"

    for sector_file in os.listdir(input_dir):
        # Only the CSV exports are processed; anything else is ignored.
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])
        df = calculate_yoy_growth(df)

        # Ranking -> rank-based weights -> weighted index -> chart.
        sorted_companies = calculate_sector_leader_and_rank(df)
        weighted_yoy_growth = calculate_weighted_average_growth(df, sorted_companies)
        plot_weighted_index(weighted_yoy_growth, sector, sorted_companies)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

def calculate_yoy_growth(df):
    """Add a per-company Year-over-Year (YoY) market-cap growth column.

    The growth is the percentage change of ``MarketCap`` against the row four
    positions earlier (rows are assumed to be quarterly and chronologically
    ordered within each company -- TODO confirm against the CSV layout).
    When a ``Ticker`` column is present the change is computed inside each
    ticker, so one company's market cap is never compared with another's;
    without that column the whole frame is treated as a single series.

    The frame is modified in place: a ``YoY_Growth`` column (in percent) is
    added and rows where it is NaN (e.g. the first four rows of every series)
    are dropped.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    if 'Ticker' in df.columns:
        # Group first: a plain pct_change over the stacked frame would pair a
        # company's rows with rows that belong to a different company.
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']

    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100  # YoY percentage change
    df.dropna(subset=['YoY_Growth'], inplace=True)  # Drop rows with NaN YoY growth
    return df

def apply_log10_transformation(df):
    """Add a ``Log_YoY_Growth`` column: log10 of the YoY growth shifted by 100.

    ``YoY_Growth`` is read as a percentage and 100 is added before the
    logarithm is taken, i.e. the stored value is log10(100 + growth %).
    The frame is modified in place and returned.
    """
    shifted_growth = df['YoY_Growth'] + 100
    df['Log_YoY_Growth'] = np.log10(shifted_growth)
    return df

def calculate_sector_leader_and_rank(df, sector):
    """Rank a sector's companies by how often they beat the sector average.

    For every ``Date`` the mean ``YoY_Growth`` over all rows is used as the
    sector average.  A company scores one point for each of its rows whose
    growth is strictly above the average of that row's date.  The leader and
    the full ranking are printed.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sector: Sector name, used only in the printed report.

    Returns:
        ``(sector_leader, sorted_companies, total_overperformance)``.
        ``sorted_companies`` is a list of ``(ticker, count)`` pairs in
        descending count order (ties keep first-appearance order),
        ``sector_leader`` is its first ticker and ``total_overperformance``
        is the sum of all counts.  For a frame without companies the result
        is ``(None, [], 0)``.
    """
    sector_avg = df.groupby('Date')['YoY_Growth'].mean()

    overperformance_counts = {}
    for ticker in df['Ticker'].unique():
        company_data = df[df['Ticker'] == ticker].set_index('Date')
        # Index alignment on Date pairs every row with the average of its own date.
        company_data['Sector_Avg'] = sector_avg
        beats_sector = company_data['YoY_Growth'] > company_data['Sector_Avg']
        overperformance_counts[ticker] = beats_sector.sum()

    sorted_companies = sorted(overperformance_counts.items(), key=lambda item: item[1], reverse=True)

    # Without any company there is no leader; report it instead of indexing
    # into an empty list.
    if not sorted_companies:
        print(f"No companies with YoY growth data in {sector} sector.")
        return None, sorted_companies, 0

    sector_leader = sorted_companies[0][0]
    total_overperformance = sum(count for _, count in sorted_companies)

    # Print sector leaders and overperformance counts
    print(f"Leader for {sector} sector: {sector_leader}")
    print(f"Descending order of companies by overperformance in {sector} sector:")
    for company, count in sorted_companies:
        print(f"{company}: {count} times overperformed")

    return sector_leader, sorted_companies, total_overperformance

def calculate_sector_index(df, sorted_companies, total_overperformance):
    """Build a sector index as a weighted sum of company YoY growth per date.

    Each company's weight is its overperformance count divided by
    ``total_overperformance``.  When that total is zero (no company ever beat
    the average, e.g. a single-company sector) the weights would be 0/0, so
    every listed company gets an equal share instead.  A company without a
    row for a date contributes nothing to that date's sum.

    Args:
        df: DataFrame with ``Ticker``, ``Date`` and ``YoY_Growth`` columns.
        sorted_companies: Sequence of ``(ticker, count)`` pairs.  Tickers of
            ``df`` that are not listed are ignored.
        total_overperformance: Sum of all counts, used as the denominator.

    Returns:
        DataFrame with columns ``Date`` and ``Sector_Index``; empty (same
        columns) when no company contributed.
    """
    if total_overperformance:
        fractional_contribution = {company: count / total_overperformance for company, count in sorted_companies}
    elif sorted_companies:
        # Zero total: fall back to equal weights rather than dividing by zero.
        equal_share = 1 / len(sorted_companies)
        fractional_contribution = {company: equal_share for company, _ in sorted_companies}
    else:
        fractional_contribution = {}

    # One 'Weighted_YoY_Growth*' column per company, outer-joined on Date.
    sector_index = None

    for ticker in df['Ticker'].unique():
        if ticker not in fractional_contribution:
            continue

        company_data = df[df['Ticker'] == ticker].copy()
        company_data['Weighted_YoY_Growth'] = company_data['YoY_Growth'] * fractional_contribution[ticker]
        contribution = company_data[['Date', 'Weighted_YoY_Growth']]

        if sector_index is None:
            sector_index = contribution.copy()
        else:
            # The suffix keeps every company's column distinct after the join.
            sector_index = pd.merge(sector_index, contribution,
                                    on='Date', how='outer', suffixes=('', f'_{ticker}'))

    if sector_index is None:
        # Nothing was accumulated: return an empty result instead of failing
        # on the missing 'Date' column.
        empty_index = df[['Date']].iloc[0:0].copy()
        empty_index['Sector_Index'] = pd.Series(dtype='float64')
        return empty_index

    # Sum across all companies for each date (missing values count as 0).
    sector_index['Sector_Index'] = sector_index.filter(like='Weighted_YoY_Growth').sum(axis=1)

    return sector_index[['Date', 'Sector_Index']]

def plot_sector_index(sector_index, sector):
    """Draw the sector index as a single line over time.

    Args:
        sector_index: DataFrame with ``Date`` and ``Sector_Index`` columns.
            An empty frame only prints a notice.
        sector: Sector name used in the label and the title.
    """
    if sector_index.empty:
        print(f"No data to plot for {sector}.")
        return

    plt.figure(figsize=(10, 6))
    plt.plot(
        sector_index['Date'],
        sector_index['Sector_Index'],
        label=f'{sector} Sector Index',
        color='blue',
        linewidth=2,
    )

    # Cosmetics: title, axis labels, grid and slanted date ticks.
    plt.title(f'Sector Index for {sector} Sector')
    plt.xlabel('Date')
    plt.ylabel('Index Value')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def main():
    """Build and chart the overperformance-weighted index of every sector CSV."""
    input_dir = "sector_mkt_cap_results"

    for sector_file in os.listdir(input_dir):
        # Only the CSV exports are processed; anything else is ignored.
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])

        # Growth columns are needed by every later step.
        df = calculate_yoy_growth(df)
        df = apply_log10_transformation(df)

        # Ranking supplies the per-company counts and their total ...
        sector_leader, sorted_companies, total_overperformance = calculate_sector_leader_and_rank(df, sector)

        # ... which become the fractional weights of the index.
        sector_index = calculate_sector_index(df, sorted_companies, total_overperformance)

        plot_sector_index(sector_index, sector)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

def calculate_yoy_growth(df):
    """Add a per-company Year-over-Year (YoY) market-cap growth column.

    The growth is the percentage change of ``MarketCap`` against the row four
    positions earlier (rows are assumed to be quarterly and chronologically
    ordered within each company -- TODO confirm against the CSV layout).
    When a ``Ticker`` column is present the change is computed inside each
    ticker, so one company's market cap is never compared with another's;
    without that column the whole frame is treated as a single series.

    The frame is modified in place: a ``YoY_Growth`` column (in percent) is
    added and rows where it is NaN (e.g. the first four rows of every series)
    are dropped.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    if 'Ticker' in df.columns:
        # Group first: a plain pct_change over the stacked frame would pair a
        # company's rows with rows that belong to a different company.
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']

    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100  # YoY percentage change
    df.dropna(subset=['YoY_Growth'], inplace=True)  # Drop rows with NaN YoY growth
    return df

def apply_log10_transformation(df):
    """Add a ``Log_YoY_Growth`` column: log10 of the YoY growth shifted by 100.

    ``YoY_Growth`` is read as a percentage and 100 is added before the
    logarithm is taken, i.e. the stored value is log10(100 + growth %).
    The frame is modified in place and returned.
    """
    shifted_growth = df['YoY_Growth'] + 100
    df['Log_YoY_Growth'] = np.log10(shifted_growth)
    return df

def calculate_sector_leader_and_rank(df, sector):
    """Rank a sector's companies by how often they beat the sector average.

    For every ``Date`` the mean ``YoY_Growth`` over all rows is used as the
    sector average.  A company scores one point for each of its rows whose
    growth is strictly above the average of that row's date.  The leader and
    the full ranking are printed.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``YoY_Growth`` columns.
        sector: Sector name, used only in the printed report.

    Returns:
        ``(sector_leader, sorted_companies, total_overperformance)``.
        ``sorted_companies`` is a list of ``(ticker, count)`` pairs in
        descending count order (ties keep first-appearance order),
        ``sector_leader`` is its first ticker and ``total_overperformance``
        is the sum of the *squared* counts.  For a frame without companies
        the result is ``(None, [], 0)``.
    """
    sector_avg = df.groupby('Date')['YoY_Growth'].mean()

    overperformance_counts = {}
    for ticker in df['Ticker'].unique():
        company_data = df[df['Ticker'] == ticker].set_index('Date')
        # Index alignment on Date pairs every row with the average of its own date.
        company_data['Sector_Avg'] = sector_avg
        beats_sector = company_data['YoY_Growth'] > company_data['Sector_Avg']
        overperformance_counts[ticker] = beats_sector.sum()

    sorted_companies = sorted(overperformance_counts.items(), key=lambda item: item[1], reverse=True)

    # Without any company there is no leader; report it instead of indexing
    # into an empty list.
    if not sorted_companies:
        print(f"No companies with YoY growth data in {sector} sector.")
        return None, sorted_companies, 0

    sector_leader = sorted_companies[0][0]

    # Squared counts: the denominator for weights that are themselves squared counts.
    total_overperformance = sum(count**2 for _, count in sorted_companies)

    print(f"Leader for {sector} sector: {sector_leader}")
    print(f"Descending order of companies by overperformance in {sector} sector:")
    for company, count in sorted_companies:
        print(f"{company}: {count} times overperformed")

    return sector_leader, sorted_companies, total_overperformance

def calculate_sector_index(df, sorted_companies, total_overperformance):
    """Build a sector index as a weighted sum of company YoY growth per date.

    Each company's weight is its squared overperformance count divided by
    ``total_overperformance``.  When that total is zero (no company ever beat
    the average, e.g. a single-company sector) the weights would be 0/0, so
    every listed company gets an equal share instead.  A company without a
    row for a date contributes nothing to that date's sum.

    Args:
        df: DataFrame with ``Ticker``, ``Date`` and ``YoY_Growth`` columns.
        sorted_companies: Sequence of ``(ticker, count)`` pairs.  Tickers of
            ``df`` that are not listed are ignored.
        total_overperformance: Denominator of the weights (the caller in this
            cell passes the sum of squared counts).

    Returns:
        DataFrame with columns ``Date`` and ``Sector_Index``; empty (same
        columns) when no company contributed.
    """
    if total_overperformance:
        fractional_contribution = {company: count**2 / total_overperformance for company, count in sorted_companies}
    elif sorted_companies:
        # Zero total: fall back to equal weights rather than dividing by zero.
        equal_share = 1 / len(sorted_companies)
        fractional_contribution = {company: equal_share for company, _ in sorted_companies}
    else:
        fractional_contribution = {}

    # One 'Weighted_YoY_Growth*' column per company, outer-joined on Date.
    sector_index = None

    for ticker in df['Ticker'].unique():
        if ticker not in fractional_contribution:
            continue

        company_data = df[df['Ticker'] == ticker].copy()
        company_data['Weighted_YoY_Growth'] = company_data['YoY_Growth'] * fractional_contribution[ticker]
        contribution = company_data[['Date', 'Weighted_YoY_Growth']]

        if sector_index is None:
            sector_index = contribution.copy()
        else:
            # The suffix keeps every company's column distinct after the join.
            sector_index = pd.merge(sector_index, contribution,
                                    on='Date', how='outer', suffixes=('', f'_{ticker}'))

    if sector_index is None:
        # Nothing was accumulated: return an empty result instead of failing
        # on the missing 'Date' column.
        empty_index = df[['Date']].iloc[0:0].copy()
        empty_index['Sector_Index'] = pd.Series(dtype='float64')
        return empty_index

    # Sum across all companies for each date (missing values count as 0).
    sector_index['Sector_Index'] = sector_index.filter(like='Weighted_YoY_Growth').sum(axis=1)

    return sector_index[['Date', 'Sector_Index']]

def calculate_simple_average_index(df):
    """Return the unweighted mean ``YoY_Growth`` per ``Date``.

    Returns:
        DataFrame with columns ``Date`` and ``Simple_Avg_Index``, one row per
        distinct date.
    """
    per_date_mean = df.groupby('Date')['YoY_Growth'].mean()
    # Naming the series first gives the value column its final name on reset.
    return per_date_mean.rename('Simple_Avg_Index').reset_index()

def plot_sector_index(sector_index, simple_avg_index, df, sector):
    """Chart both sector indices on top of the individual company growth lines.

    Args:
        sector_index: DataFrame with ``Date`` and ``Sector_Index`` columns
            (weighted index).  An empty frame only prints a notice.
        simple_avg_index: DataFrame with ``Date`` and ``Simple_Avg_Index``.
        df: Company-level DataFrame with ``Ticker``, ``Date`` and
            ``YoY_Growth`` columns.
        sector: Sector name used in labels and the title.
    """
    if sector_index.empty:
        print(f"No data to plot for {sector}.")
        return

    plt.figure(figsize=(12, 8))

    # Dashed background lines: one per company.
    for ticker in df['Ticker'].unique():
        rows = df[df['Ticker'] == ticker]
        plt.plot(rows['Date'], rows['YoY_Growth'], label=f"{ticker} YoY Growth", linestyle='--')

    # Thick foreground lines: weighted index (blue) and simple average (red).
    index_lines = (
        (sector_index, 'Sector_Index', f'{sector} Weighted Sector Index', 'blue'),
        (simple_avg_index, 'Simple_Avg_Index', f'{sector} Simple Average Index', 'red'),
    )
    for frame, column, label, color in index_lines:
        plt.plot(frame['Date'], frame[column], label=label, color=color, linewidth=2)

    plt.title(f'Sector Index for {sector} Sector (Weighted vs Simple Average)')
    plt.xlabel('Date')
    plt.ylabel('YoY Growth / Index Value')
    plt.legend(loc='best')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def main():
    """Compute, save and chart the weighted and simple-average index of every sector CSV.

    For each ``*.csv`` in ``sector_mkt_cap_results`` the two indices are
    merged on ``Date`` and written to
    ``sector_wise_index/<sector>_sector_index.csv`` before being plotted.
    """
    input_dir = "sector_mkt_cap_results"
    output_dir = "sector_wise_index"  # where the per-sector index CSVs go

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for sector_file in os.listdir(input_dir):
        # Only the CSV exports are processed; anything else is ignored.
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        df = pd.read_csv(os.path.join(input_dir, sector_file), parse_dates=['Date'])

        # Growth columns are needed by every later step.
        df = calculate_yoy_growth(df)
        df = apply_log10_transformation(df)

        sector_leader, sorted_companies, total_overperformance = calculate_sector_leader_and_rank(df, sector)

        # The two competing indices.
        sector_weighted_index = calculate_sector_index(df, sorted_companies, total_overperformance)
        simple_avg_index = calculate_simple_average_index(df)

        # Persist both side by side, one row per date.
        combined_index = pd.merge(sector_weighted_index, simple_avg_index, on='Date', how='outer')
        output_file_path = os.path.join(output_dir, f"{sector}_sector_index.csv")
        combined_index.to_csv(output_file_path, index=False)
        print(f"Saved sector index for {sector} to {output_file_path}")

        plot_sector_index(sector_weighted_index, simple_avg_index, df, sector)


if __name__ == "__main__":
    main()



In [None]:
import yfinance as yf
import pandas as pd

# Ticker symbol whose financial statements are fetched through yfinance.
ticker = "AAPL"  # Example: Apple Inc.
stock = yf.Ticker(ticker)

# Quarterly financials, transposed with .T (presumably so that each row is a
# reporting period and each column a line item -- TODO confirm the layout
# yfinance returns).
quarterly_financials = stock.quarterly_financials.T

# Keep only the 'Total Revenue' column (double brackets -> still a DataFrame).
# NOTE(review): this is a column selection of quarterly_financials, and the
# following cell assigns new columns into it.
revenue_data = quarterly_financials[['Total Revenue']]

print(revenue_data)


In [None]:
import numpy as np

# Work on an independent copy: revenue_data was produced by selecting a column
# out of quarterly_financials, and assigning into such a selection triggers
# pandas' SettingWithCopyWarning (the write may or may not reach the parent).
revenue_data = revenue_data.copy()

# Convert revenue to numeric values; anything unparsable becomes NaN.
revenue_data['Total Revenue'] = pd.to_numeric(revenue_data['Total Revenue'], errors='coerce')

# Sort by the index so that shifting looks back in time
# (assumes the index holds the reporting dates -- TODO confirm).
revenue_data = revenue_data.sort_index()

# Revenue four rows (quarters) earlier, i.e. the same quarter one year before.
previous_year_revenue = revenue_data['Total Revenue'].shift(4)
revenue_data['YoY Growth'] = (revenue_data['Total Revenue'] - previous_year_revenue) / previous_year_revenue * 100

# Drop rows with NaN values: the first four rows have nothing to compare with.
revenue_data = revenue_data.dropna()

print(revenue_data)


In [None]:
import pandas as pd
import numpy as np
import os

def calculate_yoy_growth(df):
    """Add a per-company Year-over-Year (YoY) market-cap growth column.

    The growth is the percentage change of ``MarketCap`` against the row four
    positions earlier (rows are assumed to be quarterly and chronologically
    ordered within each company -- TODO confirm against the CSV layout).
    When a ``Ticker`` column is present the change is computed inside each
    ticker, so one company's market cap is never compared with another's;
    without that column the whole frame is treated as a single series.

    The frame is modified in place: a ``YoY_Growth`` column (in percent) is
    added and rows where it is NaN (e.g. the first four rows of every series)
    are dropped.

    Args:
        df: DataFrame with a ``MarketCap`` column and, optionally, ``Ticker``.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    if 'Ticker' in df.columns:
        # Group first: a plain pct_change over the stacked frame would pair a
        # company's rows with rows that belong to a different company.
        market_cap = df.groupby('Ticker')['MarketCap']
    else:
        market_cap = df['MarketCap']

    df['YoY_Growth'] = market_cap.pct_change(periods=4) * 100  # YoY percentage change
    df.dropna(subset=['YoY_Growth'], inplace=True)  # Drop rows with NaN YoY growth
    return df

def calculate_sector_index_variations(df, sorted_companies, total_overperformance):
    """Compare the weighted sector index with the simple average, date by date.

    The weighted index sums each company's ``YoY_Growth`` multiplied by its
    squared overperformance count divided by ``total_overperformance``.  When
    that total is zero (no company ever beat the average, e.g. a
    single-company sector) the weights would be 0/0, so every listed company
    gets an equal share instead.  The simple average is the plain mean of
    ``YoY_Growth`` per date.

    Args:
        df: DataFrame with ``Ticker``, ``Date`` and ``YoY_Growth`` columns.
        sorted_companies: Sequence of ``(ticker, count)`` pairs.  Tickers of
            ``df`` that are not listed are ignored.
        total_overperformance: Denominator of the weights.

    Returns:
        DataFrame with columns ``Date``, ``Weighted_Index``,
        ``Simple_Avg_Index`` and ``Difference`` (absolute gap between the
        two); empty (same columns) when no company contributed.
    """
    if total_overperformance:
        fractional_contribution = {company: count**2 / total_overperformance for company, count in sorted_companies}
    elif sorted_companies:
        # Zero total: fall back to equal weights rather than dividing by zero.
        equal_share = 1 / len(sorted_companies)
        fractional_contribution = {company: equal_share for company, _ in sorted_companies}
    else:
        fractional_contribution = {}

    # One 'Weighted_YoY_Growth*' column per company, outer-joined on Date.
    sector_index = None

    for ticker in df['Ticker'].unique():
        if ticker not in fractional_contribution:
            continue

        company_data = df[df['Ticker'] == ticker].copy()
        company_data['Weighted_YoY_Growth'] = company_data['YoY_Growth'] * fractional_contribution[ticker]
        contribution = company_data[['Date', 'Weighted_YoY_Growth']]

        if sector_index is None:
            sector_index = contribution.copy()
        else:
            # The suffix keeps every company's column distinct after the join.
            sector_index = pd.merge(sector_index, contribution,
                                    on='Date', how='outer', suffixes=('', f'_{ticker}'))

    if sector_index is None:
        # Nothing was accumulated: return an empty result instead of failing
        # on the missing 'Date' column.
        empty_result = df[['Date']].iloc[0:0].copy()
        for column in ('Weighted_Index', 'Simple_Avg_Index', 'Difference'):
            empty_result[column] = pd.Series(dtype='float64')
        return empty_result

    # Sum across all companies for each date (missing values count as 0).
    sector_index['Weighted_Index'] = sector_index.filter(like='Weighted_YoY_Growth').sum(axis=1)

    # Calculate simple average
    simple_avg_index = df.groupby('Date')['YoY_Growth'].mean().reset_index()
    sector_index = pd.merge(sector_index, simple_avg_index, on='Date', how='left')
    sector_index.rename(columns={'YoY_Growth': 'Simple_Avg_Index'}, inplace=True)

    # Calculate the difference between the weighted and simple averages
    sector_index['Difference'] = (sector_index['Weighted_Index'] - sector_index['Simple_Avg_Index']).abs()

    return sector_index[['Date', 'Weighted_Index', 'Simple_Avg_Index', 'Difference']]

def calculate_variance_and_positive_ratio(df):
    """Summarize ``YoY_Growth`` by its variance and its up/down count ratio.

    Returns:
        ``(variance, pos_to_neg_ratio)``: the variance of the column as
        computed by ``Series.var()`` and the number of strictly positive
        values divided by the number of strictly negative ones.  The ratio is
        ``inf`` when there is no negative value; zeros count on neither side.
    """
    growth = df['YoY_Growth']

    rising = (growth > 0).sum()
    falling = (growth < 0).sum()

    # No negative value at all: the ratio is unbounded.
    ratio = rising / falling if falling > 0 else float('inf')

    return growth.var(), ratio

def get_sector_rankings(input_dir, output_file='sector_rankings.csv'):
    """Compute per-sector summary metrics, save them to CSV and print rankings.

    Every ``*.csv`` in ``input_dir`` is treated as one sector.  Rows from
    2019-01-01 (UTC) onwards are kept, YoY growth is derived and three
    figures are collected per sector:

    * the mean absolute gap between the weighted and the simple-average index,
    * the variance of YoY growth,
    * the ratio of positive to negative YoY growth observations.

    Only the first two are written to ``output_file``; the ratio can be
    ``inf`` and is therefore kept out of the CSV and used for the printed
    ranking only.

    NOTE: ``calculate_sector_leader_and_rank`` is not defined in this cell; it
    is looked up at call time and must be the variant that returns three
    values.

    Args:
        input_dir: Directory holding the sector CSV files.
        output_file: Path of the CSV the metrics are written to.

    Returns:
        DataFrame with columns ``Sector``, ``Variation (Weighted vs Simple
        Avg)`` and ``Variance of YoY Growth`` (the content of the CSV).
    """
    rankings_variation = {}
    rankings_variance = {}
    rankings_positive_negative_ratio = {}

    # Sorted so the row order of the CSV does not depend on the order in
    # which the filesystem happens to list the directory.
    for sector_file in sorted(os.listdir(input_dir)):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        # Load the CSV file
        file_path = os.path.join(input_dir, sector_file)
        df = pd.read_csv(file_path, parse_dates=['Date'])

        # Convert 'Date' to UTC so it can be compared with a tz-aware cutoff
        df['Date'] = pd.to_datetime(df['Date'], utc=True)

        # Keep data from 2019 onwards.  The copy gives calculate_yoy_growth an
        # independent frame to modify in place instead of a filtered slice.
        df = df[df['Date'] >= pd.Timestamp('2019-01-01', tz='UTC')].copy()

        # Calculate YoY growth
        df = calculate_yoy_growth(df)

        # A sector without any usable row cannot be ranked.
        if df.empty:
            print(f"Skipping {sector} sector: no YoY growth data from 2019 onwards.")
            continue

        # Calculate sector leader and fractional contributions
        sector_leader, sorted_companies, total_overperformance = calculate_sector_leader_and_rank(df, sector)

        # Calculate sector index variations
        sector_index_variations = calculate_sector_index_variations(df, sorted_companies, total_overperformance)

        # Calculate variance of YoY growth and positive/negative growth ratio
        variance, pos_to_neg_ratio = calculate_variance_and_positive_ratio(df)

        # Summarize the variation between weighted and simple averages
        avg_difference = sector_index_variations['Difference'].mean()

        # Save results for rankings
        rankings_variation[sector] = avg_difference
        rankings_variance[sector] = variance
        # Recorded for the printed ranking below; deliberately not exported.
        rankings_positive_negative_ratio[sector] = pos_to_neg_ratio

    # Create a DataFrame from the rankings (the ratio is left out, see docstring)
    rankings_df = pd.DataFrame({
        'Sector': list(rankings_variation.keys()),
        'Variation (Weighted vs Simple Avg)': list(rankings_variation.values()),
        'Variance of YoY Growth': list(rankings_variance.values()),
    })

    # Save the rankings DataFrame to CSV
    rankings_df.to_csv(output_file, index=False)
    print(f"\nRankings saved to {output_file}")

    print("\nRanking of sectors based on variation between simple and weighted averages:")
    for sector, value in sorted(rankings_variation.items(), key=lambda x: x[1], reverse=True):
        print(f"{sector}: {value}")

    print("\nRanking of sectors based on variance of YoY growth over time:")
    for sector, value in sorted(rankings_variance.items(), key=lambda x: x[1], reverse=True):
        print(f"{sector}: {value}")

    print("\nRanking of sectors based on ratio of positive to negative growth over time:")
    for sector, value in sorted(rankings_positive_negative_ratio.items(), key=lambda x: x[1], reverse=True):
        print(f"{sector}: {value}")

    return rankings_df

def main():
    """Build the sector rankings from the default market-cap CSV folder."""
    get_sector_rankings("sector_mkt_cap_results")


if __name__ == "__main__":
    main()


In [None]:
# Reload the saved sector rankings CSV; the bare name on the last line makes
# the notebook display the table.
rankings_df=pd.read_csv('sector_rankings.csv')
rankings_df

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Step 1: Read the sector metrics (one row per sector)
df = pd.read_csv('sector_rankings.csv')

# Step 2: Standardize the numeric columns (everything except 'Sector').
# StandardScaler already scales every column independently, so the whole
# matrix is transformed in one call -- the same way the K-means cells do it --
# and `scaler` ends up fitted on all columns rather than only the last one.
scaler = StandardScaler()
X = df.drop('Sector', axis=1)  # Keep only numerical data for normalization
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Step 3: Perform hierarchical clustering (Ward linkage)
Z = linkage(X_scaled, method='ward')

# Step 4: Plot the dendrogram, one leaf per sector
plt.figure(figsize=(12, 20))
dendrogram(Z, labels=df['Sector'].values, leaf_rotation=90, leaf_font_size=10)
plt.title('Hierarchical Clustering of Sectors')
plt.xlabel('Sectors')
plt.ylabel('Euclidean Distance')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Step 1: Read the sector metrics (one row per sector)
df = pd.read_csv('sector_rankings.csv')

# Step 2: Standardize the numeric columns (everything except 'Sector')
scaler = StandardScaler()
X = df.drop('Sector', axis=1)  # Keep only numerical data for normalization
X_scaled = scaler.fit_transform(X)

# Step 3: Calculate inertia for different values of k.
# K-means cannot form more clusters than there are samples, so the upper end
# of the scan (10) is capped at the number of sectors.
inertia = []
max_k = min(10, len(X_scaled))
k_values = range(1, max_k + 1)

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)  # Sum of squared distances to closest cluster center

# Step 4: Plot the Elbow Method graph
plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia, 'bo-', markersize=8)
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Read the sector metrics (one row per sector)
df = pd.read_csv('sector_rankings.csv')

# Step 2: Standardize the numeric columns (everything except 'Sector')
scaler = StandardScaler()
X = df.drop('Sector', axis=1)  # Keep only numerical data for normalization
X_scaled = scaler.fit_transform(X)

# Step 3: Perform K-means clustering.
# The cluster count lives in one variable so that the model and the plot
# title cannot disagree (the title used to claim k=7 while the model ran
# with 4 clusters).
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)  # Assign clusters to sectors

# Step 4: Show the resulting clusters
print(df[['Sector', 'Cluster']])

# Step 5: Visualize the clusters using a scatter plot (based on the first two principal components for visualization)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)  # Reduce to 2D for visualization
X_pca = pca.fit_transform(X_scaled)

# Create a scatter plot with clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['Cluster'], palette='Set1', s=100, legend='full')
plt.title(f'K-means Clustering of Sectors (k={n_clusters})')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the per-sector metrics (one row per sector).
df = pd.read_csv('sector_rankings.csv')

# Standardize every numeric column; 'Sector' is only a label.
scaler = StandardScaler()
X = df.drop(columns='Sector')
X_scaled = scaler.fit_transform(X)

# Cluster the sectors into 7 groups and attach the label to each row.
kmeans = KMeans(n_clusters=7, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Order the rows by cluster label so members of a cluster sit together.
df_sorted = df.sort_values('Cluster')

# Write the grouped table to disk and confirm.
df_sorted.to_csv('sector_clusters_sorted.csv', index=False)
print("Cluster details saved to 'sector_clusters_sorted.csv'.")