<a href="https://colab.research.google.com/github/walteralzurutt/classificacion-empresas/blob/main/RBICs_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load environment and libraries

In [1]:
#@title Install basic packages and load environments. Initialize loaders for Snowflake and Google Sheets

# Pin sphinx before anything else is installed. Only stdout is sent to /dev/null,
# so pip errors written to stderr still show up in the cell output.
!pip install "sphinx==7.2.6" > /dev/null

# Detect Colab by attempting the Colab-only imports; any failure is treated as "not on Colab".
try:  # are we running on Colab?
  from google.colab import drive
  from google.colab import userdata
  colab = True
except Exception as e:
  colab = False

# NOTE(review): everything below (imports of os/pandas, Drive mount, AWS files, package
# installs) only runs on Colab, while the Snowflake section later in this cell uses
# `os` and `userdata` unconditionally - confirm whether non-Colab execution is meant to work.
if colab:
  # install AWS CLI and dotenv
  !pip install "awscli==1.40.9" "python-dotenv==1.1.0" "gspread==6.2.0" > /dev/null

  # NOTE(review): `base64` is imported twice in this list.
  import dotenv
  from getpass import getpass
  import os
  import gspread
  import base64
  import json
  import pandas as pd
  import numpy as np
  import base64
  from datetime import timedelta
  import sys

  # Only errors from the 'snowflake' logger are shown.
  import logging
  logging.basicConfig()
  logging.getLogger('snowflake').setLevel(logging.ERROR)

  # Silences every Python warning for the rest of the session.
  import warnings
  warnings.filterwarnings('ignore')

  # mount Google Drive
  drive.mount("/content/gdrive")
  root_dir = "/content/gdrive/"
  # NOTE(review): `project_drive` is not used anywhere else in the visible notebook.
  project_drive = f"Shareddrives/Clarity AI/05 - Product Research & Innovation/21 - Team Lifecycle/11 - SME Repository/Colab setup"
  home_dir = f"{root_dir}MyDrive/Colab Notebooks/"
  # Make modules stored in the user's "Colab Notebooks" Drive folder importable.
  sys.path.append(home_dir)

  # Configure AWS CLI credentials
  aws_dir = f"{os.getenv('HOME')}/.aws"
  os.makedirs(aws_dir, exist_ok=True)

  # The [root] credentials are taken from Colab secrets (`userdata`), not hardcoded.
  with open(f"{aws_dir}/credentials", "wt") as file:
    file.write(
  f"""[root]
  aws_access_key_id={userdata.get("AWS_ACCESS_KEY_ID")}
  aws_secret_access_key={userdata.get("AWS_SECRET_ACCESS_KEY")}
  """
    )
  # Two profiles are declared, both sourcing the [root] credentials and each naming its own role_arn.
  with open(f"{aws_dir}/config", "wt") as file:
    file.write(
  f"""[profile mgmt]
  region=eu-central-1
  source_profile = root
  role_arn = arn:aws:iam::913932804865:role/federateclarity
  output=json
  [profile federate_root]
  region = eu-central-1
  source_profile = root
  role_arn = arn:aws:iam::064436394451:role/federateclarity
  """
    )
  # Select the `mgmt` profile written above as the default for this session.
  os.environ["AWS_PROFILE"] = "mgmt"
  # Load additional environment variables from the .env file in the Drive notebooks folder;
  # the SNOWFLAKE_* variables read later via os.getenv presumably come from here - TODO confirm.
  dotenv.load_dotenv(dotenv_path=f"{home_dir}.env")
  # Pinned data-access stack. NOTE(review): "s3transfer>=0.12.0" is listed twice.
  !pip install "boto3==1.38.10" "s3transfer>=0.12.0" "snowflake-connector-python==3.15.0" "snowflake-sqlalchemy==1.7.3" "numpy==2.0.2" "s3fs==0.4.2" "docutils==0.19" "pandas==2.2.2" "s3transfer>=0.12.0" > /dev/null


## SNOWFLAKE ##
# Import dependencies
import snowflake.connector as sc
from cryptography.hazmat.backends import default_backend
from sqlalchemy import create_engine, engine, text
from cryptography.hazmat.primitives import serialization
from sqlalchemy.dialects import registry

# Read certificate
# The private key may sit under either spelling of the Drive notebooks folder
# ("Colab Notebooks" or "Colab_Notebooks"); the first file that exists is used.
private_key = None
for pem_path in (
    "/content/gdrive/MyDrive/Colab Notebooks/private.pem",
    "/content/gdrive/MyDrive/Colab_Notebooks/private.pem",
):
  try:
    with open(pem_path, "rb") as key:
      pem_data = key.read()
  except FileNotFoundError:
    # Only a missing file triggers the fallback. Other failures (wrong passphrase,
    # corrupt PEM, interrupts) must surface instead of being masked by a retry.
    continue

  private_key = serialization.load_pem_private_key(
      pem_data,
      password=userdata.get('certificatepass').encode(),
      backend=default_backend()
  )
  break

if private_key is None:
  raise FileNotFoundError(
      "private.pem not found under '/content/gdrive/MyDrive/Colab Notebooks/' "
      "or '/content/gdrive/MyDrive/Colab_Notebooks/'"
  )

# Re-serialize the decrypted key as unencrypted PKCS8 DER bytes; these bytes are what
# gets passed as `private_key` to the Snowflake connector further down.
private_key_bytes = private_key.private_bytes(
    encoding=serialization.Encoding.DER,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption()
)

# Connect, create functions
class Snowflake:
  """Small helper that runs SQL against Snowflake and returns pandas DataFrames.

  Connection settings are read from the SNOWFLAKE_* environment variables and
  authentication uses the module-level `private_key_bytes`.
  """

  def __init__(self):
      # The URL is only a placeholder: every connection is produced by
      # `snow_connect`, which is handed to the engine as its `creator`.
      self.engine = engine.create_engine("snowflake://not@used/db", creator=self.snow_connect)

  def snow_connect(self):
    """Open a new Snowflake connector connection using key-pair authentication."""
    return sc.connect(
            user=os.getenv("SNOWFLAKE_USERNAME"),
            account=os.getenv("SNOWFLAKE_ACCOUNT"),
            private_key=private_key_bytes,
            role=os.getenv("SNOWFLAKE_ROLE"),
            database=os.getenv("SNOWFLAKE_DATABASE"),
            warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
            )

  def read_sql_query(self, query):
    """
    Execute a query in Snowflake

    Args:
        query (str): SQL query to execute

    Returns:
        df (pd.DataFrame): DataFrame with the results of the query
    """
    with self.engine.begin() as connection:
        # Delegate to pandas. The previous code called Snowflake().read_sql_query(...)
        # here, i.e. this very method on a fresh instance with an unsupported `con`
        # keyword, which can only fail.
        df = pd.read_sql_query(text(query), con=connection)

    # Make sure all column names are lowercase
    df.columns = df.columns.str.lower()
    return df


## Initialize
# Module-level connector instance; constructing it builds the SQLAlchemy engine in __init__.
# NOTE(review): the query cell further down calls Snowflake() directly instead of reusing this object.
snowflake_connector = Snowflake()

Mounted at /content/gdrive
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pygit2 1.19.1 requires cffi>=2.0, but you have cffi 1.17.1 which is incompatible.[0m[31m
[0m

In [2]:
class Snowflake:
    """Query helper that returns Snowflake results as pandas DataFrames."""

    def __init__(self):
        # Placeholder URL; real connections come from `snow_connect` via `creator`.
        self.engine = engine.create_engine("snowflake://not@used/db", creator=self.snow_connect)

    def snow_connect(self):
        """Create a connector connection from the SNOWFLAKE_* environment variables."""
        connection_settings = {
            "user": os.getenv("SNOWFLAKE_USERNAME"),
            "account": os.getenv("SNOWFLAKE_ACCOUNT"),
            "private_key": private_key_bytes,
            "role": os.getenv("SNOWFLAKE_ROLE"),
            "database": os.getenv("SNOWFLAKE_DATABASE"),
            "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
        }
        return sc.connect(**connection_settings)

    def read_sql_query(self, query: str):
        """
        Run `query` through the engine and hand back the rows as a DataFrame
        whose column names have been lowercased.
        """
        with self.engine.begin() as conn:
            result = pd.read_sql_query(text(query), con=conn)

        # Normalize column names to lowercase before returning
        lowered_columns = result.columns.str.lower()
        result.columns = lowered_columns
        return result

In [3]:
#@title Import libraries and define functions needed to read the registry service

import requests
import asyncio # only for async requests
import nest_asyncio
from google.colab import userdata
import json
import pandas as pd

def extract_registry_service_data(release_tag, use_release_candidate_tag=False,
                                  release_candidate_tag=None, timeout=60):
  """
  Fetch the dataset listing that the registry service holds for a tag.

  Args:
      release_tag (str): tag to look up when `use_release_candidate_tag` is False
      use_release_candidate_tag (bool): look up `release_candidate_tag` instead of `release_tag`
      release_candidate_tag (str | None): tag used when `use_release_candidate_tag` is True
      timeout (float): per-request timeout in seconds passed to `requests.get`

  Returns:
      pd.DataFrame: one row per dataset with the columns table_name, dataset_name,
      location, dag_owner, execution_date, module and database

  Raises:
      ValueError: `use_release_candidate_tag` is True but no `release_candidate_tag` was given
      requests.HTTPError: the registry service answered with an error status
  """
  # Resolve which tag to query before doing any I/O. The previous code referenced an
  # undefined name here, so the release-candidate path could only raise NameError.
  if use_release_candidate_tag:
    if not release_candidate_tag:
      raise ValueError(
          "use_release_candidate_tag=True requires release_candidate_tag to be provided"
      )
    requested_tag = release_candidate_tag
  else:
    requested_tag = release_tag

  uri_base = 'https://registry-service.mgmt.clarity.ai/v1'

  # Bearer token comes from Colab secrets
  headers = {"Authorization": "Bearer {}".format(userdata.get('registry_service_bearer_token'))}

  # First request: look the tag up for tenant CLA and read back the tag name the service reports
  tag_response = requests.get(
      uri_base + '/tags/' + requested_tag,
      params={'tenant': 'CLA'},
      headers=headers,
      timeout=timeout,
  )
  # Fail with the HTTP error itself rather than a KeyError/JSON error further down
  tag_response.raise_for_status()
  current_tag = tag_response.json()['tag']

  # Second request: fetch the datasets registered under the reported tag (no tenant filter)
  datasets_response = requests.get(
      uri_base + '/tags/{}'.format(current_tag),
      params={},
      headers=headers,
      timeout=timeout,
  )
  datasets_response.raise_for_status()

  registry_service = pd.DataFrame(datasets_response.json()['datasets'])

  registry_service_data = registry_service[
      ['table_name', 'dataset_name', 'location', 'dag_owner', 'execution_date', 'module', 'database']
  ].copy()

  return registry_service_data


def _latest_value_for_dataset(registry_service_data, dataset_name, column, drop_missing):
  """Return `column` from the row of `dataset_name` that has the latest execution_date."""
  rows = registry_service_data
  if drop_missing:
    # Ignore rows where the requested column is null before picking the latest one
    rows = rows[rows[column].notna()]

  rows = rows[rows.dataset_name == dataset_name].copy()
  rows['execution_date'] = pd.to_datetime(rows['execution_date'])

  latest_date = rows['execution_date'].max()
  values = rows.loc[rows['execution_date'] == latest_date, column].values

  if len(values) == 0:
    # Still an IndexError (as before), but one that says what was being looked up
    raise IndexError(
        "no usable '{}' found for dataset '{}' in the registry data".format(column, dataset_name)
    )

  # If several rows share the latest date, the first one wins
  return values[0]


def fetch_table_name(registry_service_data,dataset_name):
  """
  Return the table_name of the most recent execution of `dataset_name`.

  Rows without a table_name are ignored.

  Args:
      registry_service_data (pd.DataFrame): needs the columns dataset_name, table_name and execution_date
      dataset_name (str): dataset to look up

  Returns:
      The table_name of the row with the latest execution_date.

  Raises:
      IndexError: no row with a table_name exists for `dataset_name`
  """
  return _latest_value_for_dataset(registry_service_data, dataset_name, 'table_name', drop_missing=True)

def fetch_table_location(registry_service_data,dataset_name):
  """
  Return the location of the most recent execution of `dataset_name`.

  Unlike `fetch_table_name`, rows with a null location are not filtered out.

  Args:
      registry_service_data (pd.DataFrame): needs the columns dataset_name, location and execution_date
      dataset_name (str): dataset to look up

  Returns:
      The location of the row with the latest execution_date.

  Raises:
      IndexError: no row exists for `dataset_name`
  """
  return _latest_value_for_dataset(registry_service_data, dataset_name, 'location', drop_missing=False)

# Data Input

In [4]:
# Resolve the table paths registered for one release tag.
# Earlier runs used 'RELEASE-CLA-2025-08.1'.
release_tag = 'RELEASE-CLA-2026-02'

# Dataset listing for the release, then the table name of the latest 'cas_rc' execution
registry_service_data = extract_registry_service_data(release_tag)
cas_path = fetch_table_name(registry_service_data, 'cas_rc')

In [5]:
# Reading the entire RBICs dataset for 2023 (long format: one row per clarity_id / metric)

data=Snowflake().read_sql_query(f"""

select clarity_id, metric, value
from domain_archive.{cas_path}
where metric ilike '%rbics%pct'
and metric_year = 2023

""")

data['value'] = data['value'].astype(float)

# Cleaning up the metric names to keep only the rbics code - Removing L7 information for now.
# Keeps the 12 characters starting at offset 6 (six 2-character levels, L1..L6);
# presumably the first 6 characters are a fixed metric-name prefix - TODO confirm.

data['metric'] = data['metric'].str[6:18]

# Summing together rows that have the same L6 but different L7 into one row.
# The value column is selected explicitly: `.sum('value')` passed 'value' as the
# positional `numeric_only` argument and only produced the right result by accident.

data = data.groupby(['clarity_id', 'metric'])['value'].sum().reset_index()

In [6]:
# Copy of `data` with one extra column per level: l{i}_id holds the first 2*i
# characters of the metric code, for i = 1..6.
rbics = data.assign(
    **{f'l{level}_id': data['metric'].str[:level * 2] for level in range(1, 7)}
)

# PCA Tests

In [7]:
# Long -> wide: one row per clarity_id, one column per metric code.
# Combinations missing from `data` are filled with 0.
data_wide = (
    data
    .pivot(index='clarity_id', columns='metric', values='value')
    .fillna(0)
)

In [8]:
from sklearn.decomposition import PCA

# Set n_components as 0.95 to get enough components to explain 95% EVR
# (EVR = explained variance ratio; the number actually kept is read back from pca.n_components_).
# NOTE(review): data_wide is passed to PCA as-is, with no standardization step visible
# in this notebook - confirm that fitting on the raw values is intended.
pca = PCA(n_components=0.95)
# X_pca holds the rows of data_wide projected onto the retained components.
X_pca = pca.fit_transform(data_wide)

print(f"Number of components to retain 95% variance: {pca.n_components_}")

Number of components to retain 95% variance: 1110


In [17]:
# Zero-based row index into pca.components_: 0 is the first component, so 5 selects the sixth (PC6).
pc_index = 5  # PC6 (zero-based index 5), not PC1
weights = pca.components_[pc_index]

# Map to original metric names
metric_names = data_wide.columns  # original metrics
pc_weights = pd.Series(weights, index=metric_names)

# Sort by absolute value to see strongest contributors.
# Note: the sorted series holds magnitudes only; the sign of each weight is discarded by .abs().
pc_weights_sorted = pc_weights.abs().sort_values(ascending=False)
print(pc_weights_sorted.head(10))  # top 10 metrics by absolute weight for the selected component

metric
302515102025    0.664025
351015401010    0.662741
302515151010    0.323337
351520101010    0.086204
302515151020    0.048484
302515102010    0.044134
151010102515    0.027855
351525101010    0.024482
351010152010    0.016862
401525101010    0.016246
dtype: float64


In [12]:
# Show the fitted component matrix: one row per retained component,
# one column per metric column of data_wide.
pca.components_

array([[-3.16767308e-04, -1.90150761e-03, -1.98655930e-04, ...,
        -3.67899037e-05, -4.74906686e-04, -8.70325473e-04],
       [-3.51473680e-04, -2.03301539e-03, -1.99397818e-04, ...,
        -3.33586033e-05, -4.58417352e-04, -5.55121224e-04],
       [-3.42301485e-04, -1.59468807e-03, -1.87594289e-04, ...,
        -3.67027584e-05, -5.25184960e-04, -9.61091895e-04],
       ...,
       [-1.04166044e-03, -6.22203455e-04, -3.87607251e-03, ...,
         9.11139322e-05,  7.08882090e-04, -2.28707732e-04],
       [ 3.11329702e-04,  2.05104906e-04,  1.26133242e-03, ...,
        -2.76414175e-05, -1.26606635e-03, -6.05603224e-04],
       [-1.59055501e-04, -9.82662129e-05, -6.57048516e-04, ...,
         1.44252882e-05, -4.10854405e-04, -1.70835075e-04]])

In [10]:
# Print the explained-variance ratio of each retained component (truncated by numpy for long arrays).
print(pca.explained_variance_ratio_)

[0.02674556 0.01561039 0.01355649 ... 0.00017731 0.00017714 0.00017709]
