In [None]:
import configparser
import re
from collections import defaultdict

import boto3
import yaml
from tqdm import tqdm

In [None]:
class Credentials:
    """Plain holder for the AWS settings used to build boto3 clients.

    Values are stored exactly as given; nothing is validated or sent to AWS
    by this class.
    """

    def __init__(self, region_name, profile_name, aws_access_key_id, aws_secret_access_key, aws_session_token):
        self.region_name = region_name
        self.profile_name = profile_name
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.aws_session_token = aws_session_token

    def __repr__(self):
        # Notebook cells echo objects, so key material is never rendered here.
        return (
            '{}(region_name={!r}, profile_name={!r}, '
            'aws_access_key_id=<redacted>, aws_secret_access_key=<redacted>, '
            'aws_session_token=<redacted>)'
        ).format(
            type(self).__name__,
            getattr(self, 'region_name', None),
            getattr(self, 'profile_name', None),
        )

    @classmethod
    def from_file(cls, path, profile_name=None, region_name='us-east-1'):
        """Build credentials from an INI-style credentials file.

        Args:
            path: Location of the file, as accepted by ``ConfigParser.read``.
            profile_name: Section to read. When ``None`` the first section
                in the file is used.
            region_name: Region stored on the result; defaults to
                ``'us-east-1'``.

        Raises:
            FileNotFoundError: The file could not be read.
            ValueError: No ``profile_name`` was given and the file has no
                sections.
            configparser.NoSectionError / NoOptionError: The profile, or its
                access key id or secret access key, is missing.
        """
        config = configparser.ConfigParser()
        # ConfigParser.read() silently skips unreadable files and returns the
        # list of files it actually parsed, so an empty list means "not read".
        if not config.read(path):
            raise FileNotFoundError('could not read credentials file: {!r}'.format(path))

        if profile_name is None:
            sections = config.sections()
            if not sections:
                raise ValueError('no profile sections found in {!r}'.format(path))
            profile_name = sections[0]

        return cls(
            region_name,
            profile_name,
            config.get(profile_name, 'aws_access_key_id'),
            config.get(profile_name, 'aws_secret_access_key'),
            # Optional: profiles holding long-term access keys carry no token.
            config.get(profile_name, 'aws_session_token', fallback=None),
        )

    def to_yaml(self, path):
        """Write all five settings to ``path`` as a YAML mapping.

        NOTE(review): the secret key and session token are written in plain
        text; keep the target file out of version control.
        """
        data = {
            'region_name': self.region_name,
            'profile_name': self.profile_name,
            'aws_access_key_id': self.aws_access_key_id,
            'aws_secret_access_key': self.aws_secret_access_key,
            'aws_session_token': self.aws_session_token
        }

        with open(path, 'w') as outfile:
            yaml.dump(data, outfile)

    @classmethod
    def from_yaml(cls, path):
        """Load credentials from a YAML mapping such as ``to_yaml`` writes.

        ``region_name``, ``aws_access_key_id`` and ``aws_secret_access_key``
        are required (``KeyError`` when absent); ``profile_name`` and
        ``aws_session_token`` default to ``None``.
        """
        with open(path, 'r') as f:
            # An empty file parses to None; treat it as an empty mapping so
            # the missing-key error below is the one that surfaces.
            data = yaml.safe_load(f) or {}

        return cls(
            data['region_name'],
            data.get('profile_name'),
            data['aws_access_key_id'],
            data['aws_secret_access_key'],
            data.get('aws_session_token'),
        )
    


In [None]:
# Load the stored AWS settings from the YAML file at CRED_YAML.
# NOTE(review): CRED_YAML is not defined in any cell shown here; it must be
# assigned (a file path) before this cell runs on a fresh kernel.
cred = Credentials.from_yaml(CRED_YAML)

In [None]:
# Echo the object as a quick check that loading succeeded.
cred

In [None]:
# Keyword arguments for boto3.client(), taken from the loaded `cred`.
# profile_name is left out on purpose: boto3.client() has no such parameter.
credentials = {
    'region_name': cred.region_name,
    'aws_access_key_id': cred.aws_access_key_id,
    'aws_secret_access_key': cred.aws_secret_access_key,
    'aws_session_token': cred.aws_session_token,
}

CLIENT = boto3.client('athena', **credentials)
# NOTE(review): unlike CLIENT, the Glue client uses a fixed region and whatever
# credentials boto3 resolves on its own -- confirm that this is intended.
GLUE_CLIENT = boto3.client('glue', region_name='us-east-1')
DB = 'RD_EXT_A_RAIABD'
CATALOG = 'AwsDataCatalog'

In [None]:
# Reference note: signature of boto3.session.Session.client, as pasted.
# It is kept as a comment because the pasted text is not runnable code
# (the parenthesis is never closed and `self` / `service_name` are undefined).
#
# boto3.session.Session.client(
#     self,
#     service_name,
#     region_name=None,
#     api_version=None,
#     use_ssl=True,
#     verify=None,
#     endpoint_url=None,
#     aws_access_key_id=None,
#     aws_secret_access_key=None,
#     aws_session_token=None,
#     config=None,
# )

In [None]:
class adc(Credentials):
    """First draft of the catalog helper, built on top of ``Credentials``.

    NOTE(review): a later cell defines ``class adc`` again; once that cell has
    run, the name ``adc`` refers to that definition instead of this one.
    """

    def __init__(self, service='athena', region_name = None, aws_access_key_id = None,
                 aws_secret_access_key = None, aws_session_token = None, **kwargs):
        """Store the settings and create the boto3 clients.

        Args:
            service: boto3 service name used for ``self.client``.
            region_name, aws_access_key_id, aws_secret_access_key,
                aws_session_token: Passed to ``boto3.client``.
            **kwargs: Extra keyword arguments forwarded to ``boto3.client``.
        """
        # Run the base initialiser so every inherited attribute exists
        # (profile_name included); the inherited to_yaml() reads all of them.
        super().__init__(region_name, None, aws_access_key_id,
                         aws_secret_access_key, aws_session_token)
        self.client = self._new_client(service, **kwargs)
        # get_table is a Glue operation; the Athena client does not provide
        # it, so column lookups go through a dedicated Glue client.
        self.glue_client = self._new_client('glue', **kwargs)

    def _new_client(self, service, **kwargs):
        """Create a boto3 client for ``service`` with the stored settings."""
        return boto3.client(service, region_name = self.region_name,
                            aws_access_key_id = self.aws_access_key_id,
                            aws_secret_access_key = self.aws_secret_access_key,
                            aws_session_token = self.aws_session_token,
                            **kwargs)

    @classmethod
    def from_yaml(cls, path, service='athena'):
        """Build an instance from the YAML file at ``path``.

        ``aws_session_token`` may be absent from the file; the other three
        keys are required (``KeyError`` when missing).
        """
        with open(path, 'r') as f:
            data = yaml.safe_load(f) or {}

        return cls(service, data['region_name'], data['aws_access_key_id'],
                   data['aws_secret_access_key'], data.get('aws_session_token'))

    def _list_names(self, operation, result_key, **params):
        """Collect the ``Name`` of every entry across all result pages."""
        names = []
        for page in self.client.get_paginator(operation).paginate(**params):
            names.extend(entry['Name'] for entry in page[result_key])
        return names

    def get_databases(self, CatalogName = 'AwsDataCatalog'):
        """Return the names of all databases in ``CatalogName``."""
        # A single list_databases call returns only one page of results.
        return self._list_names('list_databases', 'DatabaseList',
                                CatalogName = CatalogName)

    def get_tables(self, CatalogName = 'AwsDataCatalog', DatabaseName = None):
        """Return the names of all tables in ``DatabaseName``."""
        return self._list_names('list_table_metadata', 'TableMetadataList',
                                CatalogName = CatalogName,
                                DatabaseName = DatabaseName)

    def get_at_columns(self, DatabaseName = None, Name = None):
        """Return the column names of table ``Name`` in ``DatabaseName``.

        Reads ``Table.StorageDescriptor.Columns`` from the Glue ``get_table``
        response.
        """
        response_cols = self.glue_client.get_table(
            DatabaseName = DatabaseName,
            Name = Name
        )
        cols = [i['Name'] for i in response_cols['Table']['StorageDescriptor']['Columns']]
        return cols
   

In [None]:
class adc:
    """Helper around boto3 Athena and Glue clients for browsing a data catalog.

    ``CatalogName`` and ``DataBaseName`` given at construction act as defaults
    for the lookup methods, which accept per-call overrides.
    """

    def __init__(self, region_name=None, aws_access_key_id=None,
                 aws_secret_access_key=None, aws_session_token=None, CatalogName = 'AwsDataCatalog', DataBaseName = None, **kwargs):
        """Store the settings and create one Athena and one Glue client.

        Args:
            region_name, aws_access_key_id, aws_secret_access_key,
                aws_session_token: Passed to ``boto3.client``.
            CatalogName: Default catalog for ``get_databases`` / ``get_tables``.
            DataBaseName: Default database for ``get_tables`` /
                ``get_at_columns``.
            **kwargs: Extra keyword arguments forwarded to both
                ``boto3.client`` calls.
        """
        self.region_name = region_name
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.aws_session_token = aws_session_token
        self.CatalogName = CatalogName
        self.DataBaseName = DataBaseName

        self.client = self._new_client('athena', **kwargs)
        self.glue_client = self._new_client('glue', **kwargs)

    def _new_client(self, service, **kwargs):
        """Create a boto3 client for ``service`` with the stored settings."""
        return boto3.client(
            service,
            region_name=self.region_name,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            aws_session_token=self.aws_session_token,
            **kwargs
        )

    @classmethod
    def from_yaml(cls, path, **kwargs):
        """Build an instance from the YAML file at ``path``.

        Keys missing from the file are passed as ``None``. ``**kwargs`` are
        forwarded to the constructor (e.g. ``DataBaseName``).
        """
        with open(path, 'r') as f:
            # An empty file parses to None; treat it as an empty mapping.
            data = yaml.safe_load(f) or {}

        return cls(
            region_name=data.get('region_name'),
            aws_access_key_id=data.get('aws_access_key_id'),
            aws_secret_access_key=data.get('aws_secret_access_key'),
            aws_session_token=data.get('aws_session_token'),
            **kwargs
        )

    def _list_names(self, operation, result_key, **params):
        """Collect the ``Name`` of every entry across all result pages."""
        names = []
        for page in self.client.get_paginator(operation).paginate(**params):
            names.extend(entry['Name'] for entry in page[result_key])
        return names

    def get_databases(self, CatalogName = None, **kwargs):
        """Return the names of all databases in the catalog.

        ``CatalogName`` defaults to the instance's ``CatalogName``. Results
        are read page by page until exhausted; ``**kwargs`` are passed to the
        paginated ``list_databases`` request.
        """
        if CatalogName is None:
            CatalogName = self.CatalogName
        return self._list_names('list_databases', 'DatabaseList',
                                CatalogName=CatalogName, **kwargs)

    def get_tables(self, CatalogName = None, DatabaseName=None, **kwargs):
        """Return the names of all tables in a database.

        ``CatalogName`` and ``DatabaseName`` default to the instance's
        ``CatalogName`` and ``DataBaseName``. Results are read page by page
        until exhausted; ``**kwargs`` are passed to the paginated
        ``list_table_metadata`` request.
        """
        if CatalogName is None:
            CatalogName = self.CatalogName
        if DatabaseName is None:
            DatabaseName = self.DataBaseName
        return self._list_names('list_table_metadata', 'TableMetadataList',
                                CatalogName=CatalogName, DatabaseName=DatabaseName, **kwargs)

    def get_at_columns(self, DatabaseName=None, Name=None, **kwargs):
        """Return the column names of table ``Name``.

        ``DatabaseName`` defaults to the instance's ``DataBaseName``. The names
        come from ``Table.StorageDescriptor.Columns`` of the Glue ``get_table``
        response; ``**kwargs`` are passed to that call.
        """
        if DatabaseName is None:
            DatabaseName = self.DataBaseName
        response_cols = self.glue_client.get_table(DatabaseName=DatabaseName, Name=Name, **kwargs)
        return [i['Name'] for i in response_cols['Table']['StorageDescriptor']['Columns']]

In [None]:
 
class Search:
    """Case-insensitive substring search over table and column names.

    The lookups go through a catalog object that must provide
    ``get_tables(DatabaseName=...)`` and
    ``get_at_columns(DatabaseName=..., Name=...)``, each returning a list of
    names.
    """

    def __init__(self, glue_client, databases, catalog=None):
        """Remember what to search.

        Args:
            glue_client: Stored on the instance; not used by the searches.
            databases: Iterable of database names to scan.
            catalog: Object used for the lookups. When ``None`` the
                module-level name ``AwsDataCatalog`` is used, which must then
                exist when a search runs.
        """
        self.glue_client = glue_client
        self.databases = databases
        self.catalog = catalog

    @staticmethod
    def isin(word, string):
        """Return True when ``word`` occurs in ``string``, ignoring case."""
        return word.lower() in string.lower()

    def _catalog(self):
        """Return the injected catalog, else the module-level fallback."""
        if self.catalog is not None:
            return self.catalog
        return AwsDataCatalog

    def search_col(self, list_cols):
        """Find columns whose name contains any of the terms in ``list_cols``.

        Returns:
            ``defaultdict(dict)`` mapping database -> table ->
            ``(match_count, matched_columns)``. Tables without a match are
            omitted. A column that matches several terms is listed once per
            matching term.
        """
        catalog = self._catalog()
        db_found = defaultdict(dict)
        for db in tqdm(self.databases):
            for table in catalog.get_tables(DatabaseName = db):
                found_cols = [
                    col_db
                    for col_db in catalog.get_at_columns(DatabaseName=db, Name = table)
                    for wanted_col in list_cols
                    if self.isin(wanted_col, col_db)
                ]
                if found_cols:
                    db_found[db][table] = (len(found_cols), found_cols)
        return db_found

    def search_table(self, list_tables):
        """Find tables whose name contains any of the terms in ``list_tables``.

        Returns:
            ``defaultdict(dict)`` mapping database ->
            ``(match_count, matched_tables)``. Databases without a match are
            omitted. A table that matches several terms is listed once per
            matching term.
        """
        catalog = self._catalog()
        db_found = defaultdict(dict)
        for db in tqdm(self.databases):
            found_tables = [
                table
                for table in catalog.get_tables(DatabaseName = db)
                for wanted_table in list_tables
                if self.isin(wanted_table, table)
            ]
            if found_tables:
                db_found[db] = (len(found_tables), found_tables)
        return db_found

***

In [None]:
# Build the catalog helper with a default database, so later calls may omit
# DatabaseName.
ad = adc.from_yaml(CRED_YAML, DataBaseName='a_raiabd_prod')

In [None]:
# Exercise the three lookups. Only the value of the last expression is
# rendered by the notebook; the results of the first two calls are discarded.
ad.get_databases()
ad.get_tables('AwsDataCatalog')
ad.get_at_columns(Name ='tb_produto')