# In [None]:  (Jupyter cell marker left over from the notebook export)
import requests
import pandas as pd
import os
from datetime import datetime, timedelta

# --- CONFIGURATION ---
# Prometheus HTTP API endpoints: PROMETHEUS_URL is used for range queries
# (time series) and PROMETHEUS_QUERY_URL for instant queries.
PROMETHEUS_URL = 'http://10.35.29.108:30900/api/v1/query_range'
PROMETHEUS_QUERY_URL = 'http://10.35.29.108:30900/api/v1/query' 

# The resulting CSV is written into the current user's ~/Desktop directory.
save_path = os.path.expanduser('~/Desktop')
CSV_FILENAME = os.path.join(save_path, 'kubernetes_thesis_dataset_final_batch2.csv')

# Query window: the 3 days ending when this module is loaded, sampled every STEP.
# NOTE(review): the original comment appears to have said "1 day back", but the
# code uses days=3 - confirm which one is intended.
END_TIME = datetime.now()
START_TIME = END_TIME - timedelta(days=3)
STEP = '1m'

# --- 1. MAPPING STEP: Instance IP -> Node Name ---
def get_node_mapping(timeout=30):
    """Build a lookup from Prometheus address labels to Kubernetes node names.

    Runs an instant query for ``kube_node_info`` and, for every series that
    carries a ``node`` label, records:

    * ``instance`` -> ``node`` (the behaviour this function always had), and
    * ``internal_ip`` -> ``node`` when that label is present, so that metrics
      whose ``instance`` is ``<node-ip>:<port>`` can be resolved as well.

    This is best-effort: any network, HTTP, or payload problem is reported on
    stdout and an empty mapping is returned so the caller can continue.

    Args:
        timeout: Seconds to wait for Prometheus before giving up. Without a
            timeout ``requests`` would block indefinitely on a dead server.

    Returns:
        dict mapping address strings to node names (possibly empty).
    """
    print("üó∫Ô∏è Building Instance-to-Node Map...")
    try:
        response = requests.get(
            PROMETHEUS_QUERY_URL,
            params={'query': 'kube_node_info'},
            timeout=timeout,
        )
        # Surface 4xx/5xx responses instead of trying to parse an error page.
        response.raise_for_status()
        data = response.json()
        mapping = {}
        if data['status'] == 'success':
            for result in data['data']['result']:
                metric = result['metric']
                if 'node' not in metric:
                    continue
                node = metric['node']
                # The instance label may include a port (IP:Port).
                if 'instance' in metric:
                    mapping[metric['instance']] = node
                # NOTE(review): for kube-state-metrics the 'instance' label is
                # presumably the exporter's own scrape address; the node's
                # address is exposed as 'internal_ip'. Never overwrite an
                # existing 'instance' entry with it.
                internal_ip = metric.get('internal_ip')
                if internal_ip:
                    mapping.setdefault(internal_ip, node)
        print(f"   ‚úÖ Found mappings: {mapping}")
        return mapping
    except (requests.RequestException, ValueError, LookupError, TypeError) as e:
        # RequestException: connection/timeout/HTTP status; ValueError: body is
        # not JSON; LookupError/TypeError: JSON does not have the expected shape.
        print(f"   ‚ö†Ô∏è Mapping failed: {e}")
        return {}

# --- METRICS DEFINITION ---
# Maps an output column prefix to the PromQL expression that produces it.
# Each entry has:
#   'query'    - the PromQL expression sent to the range-query endpoint
#   'key_type' - how fetch_metric labels each returned series:
#                'instance' -> series label 'instance', translated to a node name
#                'node'     -> series label 'node', used as-is
#                'cluster'  -> a single cluster-wide series (no suffix)
QUERIES = {
    # -- Usage (keyed by instance) --
    # The instance key is mapped to a node name later, when columns are named.
    # NOTE(review): the [1m] rate window presumably needs at least two scrapes
    # inside it to yield data - confirm against the scrape interval.
    # NOTE(review): these sums carry no container/id filter; verify they do not
    # double-count nested cgroup levels.
    'node_cpu_usage': {
        'query': 'sum(rate(container_cpu_usage_seconds_total[1m])) by (instance)',
        'key_type': 'instance' 
    },
    'node_mem_usage': {
        'query': 'sum(container_memory_working_set_bytes) by (instance)',
        'key_type': 'instance'
    },
    
    # -- Requests/Capacity (keyed by node) --
    'node_cpu_req': {
        'query': 'sum(kube_pod_container_resource_requests{resource="cpu"}) by (node)',
        'key_type': 'node'
    },
    'node_mem_req': {
        'query': 'sum(kube_pod_container_resource_requests{resource="memory"}) by (node)',
        'key_type': 'node'
    },
    'node_cpu_cap': {
        'query': 'sum(kube_node_status_allocatable{resource="cpu"}) by (node)',
        'key_type': 'node'
    },
    'node_mem_cap': {
        'query': 'sum(kube_node_status_allocatable{resource="memory"}) by (node)',
        'key_type': 'node'
    },
    
    # -- Cluster Wide --
    'cluster_pods_pending': {
        'query': 'sum(kube_pod_status_phase{phase="Pending"})',
        'key_type': 'cluster'
    }
}

def _strip_port(address):
    """Return the host part of ``host:port``; other addresses come back unchanged."""
    if address.startswith('['):
        # Bracketed IPv6 such as "[::1]:9100" -> "::1"
        return address[1:].partition(']')[0]
    if address.count(':') == 1:
        return address.partition(':')[0]
    # No port at all, or a bare IPv6 address whose colons are not port separators.
    return address


def _resolve_node_name(instance, node_map):
    """Translate an ``instance`` label to a node name, or return it unchanged."""
    node_name = node_map.get(instance)
    if node_name:
        return node_name
    # Retry ignoring ports on both sides. Hosts are compared for equality:
    # a substring test would let "10.0.0.1" match "10.0.0.10:10250".
    host = _strip_port(instance)
    for mapped_address, mapped_node in node_map.items():
        if mapped_node and _strip_port(mapped_address) == host:
            return mapped_node
    return instance


def fetch_metric(name, config, node_map, timeout=60):
    """Run one range query and return its series as columns of a DataFrame.

    Args:
        name: Column prefix for the metric (a key of ``QUERIES``).
        config: Dict with ``'query'`` (PromQL) and ``'key_type'``
            (``'instance'``, ``'node'`` or ``'cluster'``).
        node_map: Address -> node name mapping used to rename ``instance``
            series; unresolved instances keep their raw label value.
        timeout: Seconds to wait for Prometheus before giving up.

    Returns:
        A DataFrame indexed by numeric ``timestamp`` with one column per
        returned series, named ``<name>_<key>`` (or just ``<name>`` for
        cluster-wide metrics). ``None`` when the query fails, is rejected, or
        returns no series; failures are reported on stdout.
    """
    query = config['query']
    key_type = config['key_type']

    print(f"Fetching {name}...")
    params = {
        'query': query,
        'start': START_TIME.timestamp(),
        'end': END_TIME.timestamp(),
        'step': STEP
    }

    try:
        response = requests.get(PROMETHEUS_URL, params=params, timeout=timeout)
        # Report 4xx/5xx responses instead of silently returning None.
        response.raise_for_status()
        data = response.json()

        if data['status'] != 'success' or not data['data']['result']:
            return None

        frames = []
        for result in data['data']['result']:
            # Series without the expected label are grouped under 'unknown'.
            raw_key = result['metric'].get(key_type) or 'unknown'

            if key_type == 'instance':
                final_key = _resolve_node_name(raw_key, node_map)
            elif key_type == 'cluster':
                final_key = 'total'
            else:
                final_key = raw_key

            # Each sample is a [timestamp, "value"] pair; values arrive as strings.
            df = pd.DataFrame(result['values'], columns=['timestamp', 'value'])
            df['timestamp'] = pd.to_numeric(df['timestamp'])
            df['value'] = pd.to_numeric(df['value'])
            df = df.set_index('timestamp')

            # Column name: metric_nodeName (cluster-wide metrics get no suffix).
            col_name = f"{name}_{final_key}" if final_key != 'total' else name
            frames.append(df.rename(columns={'value': col_name}))

        return pd.concat(frames, axis=1)
    except (requests.RequestException, ValueError, LookupError, TypeError) as e:
        # RequestException: connection/timeout/HTTP status; ValueError: body is
        # not JSON or not numeric; LookupError/TypeError: unexpected payload shape.
        print(f"Error {name}: {e}")
        return None

def main():
    """Fetch every metric in QUERIES and write the merged table to CSV_FILENAME.

    Builds the instance-to-node map, fetches each metric, outer-joins the
    per-metric frames on their timestamp index, fills gaps with 0, converts
    the index from epoch seconds to datetimes and saves the result. Prints a
    short summary on success, or a failure message when no metric returned
    any data.
    """
    print(f"--- Starting Final Data Extraction ---")

    # 1. Build Map
    instance_to_node = get_node_mapping()

    # 2. Fetch All Metrics
    combined = None
    for metric_name, metric_config in QUERIES.items():
        frame = fetch_metric(metric_name, metric_config, instance_to_node)
        if frame is None:
            continue
        # Defensive: keep only the first row for any repeated timestamp.
        is_repeat = frame.index.duplicated(keep='first')
        frame = frame.loc[~is_repeat]
        combined = frame if combined is None else combined.join(frame, how='outer')

    # 3. Save
    if combined is None:
        print("\n‚ùå Failed.")
        return

    combined = combined.fillna(0)
    combined.index = pd.to_datetime(combined.index, unit='s')
    combined.to_csv(CSV_FILENAME)
    print(f"\n‚úÖ Success! Data saved to {CSV_FILENAME}")
    print(f"Rows: {len(combined)}")
    print(f"Columns: {combined.columns.tolist()}")
    print(combined.head())

# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()