# Service Filter Notebook

This notebook filters `extracted_services_original.ndjson` by removing every service listed in `services_to_remove.ndjson`. A service is removed only when all three fields (`provider`, `service_name`, and `service_alias`) match exactly.

The output will be saved as `extracted_services.ndjson` in the config directory.

## 1. Import Required Libraries

Import necessary libraries for data processing and file handling.

In [17]:
import json
import os
from collections import Counter
from typing import Dict, List, Set, Tuple

import pandas as pd

## 2. Load NDJSON Data

Load both the original extracted services and the services to remove from their respective NDJSON files.

In [18]:
def load_ndjson(file_path: str) -> List[Dict]:
    """Read an NDJSON file into a list of decoded records.

    The file is opened as UTF-8 text and consumed line by line. Each line is
    stripped of surrounding whitespace; lines that are empty afterwards are
    ignored. A line that is not valid JSON is reported on stdout together
    with its 1-based line number and then skipped, so one bad line does not
    abort the whole load.

    Args:
        file_path: Path of the NDJSON file to read.

    Returns:
        The decoded JSON values in file order. Values are appended exactly
        as decoded; nothing here checks that each one is a dict.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        line_num = 0
        for raw_line in handle:
            line_num += 1
            text = raw_line.strip()
            if not text:
                # Blank or whitespace-only line: nothing to decode
                continue
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError as e:
                # Report and move on; the line number still counts blank lines
                print(f"Error parsing line {line_num}: {e}")
                continue
            records.append(parsed)
    return records

# Input locations (relative paths, resolved against the current working directory)
original_services_path = '../config/extracted_services_original.ndjson'
services_to_remove_path = '../config/services_to_remove.ndjson'

# Read the full service list first, then the list of entries to drop
original_services = load_ndjson(original_services_path)
services_to_remove = load_ndjson(services_to_remove_path)

# Report how many records each file contributed
print(f"Loaded {len(original_services)} original services")
print(f"Loaded {len(services_to_remove)} services to remove")

Loaded 623 original services
Loaded 34 services to remove


## 3. Data Exploration and Structure Analysis

Examine the structure of the loaded data and display sample records.

In [19]:
# Show the first few records of each input so the field layout is visible
print("Sample original services:")
for i, service in enumerate(original_services[:3]):
    print(f"{i+1}. {service}")

print("\n" + "="*60 + "\n")

print("Sample services to remove:")
for i, service in enumerate(services_to_remove[:3]):
    print(f"{i+1}. {service}")

print("\n" + "="*60 + "\n")

# Provider distribution in the original services.
# An empty input (or records without a 'provider' field) produces a DataFrame
# with no 'provider' column; fall back to an empty count instead of letting
# the column lookup raise KeyError and abort the notebook run.
df_original = pd.DataFrame(original_services)
if 'provider' in df_original.columns:
    provider_counts = df_original['provider'].value_counts()
else:
    provider_counts = pd.Series(dtype='int64')
print("Provider distribution in original services:")
print(provider_counts)

print("\n" + "="*60 + "\n")

# Provider distribution in the services to remove, with the same guard:
# an empty removal list is a legitimate input and must not crash this cell.
df_remove = pd.DataFrame(services_to_remove)
if 'provider' in df_remove.columns:
    provider_counts_remove = df_remove['provider'].value_counts()
else:
    provider_counts_remove = pd.Series(dtype='int64')
print("Provider distribution in services to remove:")
print(provider_counts_remove)

Sample original services:
1. {'provider': 'AWS', 'service_name': 'ComputeSavingsPlans', 'service_alias': 'Savings Plans for AWS Compute usage'}
2. {'provider': 'AWS', 'service_name': 'Registrar', 'service_alias': 'Amazon Registrar'}
3. {'provider': 'AWS', 'service_name': 'AwsCloudShell', 'service_alias': 'AWS CloudShell'}


Sample services to remove:
1. {'provider': 'AWS', 'service_name': 'IngestionService', 'service_alias': 'AWS Import/Export', 'reason': 'Physical data transfer service - requires shipping physical devices, not cloud-native'}
2. {'provider': 'AWS', 'service_name': 'Registrar', 'service_alias': 'Amazon Registrar', 'reason': 'Domain registration service - administrative function not related to cloud compute/storage/networking'}
3. {'provider': 'AWS', 'service_name': 'ContactCenterTelecommKR', 'service_alias': 'Contact Center Telecommunications Korea', 'reason': 'Regional telecom service - infrastructure/connectivity service, not a cloud computing service'}


Provider dis

## 4. Define Matching Criteria Functions

Create functions to match services based on all three fields: provider, service_name, and service_alias.

In [20]:
def create_service_key(service: Dict) -> tuple:
    """Build the 3-tuple used to compare two service records.

    Args:
        service: A service record; it is read with ``.get`` so any of the
            three fields may be absent.

    Returns:
        ``(provider, service_name, service_alias)`` taken from the record,
        with an empty string standing in for each field that is missing.
    """
    key_fields = ('provider', 'service_name', 'service_alias')
    return tuple(service.get(field, '') for field in key_fields)

def create_removal_set(services_to_remove: List[Dict]) -> Set[tuple]:
    """Collect the keys of every removal entry into a set.

    Args:
        services_to_remove: Records describing the services to drop.

    Returns:
        The set of keys produced by ``create_service_key`` for those records.
        Entries that yield the same key collapse into one element.
    """
    return {create_service_key(entry) for entry in services_to_remove}

# Build the lookup set once; filtering later tests membership against it
removal_keys = create_removal_set(services_to_remove)
print(f"Created removal set with {len(removal_keys)} unique service keys")

# Preview up to five keys. A set is unordered, so which keys appear
# (and in what order) is arbitrary.
print("\nSample removal keys:")
for i, key in enumerate(list(removal_keys)[:5]):
    print(f"{i+1}. {key}")

Created removal set with 34 unique service keys

Sample removal keys:
1. ('GCP', 'Cloud TPU', 'Cloud TPU')
2. ('GCP', 'Assured Workloads', 'Assured Workloads')
3. ('AWS', 'AWSMDC', 'AWS Modular Data Center')
4. ('GCP', 'BeyondCorp Enterprise', 'BeyondCorp Enterprise')
5. ('GCP', 'Navigation API', 'Navigation API')


## 5. Process Service Records

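First check the removal list for duplicate entries, then filter the original services by dropping every record whose `provider`, `service_name`, and `service_alias` all exactly match an entry in the removal list. The last cell of this section reports any removal entries that matched nothing in the original data.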

In [21]:
# Analyze duplicate entries in services_to_remove
print("DUPLICATE ANALYSIS")
print("="*60)

# One key per removal entry (duplicates preserved), plus the de-duplicated set
removal_keys_list = [create_service_key(service) for service in services_to_remove]
removal_keys_unique = set(removal_keys_list)

print(f"Total entries in services_to_remove.ndjson: {len(services_to_remove)}")
print(f"Unique service keys in removal list: {len(removal_keys_unique)}")
print(f"Number of duplicates: {len(services_to_remove) - len(removal_keys_unique)}")

# Count occurrences of each key. These are computed unconditionally so that
# `key_counts` and `duplicates` are always defined after this cell runs.
key_counts = Counter(removal_keys_list)
duplicates = {key: count for key, count in key_counts.items() if count > 1}

# `duplicates` is non-empty exactly when the list is longer than its unique set
if duplicates:
    print("\nFinding duplicate entries...")

    print(f"Found {len(duplicates)} services with duplicates:")
    for i, (key, count) in enumerate(duplicates.items(), 1):
        provider, service_name, service_alias = key
        print(f"{i}. Provider: {provider}")
        print(f"   Service: {service_name}")
        # The alias is part of the key, so show it: two duplicated keys can
        # share provider and service name and differ only by alias.
        print(f"   Alias: {service_alias}")
        print(f"   Appears {count} times in removal list")
        print()

print("="*60 + "\n")

DUPLICATE ANALYSIS
Total entries in services_to_remove.ndjson: 34
Unique service keys in removal list: 34
Number of duplicates: 0



In [22]:
def filter_services(original_services: List[Dict], removal_keys: Set[tuple]) -> Tuple[List[Dict], List[Dict]]:
    """Split services into those to keep and those to remove.

    Each record's key is built with ``create_service_key`` and looked up in
    ``removal_keys``. A record whose key is present goes to the removed list;
    every other record goes to the kept list. Input order is preserved within
    each list, and the input list itself is not modified.

    Args:
        original_services: The full list of service records.
        removal_keys: Keys (as produced by ``create_service_key``) of the
            services that must be dropped.

    Returns:
        A ``(filtered_services, removed_services)`` tuple: the records that
        were kept, followed by the records that were removed.
    """
    filtered_services: List[Dict] = []
    removed_services: List[Dict] = []

    for service in original_services:
        # Route each record to exactly one of the two output lists
        if create_service_key(service) in removal_keys:
            removed_services.append(service)
        else:
            filtered_services.append(service)

    return filtered_services, removed_services

# Split the original list into the records to keep and the records dropped
filtered_services, removed_services = filter_services(original_services, removal_keys)

# "Services to remove" is the size of the removal list;
# "Actually removed" is how many original records matched one of its keys.
print(f"Original services: {len(original_services)}")
print(f"Services to remove: {len(services_to_remove)}")
print(f"Actually removed: {len(removed_services)}")
print(f"Remaining services: {len(filtered_services)}")

print("\n" + "="*60 + "\n")

Original services: 623
Services to remove: 34
Actually removed: 34
Remaining services: 589




In [23]:
# Preview up to five of the records that were dropped
if not removed_services:
    print("No services were removed (no exact matches found)")
else:
    print("Examples of removed services:")
    for i, service in enumerate(removed_services[:5]):
        print(f"{i+1}. Provider: {service['provider']}, Service: {service['service_name']}")

print("\n" + "="*60 + "\n")

# Find services in removal list that won't actually be removed
def find_unmatched_removals(services_to_remove: List[Dict], removed_services: List[Dict]) -> List[Dict]:
    """Return the removal entries that did not remove anything.

    Args:
        services_to_remove: Records from the removal list.
        removed_services: Records that were actually dropped from the
            original data.

    Returns:
        The entries of ``services_to_remove``, in their original order, whose
        key (per ``create_service_key``) does not appear among the keys of
        ``removed_services``. Entries sharing an unmatched key are all kept.
    """
    # Keys of everything that really was removed
    matched_keys = {create_service_key(record) for record in removed_services}

    # A removal entry is unmatched exactly when its key is not among them
    return [
        entry
        for entry in services_to_remove
        if create_service_key(entry) not in matched_keys
    ]

# Find services in removal list that won't be removed
unmatched_removals = find_unmatched_removals(services_to_remove, removed_services)

print(f"Services in removal list that won't be removed: {len(unmatched_removals)}")

if unmatched_removals:
    print("\nThese services are in the removal list but have no exact match in the original data:")
    for i, service in enumerate(unmatched_removals):
        # Read fields with .get(), as create_service_key does. An entry that
        # is missing a field is a likely reason it matched nothing, so this
        # report must still print it rather than fail with KeyError.
        reason = service.get('reason', 'No reason provided')
        print(f"{i+1}. Provider: {service.get('provider', '')}")
        print(f"    Service: {service.get('service_name', '')}")
        print(f"    Alias: {service.get('service_alias', '')}")
        print(f"    Reason: {reason}")
        print()

    # Group unmatched by provider for analysis; tolerate entries that all
    # lack a 'provider' field (no such column would exist in the frame).
    df_unmatched = pd.DataFrame(unmatched_removals)
    if 'provider' in df_unmatched.columns:
        unmatched_by_provider = df_unmatched['provider'].value_counts()
    else:
        unmatched_by_provider = pd.Series(dtype='int64')
    print("Unmatched removals by provider:")
    print(unmatched_by_provider)
else:
    print("All services in the removal list had exact matches and were removed!")

print("\n" + "="*60 + "\n")

# Show provider distribution after filtering. If nothing survived the filter
# the frame has no 'provider' column, so fall back to an empty count.
df_filtered = pd.DataFrame(filtered_services)
if 'provider' in df_filtered.columns:
    provider_counts_filtered = df_filtered['provider'].value_counts()
else:
    provider_counts_filtered = pd.Series(dtype='int64')
print("Provider distribution after filtering:")
print(provider_counts_filtered)

Examples of removed services:
1. Provider: AWS, Service: ComputeSavingsPlans
2. Provider: AWS, Service: Registrar
3. Provider: AWS, Service: AWSMDC
4. Provider: AWS, Service: AWSDataTransfer
5. Provider: AWS, Service: IngestionService


Services in removal list that won't be removed: 0
All services in the removal list had exact matches and were removed!


Provider distribution after filtering:
provider
AWS      223
Azure    188
GCP      178
Name: count, dtype: int64


## 6. Export Processed Data

Save the filtered services to the new `extracted_services.ndjson` file.

In [24]:
def save_ndjson(data: List[Dict], file_path: str) -> None:
    """Write records to a file as NDJSON, one compact JSON document per line.

    The target is opened in write mode as UTF-8, so existing content is
    replaced. Non-ASCII characters are written as-is rather than escaped, and
    the separators carry no extra spaces.

    Args:
        data: Records to serialize, written in order.
        file_path: Destination path.
    """
    compact = (',', ':')
    with open(file_path, 'w', encoding='utf-8') as out:
        # The generator is consumed lazily, so lines are emitted one by one
        out.writelines(
            json.dumps(record, ensure_ascii=False, separators=compact) + '\n'
            for record in data
        )

# Write the kept records to the output location
output_path = '../config/extracted_services.ndjson'
save_ndjson(filtered_services, output_path)

print(f"Successfully saved {len(filtered_services)} filtered services to: {output_path}")

# Sanity-check the result on disk
if not os.path.exists(output_path):
    print("Error: Output file was not created!")
else:
    print(f"\nFile size: {os.path.getsize(output_path)} bytes")

    # Re-read the file and decode its first three lines to confirm they parse
    with open(output_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    print(f"\nFirst 3 lines of the output file:")
    for i, line in enumerate(lines[:3], 1):
        service = json.loads(line.strip())
        print(f"{i}. Provider: {service['provider']}, Service: {service['service_name']}")

# Final recap of the whole run
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"• Original services: {len(original_services)}")
print(f"• Services marked for removal: {len(services_to_remove)}")
print(f"• Services actually removed: {len(removed_services)}")
print(f"• Final filtered services: {len(filtered_services)}")
print(f"• Output file: {output_path}")
print("="*60)

Successfully saved 589 filtered services to: ../config/extracted_services.ndjson

File size: 54270 bytes

First 3 lines of the output file:
1. Provider: AWS, Service: AwsCloudShell
2. Provider: AWS, Service: AmazonEC2
3. Provider: AWS, Service: AmazonVPC

SUMMARY
• Original services: 623
• Services marked for removal: 34
• Services actually removed: 34
• Final filtered services: 589
• Output file: ../config/extracted_services.ndjson
