In [30]:
# Specify Datamodel
from pydantic import BaseModel, Field, field_serializer
from typing import List, Optional
from datetime import datetime
class Release(BaseModel):
    name: str
    date: str

    def to_dict(self):
        return {
            "name": self.name,
            "date": self.date.isoformat()
        }

class Releases(BaseModel):
    releases: List[Release]

    def to_dict(self):
        return {
            "releases": [release.to_dict() for release in self.releases]
        }
        
class APITypeClassification(BaseModel):
    api_type: str
    confidence: float = Field(
        description="Confidence in the assessment, between 0 and 1"
    )
    explanation: str


        
class NewAPITypeClassification(BaseModel):
    api_type: str
    communication_protocol: str
    confidence: float 
    explanation: str



class IntegrationInfo(BaseModel):
    api: str
    introduction_version: str
    active_installations: float
    iot_class: str
    integration_type: Optional[NewAPITypeClassification] = Field(default=None)
    integration_type_new: NewAPITypeClassification = Field(default=None)
    content: str
    categories: List[str]
    release: Optional[Release] = Field(description="Release the integration was introduced", example={"name": "0.110", "date": "2021-01-01 18:07:34+00:00"})
    frequency_of_changes: int = 0
    last_updated: str   = ""
    creation_date: str = ""
    number_of_contributors: int = 0


class SearchResults(BaseModel):
    search_results: List[IntegrationInfo]



In [None]:
# Specify Datamodel
from pydantic import BaseModel, Field, field_serializer
from typing import List, Optional
from datetime import datetime
class Release(BaseModel):
    name: str
    date: str

    def to_dict(self):
        return {
            "name": self.name,
            "date": self.date.isoformat()
        }

class Releases(BaseModel):
    releases: List[Release]

    def to_dict(self):
        return {
            "releases": [release.to_dict() for release in self.releases]
        }
        
class APITypeClassification(BaseModel):
    api_type: str
    confidence: float = Field(
        description="Confidence in the assessment, between 0 and 1"
    )
    explanation: str


        
class NewAPITypeClassification(BaseModel):
    api_type: str
    communication_protocol: str
    confidence: float 
    explanation: str



class IntegrationInfo(BaseModel):
    api: str
    introduction_version: str
    active_installations: float
    iot_class: str
    integration_type: Optional[NewAPITypeClassification] = Field(default=None)
    integration_type_new: NewAPITypeClassification = Field(default=None)
    content: str
    categories: List[str]
    release: Optional[Release] = Field(description="Release the integration was introduced", example={"name": "0.110", "date": "2021-01-01 18:07:34+00:00"})
    frequency_of_changes: int = 0
    last_updated: str   = ""
    creation_date: str = ""
    number_of_contributors: int = 0


class SearchResults(BaseModel):
    search_results: List[IntegrationInfo]

import json
with open('../datasets/interim/iot_integrations.json', 'r') as f:
    new_integrations = json.load(f)
    
import requests

from typing import Dict
class HTMLContent(BaseModel):
    html_content: str
    issues: List[Issue]
    
html_apis: Dict[str, HTMLContent] = {}

for api in new_integrations:
    html_apis[api['url']] = HTMLContent(html_content="", issues=[])


import re
import json

PATH_TO_DATASET = "../data/processed/05-09-2024-home_assistant_iot_integrations.json"
# load dataset
with open(PATH_TO_DATASET, "r") as f:
    data = SearchResults(**json.load(f))
    
# Remove duplicates from the involved_api field
# It can be that the same API integration or parts of it the have been mentioned multiple times
# in the same issue. To remove duplicates the data is 
def clean_url(url):
    pattern = r'^(.*?)/?(?:#.*)?$'
    match = re.match(pattern, url)
    if match:
        return match.group(1)
    return url

def development_path(url: str) -> str:
    """ homeassistant/components/airthings_ble """
    component_name = url.split("www.home-assistant.io/integrations/")[1]
    return f"homeassistant/components/{component_name}"

from github import Github
from datetime import datetime
GITHUB_TOKEN="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"
def get_folder_history(owner, repo_name, path, branch='main'):
    # You need to replace 'YOUR_GITHUB_TOKEN' with your actual GitHub token
    g = Github(GITHUB_TOKEN)
    
    repo = g.get_repo(f"{owner}/{repo_name}")
    commits = repo.get_commits(path=path, sha=branch)
    
    first_commit = None
    last_commit = None
    num_commits = 0
    contributors = set()
    
    for commit in commits:
        num_commits += 1
        if commit.author:
            contributors.add(commit.author.login)
        
        if not last_commit:
            last_commit = commit
        first_commit = commit
    
    num_contributors = len(contributors)
    
    #print(f"First commit: {first_commit.commit.author.date} by {first_commit.commit.author.name}")
    #print(f"Last commit: {last_commit.commit.author.date} by {last_commit.commit.author.name}")
    #print(f"Number of commits: {num_commits}")
    #print(f"Number of contributors: {num_contributors}")
    return first_commit, last_commit, num_commits, num_contributors, contributors


# Example usage
owner = "home-assistant"
repo_name = "core"
branch = "dev"

for iot_api_integration in data.search_results:
    path = development_path(iot_api_integration.api)
    if path == "":
        print(f"Skipping {iot_api_integration.api}")
    
    try:
        first_commit, last_commit, num_commits, num_contributors, contributors = get_folder_history(owner, repo_name, path, branch)
        iot_api_integration.creation_date = first_commit.commit.author.date.isoformat()
        iot_api_integration.last_updated = last_commit.commit.author.date.isoformat()
        iot_api_integration.number_of_contributors = num_contributors
        iot_api_integration.frequency_of_changes = num_commits
    except Exception as e:
        print(f"Error processing {iot_api_integration.api}")
        print(e)
        continue
    
    print(f"Processed {iot_api_integration.api}")

In [32]:
import re
import json

PATH_TO_DATASET = "../data/processed/05-09-2024-home_assistant_iot_integrations.json"
# load dataset
with open(PATH_TO_DATASET, "r") as f:
    data = SearchResults(**json.load(f))
    
# Remove duplicates from the involved_api field
# It can be that the same API integration or parts of it the have been mentioned multiple times
# in the same issue. To remove duplicates the data is 
def clean_url(url):
    pattern = r'^(.*?)/?(?:#.*)?$'
    match = re.match(pattern, url)
    if match:
        return match.group(1)
    return url

def development_path(url: str) -> str:
    """ homeassistant/components/airthings_ble """
    component_name = url.split("www.home-assistant.io/integrations/")[1]
    return f"homeassistant/components/{component_name}"



In [7]:
url = 'https://www.home-assistant.io/integrations/asuswrt'
development_path(url)


'homeassistant/components/asuswrt'

In [5]:
data.search_results[0]

IntegrationInfo(api='https://www.home-assistant.io/integrations/asuswrt', introduction_version='0.83', active_installations=0.011000000000000001, iot_class='Local Polling', integration_type=APITypeClassification(api_type='GatewayApi', confidence=0.85, explanation="The ASUSWRT integration connects Home Assistant to an ASUS router, which indicates interaction through a gateway device (the router). The presence detection and sensor functionalities rely on the router's capabilities, supporting the classification as a GatewayApi. The mention of enabling telnet further supports this classification."), integration_type_new=None, content='\n<header>\n<div class="breadcrumbs">\n<a href="/">Home</a>\n▸ <a href="/integrations/">Integrations</a>\n▸\n</div>\n<h1 class="title indent">\nASUSWRT\n</h1>\n</header>\n<p>The ASUSWRT integration can connect Home Assistant to a ASUS router that runs on ASUSWRT firmware.</p>\n<p>There is currently support for the following device types within Home Assistant:

In [11]:
from github import Github
from datetime import datetime
GITHUB_TOKEN="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"
def get_folder_history(owner, repo_name, path, branch='main'):
    # You need to replace 'YOUR_GITHUB_TOKEN' with your actual GitHub token
    g = Github(GITHUB_TOKEN)
    
    repo = g.get_repo(f"{owner}/{repo_name}")
    commits = repo.get_commits(path=path, sha=branch)
    
    first_commit = None
    last_commit = None
    num_commits = 0
    contributors = set()
    
    for commit in commits:
        num_commits += 1
        if commit.author:
            contributors.add(commit.author.login)
        
        if not last_commit:
            last_commit = commit
        first_commit = commit
    
    num_contributors = len(contributors)
    
    #print(f"First commit: {first_commit.commit.author.date} by {first_commit.commit.author.name}")
    #print(f"Last commit: {last_commit.commit.author.date} by {last_commit.commit.author.name}")
    #print(f"Number of commits: {num_commits}")
    #print(f"Number of contributors: {num_contributors}")
    return first_commit, last_commit, num_commits, num_contributors, contributors


# Example usage
owner = "home-assistant"
repo_name = "core"
path = development_path(url)
branch = "dev"

first_commit, last_commit, num_commits, num_contributors, contributors = get_folder_history(owner, repo_name, path, branch)


print(contributors)

{'springstan', 'bskaplou', 'tkdrob', 'c0ffeeca7', 'Mariusthvdb', 'fabaff', 'cgtobi', 'pkishino', 'kennedyshead', 'jsoref', 'timmo001', 'homeassistant', 'joostlek', 'Knapoc', 'cdce8p', 'RogerSelwyn', 'balloob', 'jwater7', 'epenet', 'scop', 'JJdeVries', 'Chen-IL', 'bieniu', 'frenck', 'milanmeu', 'jpbede', 'ollo69', 'GitHub-Action', 'Danielhiversen', 'emontnemery', 'thecode', 'autinerd', 'bdraco', 'Misiu', 'Swamp-Ig'}


In [13]:
last_commit.commit.author.date.isoformat()

'2024-09-03T15:11:17+00:00'

In [14]:
for iot_api_integration in data.search_results:
    path = development_path(iot_api_integration.api)
    if path == "":
        print(f"Skipping {iot_api_integration.api}")
    
    try:
        first_commit, last_commit, num_commits, num_contributors, contributors = get_folder_history(owner, repo_name, path, branch)
        iot_api_integration.creation_date = first_commit.commit.author.date.isoformat()
        iot_api_integration.last_updated = last_commit.commit.author.date.isoformat()
        iot_api_integration.number_of_contributors = num_contributors
        iot_api_integration.frequency_of_changes = num_commits
    except Exception as e:
        print(f"Error processing {iot_api_integration.api}")
        print(e)
        continue
    
    print(f"Processed {iot_api_integration.api}")

Processed https://www.home-assistant.io/integrations/asuswrt
Processed https://www.home-assistant.io/integrations/xiaomi_miio
Processed https://www.home-assistant.io/integrations/yeelight
Processed https://www.home-assistant.io/integrations/starline
Processed https://www.home-assistant.io/integrations/vallox
Processed https://www.home-assistant.io/integrations/smartthings
Processed https://www.home-assistant.io/integrations/sun
Processed https://www.home-assistant.io/integrations/google_assistant
Processed https://www.home-assistant.io/integrations/tuya
Processed https://www.home-assistant.io/integrations/solax
Processed https://www.home-assistant.io/integrations/voip
Processed https://www.home-assistant.io/integrations/tado
Processed https://www.home-assistant.io/integrations/plex
Processed https://www.home-assistant.io/integrations/saj
Processed https://www.home-assistant.io/integrations/unifi
Processed https://www.home-assistant.io/integrations/modbus
Processed https://www.home-assi

In [17]:
special_cases_apis = {
    "https://www.home-assistant.io/integrations/climate.mqtt": "homeassistant/components/mqtt",
    "https://www.home-assistant.io/integrations/camera.ffmpeg":"homeassistant/components/ffmpeg",
    "https://www.home-assistant.io/integrations/switch.mqtt":"homeassistant/components/mqtt",
    "https://www.home-assistant.io/integrations/alarm_control_panel.mqtt": "homeassistant/components/mqtt",
    "https://www.home-assistant.io/integrations/light.mqtt": "homeassistant/components/mqtt",
    "https://www.home-assistant.io/integrations/faadelays": "homeassistant/components/faa_delays"
}

for iot_api_integration in data.search_results:
    if iot_api_integration.api not in special_cases_apis:
        continue
    path = special_cases_apis[iot_api_integration.api]    
    try:
        first_commit, last_commit, num_commits, num_contributors, contributors = get_folder_history(owner, repo_name, path, branch)
        iot_api_integration.creation_date = first_commit.commit.author.date.isoformat()
        iot_api_integration.last_updated = last_commit.commit.author.date.isoformat()
        iot_api_integration.number_of_contributors = num_contributors
        iot_api_integration.frequency_of_changes = num_commits
    except Exception as e:
        print(f"Error processing {iot_api_integration.api}")
        print(e)
        continue
    
    print(f"Processed {iot_api_integration.api}")

Processed https://www.home-assistant.io/integrations/climate.mqtt
Processed https://www.home-assistant.io/integrations/camera.ffmpeg
Processed https://www.home-assistant.io/integrations/switch.mqtt
Processed https://www.home-assistant.io/integrations/alarm_control_panel.mqtt
Processed https://www.home-assistant.io/integrations/light.mqtt
Processed https://www.home-assistant.io/integrations/faadelays


In [42]:
from openai import OpenAI
OPENAI_KEY="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"
client = OpenAI(api_key=OPENAI_KEY)


INTEGRATION_TYPE_PROMPT = """

You are an AI tasked with classifying APIs into four categories based on the content provided. The categories are DeviceApi, GatewayApi, PlatformApi, and UnknownApi.
Please repeat the prompt back as you understand it.
Specifics:
1. Determine if the API is a DeviceApi by identifying direct communication with a device, device-specific protocols, or local network access.
2. Identify a GatewayApi if the API interacts through a gateway, referencing a hub/bridge/gateway device, vendor-specific ecosystem, or local network communication.
3. Classify as PlatformApi if the API supports multiple devices across different vendors using a shared API, cloud services, OAuth authentication, or REST APIs.
4. Use UnknownApi if none of the above categories apply.
5. Consider common IoT communication protocols such as Zigbee, Z-Wave, Thread, Matter, MQTT, CoAP, HTTP.

Return a JSON object with the following structure:
{{
"api_type": string,
"communication_protocol": string,
"confidence": float,
"explanation": string
}}

The 'api_type' field should be one of 'DeviceApi', 'GatewayApi', 'PlatformApi', or 'UnknownApi'.
The 'communication_protocol' field should be a string indicating the communication protocol mentioned in the content.
The 'confidence' field should be a float between 0 and 1, indicating your confidence in the assessment.
The 'explanation' field should provide a brief rationale for your decision, referencing specific parts of the content if applicable.

Content to analyze:
{content}
Ensure your response is a valid JSON object and nothing else.

"""



def analyze_content(content: str) -> NewAPITypeClassification:
    prompt = INTEGRATION_TYPE_PROMPT.replace("{content}", content)
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are an AI assistant specialized in analyzing software development discussions, particularly those related to API changes. Your task is to accurately determine if the given content is about API changes and provide a structured analysis.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.2,
        response_format=NewAPITypeClassification,
    )

    result = json.loads(response.choices[0].message.content)
    return NewAPITypeClassification(**result)


for api in data.search_results:
    integration_info = api.content
    api.integration_type_new = analyze_content(integration_info)
    print(api.integration_type_new)
    

api_type='GatewayApi' communication_protocol='telnet' confidence=0.85 explanation="The ASUSWRT integration connects Home Assistant to an ASUS router, which indicates interaction through a gateway device (the router). The presence detection and sensor functionalities rely on the router's capabilities, supporting the classification as a GatewayApi. The mention of enabling telnet further supports this classification."
api_type='GatewayApi' communication_protocol='Zigbee, HTTP' confidence=0.95 explanation="The content discusses the Xiaomi Miio integration, which connects Home Assistant with various Xiaomi devices, including gateways and subdevices. The mention of 'Xiaomi Gateway' and the ability to control connected subdevices indicates that this API functions as a gateway, facilitating communication between multiple devices and the Home Assistant platform."
api_type='PlatformApi' communication_protocol='Wi-Fi' confidence=0.9 explanation='The Yeelight integration allows control of Yeelight

In [19]:
with open("../data/processed/05-09-2024-home_assistant_iot_integrations_updated.json", "w") as f:
    data = data.dict()
    # Drop integration_type
    for integration in data["search_results"]:
        integration.pop("integration_type")
    
    # Rename integration_type_new to integration_type
    for integration in data["search_results"]:
        integration["integration_type"] = integration.pop("integration_type_new")
        
    json.dump(data, f, indent=2)

AttributeError: 'dict' object has no attribute 'dict'

{'api': 'https://www.home-assistant.io/integrations/xiaomi_miio',
 'introduction_version': '0.51',
 'active_installations': 0.057999999999999996,
 'iot_class': 'Local Polling',
 'integration_type': {'api_type': 'GatewayApi',
  'communication_protocol': 'Zigbee, HTTP',
  'confidence': 0.95,
  'explanation': "The content discusses the Xiaomi Miio integration, which connects Home Assistant with various Xiaomi devices, including gateways and subdevices. The mention of 'Xiaomi Gateway' and the ability to control connected subdevices indicates that this API functions as a gateway, facilitating communication between multiple devices and the Home Assistant platform."},
 'integration_type_new': None,
 'content': '\n<header>\n<div class="breadcrumbs">\n<a href="/">Home</a>\n▸ <a href="/integrations/">Integrations</a>\n▸\n</div>\n<h1 class="title indent">\nXiaomi Miio\n</h1>\n</header>\n<p>The <strong>Xiaomi Miio</strong> <span class="terminology">integration<span class="terminology-tooltip">Inte

In [22]:
from copy import deepcopy
new_data = deepcopy(data)

In [38]:
commit_dict = {}
for integration in new_data['search_results']:
    commit_dict[integration["api"]] = {
        "creation_date": integration["creation_date"],
        "last_updated": integration["last_updated"],
        "number_of_contributors": integration["number_of_contributors"],
        "frequency_of_changes": integration["frequency_of_changes"]
    }
    

In [41]:
for integration in data.search_results:
    if integration.api not in commit_dict:
        continue
    integration.creation_date = commit_dict[integration.api]["creation_date"]
    integration.last_updated = commit_dict[integration.api]["last_updated"]
    integration.number_of_contributors = commit_dict[integration.api]["number_of_contributors"]
    integration.frequency_of_changes = commit_dict[integration.api]["frequency_of_changes"]
    

In [40]:
commit_dict.keys()



In [44]:
with open("../data/processed/05-09-2024-home_assistant_iot_integrations_.json", "w") as f:
    dict_data = data.dict()
    for integration in dict_data["search_results"]:
        integration.pop("integration_type_new")
    json.dump(dict_data, f, indent=2)
    