# In [3]:
import json
from datetime import datetime
import logging

# Configure the root logger once: INFO and above, one "time - LEVEL - message" line per record.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


class ActionMapper:
    """Group low-level action records into higher-level activities.

    The mapper is driven by an activity mapping: a dict whose ``"activities"``
    list holds one definition per activity. Each definition provides a
    ``"name"``, a ``"time_window"`` (whole seconds with an ``s`` suffix, e.g.
    ``"3s"``) and an ``"actions"`` list whose entries name an ``"action"`` and
    may flag it as ``"optional"``.

    Action records are dicts with at least ``"action"``, ``"event_id"``,
    ``"date"``, ``"details"``, ``"actor"`` (with ``"id"``) and
    ``"repository"`` (with ``"id"``).
    """

    def __init__(self, activity_mapping):
        """Store the mapping after preprocessing it.

        Note that ``activity_mapping`` is annotated in place (see
        ``preprocess_activities``); the caller's dict is modified.
        """
        self.activity_mapping = self.preprocess_activities(activity_mapping)

    @staticmethod
    def preprocess_activities(activity_mapping):
        """Annotate every activity with its required and optional action names.

        Adds two sets, ``"required_actions"`` and ``"optional_actions"``, to
        each activity definition. The mapping is modified in place and also
        returned.
        """
        for activity in activity_mapping["activities"]:
            required, optional = set(), set()
            for definition in activity["actions"]:
                bucket = optional if definition.get("optional", False) else required
                bucket.add(definition["action"])
            activity["required_actions"] = required
            activity["optional_actions"] = optional
        return activity_mapping

    @staticmethod
    def parse_time(iso_string):
        """Convert an ISO 8601 string to a ``datetime``.

        Every ``"Z"`` character is stripped before parsing, so a timestamp
        such as ``2024-01-01T00:00:00Z`` yields a naive ``datetime``.
        """
        return datetime.fromisoformat(iso_string.replace("Z", ""))

    @staticmethod
    def calculate_time_diff(start, end):
        """Return ``end - start`` in seconds for two ISO 8601 strings."""
        elapsed = ActionMapper.parse_time(end) - ActionMapper.parse_time(start)
        return elapsed.total_seconds()

    @staticmethod
    def within_time_limit(start_time, end_time, limit):
        """Return True when ``end_time - start_time`` does not exceed ``limit``.

        ``limit`` is a string of whole seconds with an ``s`` suffix (``"3s"``).
        """
        allowed_seconds = int(limit.replace("s", ""))
        return ActionMapper.calculate_time_diff(start_time, end_time) <= allowed_seconds

    @staticmethod
    def get_nested_value(data, field):
        """Look up a dotted path such as ``"release.tag_name"`` in ``data``.

        Returns None when any key along the path is missing, maps to None, or
        when an intermediate value has no ``get`` method (for example a string
        or a list) instead of raising ``AttributeError``.
        """
        for key in field.split("."):
            try:
                data = data.get(key)
            except AttributeError:
                # Path runs through a non-mapping value: treat it as "not found".
                return None
            if data is None:
                return None
        return data

    def group_actions(self, actions):
        """Group actions by ``(actor id, repository id)``, keeping input order."""
        grouped = {}
        for action in actions:
            key = (action["actor"]["id"], action["repository"]["id"])
            grouped.setdefault(key, []).append(action)
        return grouped

    def validate_action(self, action, gathered_actions, rules=None):
        """Check ``action`` against its ``validate_with`` rules.

        Rules come from two places: a ``"validate_with"`` list on the action
        record itself, plus the optional ``rules`` argument, which
        ``gather_actions`` fills from the activity definition.
        NOTE(review): the mapping file is assumed to declare ``validate_with``
        on its action definitions -- confirm against the mapping schema.

        A rule passes when at least one already gathered action named
        ``rule["target_action"]`` agrees with ``action`` on every
        ``field``/``target_field`` pair. Values are read from the ``"details"``
        dicts; two missing values compare equal because both resolve to None.

        Returns False (and logs a warning) on the first rule that fails.
        """
        combined_rules = list(action.get("validate_with", [])) + list(rules or [])
        for rule in combined_rules:
            candidates = [
                target for target in gathered_actions
                if target["action"] == rule["target_action"]
            ]
            matched = any(
                all(
                    self.get_nested_value(action["details"], pair["field"])
                    == self.get_nested_value(target["details"], pair["target_field"])
                    for pair in rule["fields"]
                )
                for target in candidates
            )
            if not matched:
                logging.warning("Validation failed for %s", action["action"])
                return False
        return True

    def _gather_indices(self, action_list, start_idx, activity, skip=frozenset()):
        """Return the positions in ``action_list`` that belong to ``activity``.

        Starts at ``start_idx`` and walks forward. An action is kept when it is
        named by the activity, passes validation, and lies within the time
        window measured from the most recently kept action. The walk stops at
        the first action outside that window. Positions listed in ``skip`` are
        ignored.
        """
        relevant = activity["required_actions"] | activity["optional_actions"]
        time_window = activity["time_window"]

        # Validation rules declared on the activity's action definitions.
        rules_by_action = {}
        for definition in activity.get("actions", []):
            declared = definition.get("validate_with") or []
            rules_by_action.setdefault(definition["action"], []).extend(declared)

        indices = [start_idx]
        gathered = [action_list[start_idx]]
        for idx in range(start_idx + 1, len(action_list)):
            if idx in skip:
                continue
            candidate = action_list[idx]
            if not self.within_time_limit(gathered[-1]["date"], candidate["date"], time_window):
                break
            name = candidate["action"]
            if name not in relevant:
                continue
            if self.validate_action(candidate, gathered, rules_by_action.get(name)):
                indices.append(idx)
                gathered.append(candidate)
        return indices

    def gather_actions(self, action_list, start_idx, activity):
        """Gather actions that match an activity within the time window.

        Returns ``(gathered, found)``: the matching action records in list
        order (always starting with ``action_list[start_idx]``) and the set of
        their action names.
        """
        indices = self._gather_indices(action_list, start_idx, activity)
        gathered = [action_list[idx] for idx in indices]
        found = {action["action"] for action in gathered}
        return gathered, found

    def map_actions_to_activities(self, actions):
        """Map actions to activities based on the schema.

        Actions are grouped per (actor, repository) and sorted by ``"date"``.
        For each action not yet used, the activity definitions are tried in
        order; the first one whose required actions are all gathered produces
        an activity record. Gathered actions are tracked by position so that
        each is used at most once, and actions lying between gathered ones are
        still considered as the start of their own activity.

        Returns the activity records sorted by ``"start_date"``.
        """
        activities = []
        for action_list in self.group_actions(actions).values():
            action_list.sort(key=lambda record: record["date"])
            consumed = set()
            for position, action in enumerate(action_list):
                if position in consumed:
                    continue
                for activity in self.activity_mapping["activities"]:
                    relevant = activity["required_actions"] | activity["optional_actions"]
                    if action["action"] not in relevant:
                        continue
                    indices = self._gather_indices(action_list, position, activity, consumed)
                    found = {action_list[idx]["action"] for idx in indices}
                    if activity["required_actions"].issubset(found):
                        gathered = [action_list[idx] for idx in indices]
                        activities.append(self.create_activity(activity["name"], gathered))
                        consumed.update(indices)
                        break
        return sorted(activities, key=lambda record: record["start_date"])

    @staticmethod
    def create_activity(name, actions):
        """Build an activity record from a non-empty list of gathered actions.

        Start date, actor and repository come from the first action, the end
        date from the last. Each action is reduced to its ``action``,
        ``event_id``, ``date`` and ``details`` keys.
        """
        first, last = actions[0], actions[-1]
        kept_keys = ("action", "event_id", "date", "details")
        return {
            "activity": name,
            "start_date": first["date"],
            "end_date": last["date"],
            "actor": first["actor"],
            "repository": first["repository"],
            "actions": [{key: action[key] for key in kept_keys} for action in actions],
        }


# Utility Functions
def load_json_file(file_path):
    """Return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.

    Raises ``OSError`` if the file cannot be opened and
    ``json.JSONDecodeError`` if its content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def load_jsonl_file(file_path):
    """Return the records of a JSON Lines file as a list, in file order.

    The file is decoded as UTF-8. Lines that are empty or contain only
    whitespace (such as a trailing blank line) are skipped rather than handed
    to the JSON parser.

    Raises ``OSError`` if the file cannot be opened and
    ``json.JSONDecodeError`` if a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]

def save_to_jsonl_file(data, file_path):
    """Write each item of ``data`` to ``file_path`` as one JSON document per line.

    The file is created or truncated; every record is followed by a newline.
    """
    with open(file_path, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)

# In [4]:
# File Paths
# All three are relative paths, so they resolve against the current working directory.
activity_mapping_file = 'mapping/action-activity-mapping.json'
actions_file = '../data/datasets/actions/gh_all_actions.jsonl'
activities_output_file = '../data/datasets/activities/gh_all_activities.jsonl'

# Process
# Load the mapping (JSON) and the action records (JSON Lines), map the actions
# to activities, then write the activities out as JSON Lines.
activity_mapping = load_json_file(activity_mapping_file)
actions = load_jsonl_file(actions_file)
mapper = ActionMapper(activity_mapping)
activities = mapper.map_actions_to_activities(actions)
# The output file is opened in 'w' mode, so an existing file is overwritten.
save_to_jsonl_file(activities, activities_output_file)