In [25]:
import json
from collections import defaultdict
from datetime import datetime

def load_json_file(file_path):
    """Load a JSON document from disk.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin UTF-8 so the result does not depend on the platform's default
    # locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def load_jsonl_file(file_path):
    """Load a JSON Lines file and return its records as a list.

    Each non-blank line is decoded as one JSON value. Blank lines (for
    example a trailing empty line at the end of the file) are skipped
    instead of raising a decode error.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list with one decoded record per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin UTF-8 so decoding does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]

def save_to_jsonl_file(data, file_path):
    """Write records to a JSON Lines file, one JSON document per line.

    Args:
        data: Iterable of JSON-serialisable records.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, 'w') as sink:
        # Serialise each record and terminate it with a newline.
        sink.writelines(json.dumps(entry) + '\n' for entry in data)

# ActivityProcessor: groups raw actions into activities using the mapping definitions
class ActivityProcessor:
    """Groups raw actions into higher-level activities using a mapping.

    The mapping is a dict whose ``"activities"`` entry is a list of activity
    definitions. As read by this class, each definition has a ``name``, a
    ``time_window`` string such as ``"30s"`` and an ``actions`` list whose
    entries carry an ``action`` type, an ``optional`` flag and, optionally,
    ``validate_with`` rules.
    """

    def __init__(self, mapping):
        """Store the list of activity definitions from ``mapping["activities"]``."""
        self.mapping = mapping["activities"]

    def process_actions(self, actions):
        """Turn a list of actions into a list of activities.

        Actions are ordered by their ``date`` string, partitioned by
        ``(actor id, repository id)`` and each partition is matched
        independently.

        Args:
            actions: List of action dicts with ``date``, ``action``,
                ``actor`` and ``repository`` entries.

        Returns:
            List of activity dicts as built by ``create_activity``.
        """
        actions = sorted(actions, key=lambda x: x['date'])
        grouped_by_actor_repo = defaultdict(list)

        for action in actions:
            key = (action['actor']['id'], action['repository']['id'])
            grouped_by_actor_repo[key].append(action)

        grouped_activities = []
        for group in grouped_by_actor_repo.values():
            grouped_activities.extend(self.process_group(group))
        return grouped_activities

    def process_group(self, group):
        """Consume ``group`` front to back, emitting every activity that matches.

        ``group`` is emptied in the process: the head action is popped and
        ``match_activity`` removes any follow-up actions it claims. Head
        actions that match no definition are dropped.
        """
        activities = []
        while group:
            current_action = group.pop(0)
            activity = self.match_activity(current_action, group)

            if activity:
                activities.append(activity)

        return activities

    @staticmethod
    def _parse_date(value):
        """Parse an ISO-8601 timestamp, accepting a trailing ``Z``."""
        # datetime.fromisoformat only understands the "Z" suffix from
        # Python 3.11 on; normalise it so older interpreters work too.
        if value.endswith('Z'):
            value = value[:-1] + '+00:00'
        return datetime.fromisoformat(value)

    def match_activity(self, action, remaining_actions):
        """Try each activity definition with ``action`` as the first action.

        Args:
            action: The action that starts the candidate activity.
            remaining_actions: Later actions of the same group, in date
                order. Actions claimed by a successful match are removed
                from this list in place; a failed attempt leaves it
                untouched so the next definition sees every candidate.

        Returns:
            The activity dict for the first definition that matches, or
            ``None`` when no definition does.
        """
        for activity_def in self.mapping:
            time_window = int(activity_def['time_window'].replace('s', ''))

            # The starting action must be one of the definition's action types.
            if not any(action['action'] == action_def['action']
                       for action_def in activity_def['actions']):
                continue

            matched_actions = [action]
            start_time = self._parse_date(action['date'])
            for next_action in remaining_actions:
                next_time = self._parse_date(next_action['date'])
                # The list is date-ordered, so everything after this point
                # is outside the window as well.
                if (next_time - start_time).total_seconds() > time_window:
                    break

                if self.is_action_valid(next_action, activity_def, matched_actions):
                    matched_actions.append(next_action)

            if not self.validate_required_actions(activity_def, matched_actions):
                continue

            # Only now that the match is confirmed are the claimed actions
            # taken out of the pool (by identity, so equal-looking records
            # are not confused with each other).
            claimed = {id(matched) for matched in matched_actions}
            remaining_actions[:] = [
                candidate for candidate in remaining_actions
                if id(candidate) not in claimed
            ]
            return self.create_activity(activity_def, matched_actions)

        return None

    def is_action_valid(self, action, activity_def, matched_actions):
        """Return whether ``action`` may join the activity being built.

        The first entry of ``activity_def['actions']`` with the same action
        type decides: its ``validate_with`` rules are applied when present,
        otherwise the action is accepted. Unknown action types are rejected.
        """
        for action_def in activity_def['actions']:
            if action['action'] == action_def['action']:
                if "validate_with" in action_def:
                    return self.validate_with(action, matched_actions, action_def["validate_with"])
                return True
        return False

    def validate_with(self, action, matched_actions, validation_rules):
        """Check ``action`` against already matched actions.

        For every rule, each matched action whose type equals the rule's
        ``target_action`` must agree with ``action`` on all listed field
        pairs (``field`` read from ``action``, ``target_field`` from the
        matched action). A rule with no matched target action passes.
        """
        for rule in validation_rules:
            target_action = rule['target_action']
            target_fields = rule['fields']

            for matched_action in matched_actions:
                if matched_action['action'] == target_action:
                    for field_rule in target_fields:
                        source_value = self.get_nested_value(action, field_rule['field'])
                        target_value = self.get_nested_value(matched_action, field_rule['target_field'])

                        if source_value != target_value:
                            return False
        return True

    def validate_required_actions(self, activity_def, matched_actions):
        """Return whether every non-optional action type has been matched.

        An action definition without an ``optional`` flag is treated as
        required.
        """
        matched_action_types = {a['action'] for a in matched_actions}
        return all(
            a['action'] in matched_action_types
            for a in activity_def['actions']
            if not a.get('optional', False)
        )

    def create_activity(self, activity_def, matched_actions):
        """Build the activity record for a successful match.

        ``actor`` and ``repository`` are taken from the first matched action
        and lifted to the activity level. The per-action entries are copies
        without those two keys, so the caller's action dicts are not modified.
        """
        first_action = matched_actions[0]
        return {
            "activity": activity_def['name'],
            "start_date": first_action['date'],
            "end_date": matched_actions[-1]['date'],
            "actor": first_action['actor'],
            "repository": first_action['repository'],
            "actions": [
                {k: v for k, v in matched.items() if k not in ('actor', 'repository')}
                for matched in matched_actions
            ],
        }

    def get_nested_value(self, obj, field_path):
        """Follow a dotted ``field_path`` through nested dicts.

        Returns ``None`` when a key is missing or an intermediate value is
        not a dict.
        """
        keys = field_path.split('.')
        for key in keys:
            if isinstance(obj, dict):
                obj = obj.get(key)
            else:
                return None
        return obj

# File paths: activity mapping (JSON), input actions (JSON Lines) and the
# output activities file (JSON Lines).
activity_mapping_file = '../data/mapping/action-activity-mapping.json'
actions_file = '../data/test/actions/gh_all_actions.jsonl'
activities_output_file = '../data/test/activities/gh_all_activities-v2.jsonl'

# Pipeline: load the mapping and the actions, group the actions into
# activities, then write the activities out one JSON record per line.
activity_mapping = load_json_file(activity_mapping_file)
actions = load_jsonl_file(actions_file)
mapper = ActivityProcessor(activity_mapping)
activities = mapper.process_actions(actions)
save_to_jsonl_file(activities, activities_output_file)

In [None]:
import json
from datetime import datetime, timedelta
from collections import defaultdict

class ActivityProcessor:
    """Builds activities from actions using a mapping loaded from a JSON file.

    The loaded mapping is expected to have an ``"activities"`` list. As read
    by this class, each definition has a ``name``, a ``time_window`` string
    such as ``"30s"`` and an ``actions`` list of steps; each step has an
    ``action`` type, ``optional`` and ``repeat`` flags and, optionally,
    ``validate_with`` rules.
    """

    # Timestamp layout used by the action records' ``date`` field.
    _DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(self, mapping_file):
        """Load the mapping from the JSON file at ``mapping_file``."""
        with open(mapping_file, "r", encoding="utf-8") as file:
            self.mapping = json.load(file)

    def process_actions(self, actions):
        """Process actions into activities based on the mapping.

        Actions are partitioned by ``(actor id, repository id)`` and each
        partition is matched independently.

        Returns:
            List of activity dicts from all partitions.
        """
        grouped_actions = self.group_actions_by_actor_repo(actions)
        all_activities = []

        for action_list in grouped_actions.values():
            all_activities.extend(self.build_activities(action_list))

        return all_activities

    def group_actions_by_actor_repo(self, actions):
        """Group actions by ``(actor id, repository id)``."""
        grouped = defaultdict(list)
        for action in actions:
            key = (action['actor']['id'], action['repository']['id'])
            grouped[key].append(action)
        return grouped

    def build_activities(self, actions):
        """Build activities from the actions of one actor/repository pair.

        The actions are processed in ``date`` order (the caller's list is
        not reordered). Every action not yet consumed is used as the base
        of a match attempt against each activity definition in turn.
        """
        ordered = sorted(actions, key=lambda x: x['date'])
        used_actions = set()
        activities = []

        for action in ordered:
            if action['event_id'] in used_actions:
                continue

            for activity_def in self.mapping['activities']:
                matched_activity = self.match_activity(action, ordered, activity_def, used_actions)
                if matched_activity:
                    activities.append(matched_activity)
                # Once the base action belongs to an activity, stop: further
                # definitions would only produce activities anchored on an
                # action they cannot contain.
                if action['event_id'] in used_actions:
                    break

        return activities

    def match_activity(self, base_action, actions, activity_def, used_actions):
        """Try to match ``activity_def`` around ``base_action``.

        Args:
            base_action: Action whose date anchors the time window.
            actions: All actions of the group.
            activity_def: One activity definition from the mapping.
            used_actions: Set of ``event_id`` values already consumed. It is
                updated only when the whole definition matches, so a failed
                attempt does not hide actions from later definitions.

        Returns:
            The activity dict, or ``None`` when a required step has no match
            or no action was matched at all.
        """
        time_window = int(activity_def['time_window'].replace('s', ''))
        start_time = datetime.strptime(base_action['date'], self._DATE_FORMAT)
        end_time = start_time

        activity_actions = []
        claimed_ids = set()  # event ids tentatively taken by this attempt

        for step in activity_def['actions']:
            # Hide what earlier steps of this attempt already took so the
            # same action cannot be matched by two steps.
            matched_steps = self.match_step(
                base_action, actions, step, used_actions | claimed_ids, start_time, time_window
            )

            if not matched_steps and not step['optional']:
                return None  # Required step not matched; nothing is consumed

            for matched_action in matched_steps:
                activity_actions.append(matched_action)
                claimed_ids.add(matched_action['event_id'])

                # Extend end time to the latest matched action
                action_time = datetime.strptime(matched_action['date'], self._DATE_FORMAT)
                end_time = max(end_time, action_time)

        if not activity_actions:
            return None

        # The definition matched as a whole: consume the actions for real.
        used_actions.update(claimed_ids)
        return {
            "activity": activity_def['name'],
            "start_date": base_action['date'],
            "end_date": end_time.strftime(self._DATE_FORMAT),
            "actor": base_action['actor'],
            "repository": base_action['repository'],
            "actions": activity_actions,
        }

    def match_step(self, base_action, actions, step, used_actions, start_time, time_window):
        """Collect the unused actions that satisfy one step.

        An action qualifies when its type equals ``step['action']``, its date
        lies within ``time_window`` seconds of ``start_time`` (before or
        after) and it passes the step's ``validate_with`` rules, if any.

        Returns:
            All qualifying actions when ``step['repeat']`` is truthy,
            otherwise at most the first one.
        """
        matched_actions = []

        for action in actions:
            if action['event_id'] in used_actions:
                continue

            # Validate action type
            if action['action'] != step['action']:
                continue

            # Validate time window
            action_time = datetime.strptime(action['date'], self._DATE_FORMAT)
            if abs((action_time - start_time).total_seconds()) > time_window:
                continue

            # Validate fields
            if 'validate_with' in step:
                if not self.validate_action(action, base_action, step['validate_with']):
                    continue

            matched_actions.append(action)

        return matched_actions if step['repeat'] else matched_actions[:1]

    def validate_action(self, action, base_action, validations):
        """Check ``action`` against ``base_action`` using the validation rules.

        Every field pair of every rule must agree: ``field`` is read from
        ``action`` and ``target_field`` from ``base_action``. The rules are
        always evaluated against ``base_action``; no other key of a rule is
        consulted.
        """
        for validation in validations:
            for field_map in validation['fields']:
                source_value = self.extract_field(action, field_map['field'])
                target_value = self.extract_field(base_action, field_map['target_field'])
                if source_value != target_value:
                    return False
        return True

    def extract_field(self, data, field_path):
        """Follow a dotted ``field_path`` through nested dicts.

        Returns ``None`` when a key is missing or an intermediate value is
        not a dict.
        """
        keys = field_path.split('.')
        for key in keys:
            if not isinstance(data, dict):
                return None
            data = data.get(key)
        return data

# Example usage: four PushCommits actions by the same actor on the same
# repository, dated within two seconds of each other, each pushing to a
# different ref.
actions = [
    {"action": "PushCommits", "event_id": "41517517158", "date": "2024-09-01T00:25:06Z", "actor": {"id": 31115101, "login": "lilyminium"}, "repository": {"id": 518010502, "name": "MDAnalysis/mdakit-cookie", "organisation": "MDAnalysis", "organisation_id": 11445951}, "details": {"push": {"id": 20025231065, "ref": "refs/heads/TestMDAKit_with_host_MDAnalysis_anaconda-deps_and_no-ReadTheDocs", "commits": 1}}},
    {"action": "PushCommits", "event_id": "41517517041", "date": "2024-09-01T00:25:06Z", "actor": {"id": 31115101, "login": "lilyminium"}, "repository": {"id": 518010502, "name": "MDAnalysis/mdakit-cookie", "organisation": "MDAnalysis", "organisation_id": 11445951}, "details": {"push": {"id": 20025231000, "ref": "refs/heads/TestMDAKit_with_host_MDAnalysis_anaconda-deps_and_ReadTheDocs", "commits": 1}}},
    {"action": "PushCommits", "event_id": "41517517306", "date": "2024-09-01T00:25:07Z", "actor": {"id": 31115101, "login": "lilyminium"}, "repository": {"id": 518010502, "name": "MDAnalysis/mdakit-cookie", "organisation": "MDAnalysis", "organisation_id": 11445951}, "details": {"push": {"id": 20025231141, "ref": "refs/heads/TestMDAKit_with_host_MDAnalysis_condaforge-deps_and_ReadTheDocs", "commits": 1}}},
    {"action": "PushCommits", "event_id": "41517517456", "date": "2024-09-01T00:25:08Z", "actor": {"id": 31115101, "login": "lilyminium"}, "repository": {"id": 518010502, "name": "MDAnalysis/mdakit-cookie", "organisation": "MDAnalysis", "organisation_id": 11445951}, "details": {"push": {"id": 20025231223, "ref": "refs/heads/TestMDAKit_with_host_MDAnalysis_condaforge-deps_and_no-ReadTheDocs", "commits": 1}}}
]

# Path of the activity mapping JSON that the processor loads on construction.
mapping_file = "../data/mapping/action-activity-mapping.json"

# Build the processor and group the sample actions into activities.
processor = ActivityProcessor(mapping_file)
grouped_activities = processor.process_actions(actions)

# Pretty-print the resulting activities for inspection.
print(json.dumps(grouped_activities, indent=4))

In [79]:
import json
from datetime import datetime, timedelta
from collections import defaultdict

class ActivityProcessor:
    """Builds activities from actions using a mapping loaded from a JSON file.

    The loaded mapping is expected to have an ``"activities"`` list. As read
    by this class, each definition has a ``name``, a ``time_window`` string
    such as ``"30s"`` and an ``actions`` list of steps; each step has an
    ``action`` type, ``optional`` and ``repeat`` flags and, optionally,
    ``validate_with`` rules.
    """

    # Timestamp layout used by the action records' ``date`` field.
    _DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(self, mapping_file):
        """Load the activity mapping from the JSON file at ``mapping_file``."""
        self.mapping = self.load_mapping(mapping_file)

    @staticmethod
    def load_mapping(file_path):
        """Loads the activity mapping JSON file."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def process_actions(self, actions):
        """Process actions into activities.

        Actions are partitioned by ``(actor id, repository id)`` and each
        partition is matched independently. When some actions end up in no
        activity, their count is printed.

        Returns:
            List of activity dicts from all partitions.
        """
        # Group actions by actor and repository
        grouped_actions = defaultdict(list)
        for action in actions:
            key = (action['actor']['id'], action['repository']['id'])
            grouped_actions[key].append(action)

        # Process each group
        activities = []
        for action_list in grouped_actions.values():
            activities.extend(self.match_activities(action_list))

        # Report actions that were not consumed by any activity
        consumed_action_ids = {a['event_id'] for act in activities for a in act['actions']}
        unmatched_actions = [a for a in actions if a['event_id'] not in consumed_action_ids]
        if unmatched_actions:
            print(f"Unmatched actions: {len(unmatched_actions)}")
        return activities

    def match_activities(self, actions):
        """Match the actions of one actor/repository pair into activities.

        Actions are visited in ``date`` order; each one not yet consumed is
        used as the base action of a match attempt.
        """
        actions = sorted(actions, key=lambda x: x['date'])  # Sort by date
        activities = []
        used_actions = set()

        for action in actions:
            if action['event_id'] in used_actions:
                continue

            # Match the activity starting with this action
            matched_activity = self.match_activity(action, actions, used_actions)
            if matched_activity:
                activities.append(matched_activity)

        return activities

    def match_activity(self, base_action, actions, used_actions):
        """Return the first activity definition that matches around ``base_action``.

        A definition matches only when every non-optional step finds at
        least one action; a definition with a missing required step is
        skipped entirely rather than emitted with the steps matched so far.

        Args:
            base_action: Action whose date anchors the time window.
            actions: All actions of the group.
            used_actions: Set of ``event_id`` values already consumed;
                extended with the matched actions on success.

        Returns:
            The activity dict (per-action entries are copies without
            ``actor`` and ``repository``), or ``None`` when nothing matches.
        """
        start_time = datetime.strptime(base_action['date'], self._DATE_FORMAT)

        for activity_def in self.mapping['activities']:
            time_window = int(activity_def['time_window'].replace('s', ''))
            activity_actions = []
            claimed_ids = set()  # event ids taken by earlier steps of this attempt

            for step in activity_def['actions']:
                # Hide what earlier steps already took so one action cannot
                # be matched by two steps of the same definition.
                matched_steps = self.match_step(
                    base_action, actions, step, used_actions | claimed_ids, start_time, time_window
                )
                if not matched_steps and not step['optional']:
                    break  # Required step not matched, move to next activity definition
                activity_actions.extend(matched_steps)
                claimed_ids.update(a['event_id'] for a in matched_steps)
            else:
                # Reached only when no required step was missing.
                if activity_actions:
                    end_time = start_time  # Used as-is for single-action activities
                    if len(activity_actions) > 1:
                        end_time = max(
                            datetime.strptime(a['date'], self._DATE_FORMAT)
                            for a in activity_actions
                        )
                    used_actions.update(claimed_ids)
                    return {
                        "activity": activity_def['name'],
                        "start_date": start_time.strftime(self._DATE_FORMAT),
                        "end_date": end_time.strftime(self._DATE_FORMAT),
                        "actor": base_action['actor'],
                        "repository": base_action['repository'],
                        "actions": [
                            {k: v for k, v in action.items() if k not in ['actor', 'repository']}
                            for action in activity_actions
                        ],
                    }

        return None

    def match_step(self, base_action, actions, step, used_actions, start_time, time_window):
        """Collect the unused actions that satisfy one step.

        An action qualifies when its type equals ``step['action']``, its date
        lies within ``time_window`` seconds of ``start_time`` (before or
        after) and it passes every ``validate_with`` rule of the step.

        Returns:
            All qualifying actions for a repeating step, otherwise a list
            with at most the first qualifying action.
        """
        matched = []
        rules = step.get('validate_with', [])

        for action in actions:
            if action['event_id'] in used_actions:
                continue
            if action['action'] != step['action']:
                continue

            action_time = datetime.strptime(action['date'], self._DATE_FORMAT)
            if abs((action_time - start_time).total_seconds()) > time_window:
                continue

            # Check validation rules, if any
            if not all(self.validate_action(base_action, action, rule) for rule in rules):
                continue

            matched.append(action)
            # Stop only after a successful match, so a candidate that fails
            # validation does not prevent later candidates from being tried.
            if not step['repeat']:
                break

        return matched

    def validate_action(self, base_action, target_action, rule):
        """Validate an action based on a rule.

        Every field pair of the rule must agree: ``field`` is read from
        ``base_action`` and ``target_field`` from ``target_action``.
        """
        # NOTE(review): ``field`` is read from the base action and
        # ``target_field`` from the candidate; confirm this is the direction
        # the mapping file intends.
        for field_map in rule['fields']:
            base_value = self.extract_field(base_action, field_map['field'])
            target_value = self.extract_field(target_action, field_map['target_field'])
            if base_value != target_value:
                return False
        return True

    @staticmethod
    def extract_field(record, field_path):
        """Extracts a field value from a nested record.

        Follows the dotted ``field_path`` through nested dicts and returns
        ``None`` when a key is missing or an intermediate value is not a dict.
        """
        for field in field_path.split('.'):
            if not isinstance(record, dict):
                return None
            record = record.get(field)
        return record

# File paths: activity mapping (JSON), input actions (JSON Lines) and the
# output activities file (JSON Lines).
activity_mapping_file = '../data/mapping/action-activity-mapping.json'
actions_file = '../data/test/actions/gh_all_actions.jsonl'
activities_output_file = '../data/test/activities/gh_all_activities-v5.jsonl'

# Load actions (load_jsonl_file is defined in an earlier cell)
actions = load_jsonl_file(actions_file)

# Process actions: the processor reads the mapping file itself on construction
processor = ActivityProcessor(activity_mapping_file)
grouped_activities = processor.process_actions(actions)

# Save the resulting activities, one JSON record per line
# (save_to_jsonl_file is defined in an earlier cell)
save_to_jsonl_file(grouped_activities, activities_output_file)