In [1]:
!pip install trackintel
import pandas as pd
import trackintel as ti
import gzip
import geopandas as gpd
from pathlib import Path

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Maps each city key to its gzip-compressed CSV, given relative to the
# notebook's working directory. The file-name stems are not uniform:
# city_a uses "groundtruthdata" while city_b/c/d use "challengedata".
data_paths = {
    'city_a': "./cityA_groundtruthdata.csv.gz",
    'city_b': "./cityB_challengedata.csv.gz",
    'city_c': "./cityC_challengedata.csv.gz",
    'city_d': "./cityD_challengedata.csv.gz"
}


In [3]:
# Function to load and decompress .gz files into a DataFrame
def load_compressed_csv(file_path):
    """Read a gzip-compressed CSV file into a DataFrame.

    Decompression is delegated to pandas instead of wrapping the file in
    ``gzip.open(..., 'rt')``. The text-mode wrapper decodes with the platform's
    locale encoding, so the same file could parse differently (or fail) on
    another machine; ``read_csv`` decodes as UTF-8 by default.

    Args:
        file_path: Path (``str`` or ``Path``) to the ``.gz`` file.

    Returns:
        pandas.DataFrame with the parsed CSV content.
    """
    # compression is stated explicitly so the file is always treated as gzip,
    # regardless of its extension (matching the old gzip.open behaviour).
    return pd.read_csv(file_path, compression='gzip')

In [4]:
# Function to clean and preprocess the data
def process_city_data(city_key):
    """Clean one city's raw data and write ``processed_data_<city_key>.csv``.

    Steps:
      1. Load the compressed CSV registered under ``city_key`` in ``data_paths``.
      2. Drop rows whose ``x`` or ``y`` equals the sentinel ``-999``.
      3. Build a UTC ``tracked_at`` timestamp from the day index ``d`` and the
         30-minute slot index ``t``.
      4. Rename ``uid``/``x``/``y`` to ``user_id``/``longitude``/``latitude``.
      5. Keep only the first 30 days, counted from the earliest timestamp.
      6. Save the result as CSV in the working directory.

    Args:
        city_key: Key into ``data_paths`` (for example ``'city_b'``).

    Raises:
        KeyError: If ``city_key`` is not in ``data_paths`` or an expected
            column is missing.
    """
    # Load data from compressed file
    print(f"Loading data for {city_key}...")
    raw_data = load_compressed_csv(data_paths[city_key])

    # Remove rows with invalid coordinates (-999)
    print("Cleaning data...")
    valid_data = raw_data[(raw_data['x'] != -999) & (raw_data['y'] != -999)].copy()

    # Combine day index ('d') and slot index ('t') into 'tracked_at'.
    # The previous strptime('%j') parse only accepts day-of-year 1..366, so a
    # day index of 0 (or above 366) was coerced to NaT and then silently
    # removed by the date filter below. Plain timedelta arithmetic accepts any
    # integer day. The anchor is chosen so that d=1 still maps to 1900-01-01,
    # exactly as '%j' did, which keeps existing outputs unchanged for d >= 1.
    day_zero = pd.Timestamp('1899-12-31')
    day_offset = pd.to_timedelta(valid_data['d'], unit='D', errors='coerce')
    slot_offset = pd.to_timedelta(valid_data['t'] * 30, unit='m')
    valid_data['tracked_at'] = (day_zero + day_offset + slot_offset).dt.tz_localize('UTC')

    # Rename columns for consistency.
    # NOTE(review): x/y are relabelled as longitude/latitude without any
    # conversion. If they are grid indices rather than degrees, downstream
    # haversine distances will not be in meters. Confirm against the data spec.
    valid_data = valid_data.rename(
        columns={'uid': 'user_id', 'x': 'longitude', 'y': 'latitude'}
    )

    # Filter data to only include the first 30 days (first month)
    print("Filtering data for the first 30 days...")
    start_date = valid_data['tracked_at'].min()  # earliest timestamp (NaT if empty)
    end_date = start_date + pd.Timedelta(days=30)  # exclusive cutoff
    in_window = (valid_data['tracked_at'] >= start_date) & (valid_data['tracked_at'] < end_date)
    valid_data = valid_data[in_window]

    # Save cleaned data
    output_file = Path(f"processed_data_{city_key}.csv")
    valid_data.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")

In [5]:
# Function to write triplegs data to a CSV file
def export_triplegs_to_csv(triplegs, output_file, **kwargs):
    """Serialise triplegs to CSV with their geometry written as WKT text.

    Args:
        triplegs: Object exposing ``to_wkt(rounding_precision=..., trim=...)``
            whose result exposes ``to_csv``.
        output_file: Destination passed straight to ``to_csv``.
        **kwargs: Extra keyword arguments forwarded to ``to_csv``.
    """
    # rounding_precision=-1 and trim=False are forwarded unchanged to to_wkt.
    triplegs.to_wkt(rounding_precision=-1, trim=False).to_csv(output_file, **kwargs)

In [6]:
# Function to preprocess data and generate triplegs
def create_triplegs(city_key):
    """Run the full pipeline for one city: clean, detect staypoints, build triplegs.

    Side effects: writes ``processed_data_<city_key>.csv`` (via
    ``process_city_data``) and ``triplegs_<city_key>.csv`` to the working
    directory, and prints progress messages.

    Args:
        city_key: Key identifying the city (for example ``'city_b'``).
    """
    # Stage 1: clean the raw data and persist it to disk.
    process_city_data(city_key)

    # Stage 2: read the cleaned file back as trackintel positionfixes.
    cleaned_csv = Path(f"processed_data_{city_key}.csv")
    print(f"Reading preprocessed data for {city_key}...")
    pfs = ti.read_positionfixes_csv(cleaned_csv)

    # Stage 3: staypoint detection with the sliding-window method.
    print("Identifying staypoints...")
    staypoint_options = dict(
        method='sliding',
        dist_threshold=1,             # Distance threshold in meters
        time_threshold=90,            # Time threshold in minutes
        gap_threshold=300,            # Gap threshold in minutes
        distance_metric='haversine',
        include_last=True,
        exclude_duplicate_pfs=True,
        print_progress=True,
        n_jobs=-1,
    )
    pfs, stps = pfs.as_positionfixes.generate_staypoints(**staypoint_options)

    # Stage 4: triplegs are the movement segments between staypoints.
    print("Generating triplegs...")
    pfs, tpls = ti.preprocessing.generate_triplegs(
        pfs, stps, method='between_staypoints', gap_threshold=90
    )

    # Stage 5: persist the triplegs as WKT inside a CSV.
    triplegs_csv = Path(f"triplegs_{city_key}.csv")
    export_triplegs_to_csv(tpls, triplegs_csv, index=False)
    print(f"Triplegs exported to {triplegs_csv}")


In [7]:
# Run the whole pipeline for city B. This writes processed_data_city_b.csv and
# triplegs_city_b.csv to the working directory; in the captured run below the
# staypoint pass alone took about five minutes.
create_triplegs('city_b')

Loading data for city_b...
Cleaning data...
Filtering data for the first 30 days...
Processed data saved to processed_data_city_b.csv
Reading preprocessed data for city_b...




Identifying staypoints...


100%|████████████████████████████████████████████████████████████████████████████| 24906/24906 [05:12<00:00, 79.75it/s]


Generating triplegs...




Triplegs exported to triplegs_city_b.csv


In [8]:
import pandas as pd
import re
from shapely.wkt import loads
from collections import defaultdict
from pathlib import Path

# Step 1: Extract coordinates from LINESTRING using shapely
def extract_coordinates_from_linestring(linestring):
    """
    Parse a WKT string and return its vertices.

    Args:
        linestring: WKT text, parsed with ``shapely.wkt.loads``.

    Returns:
        list of coordinate tuples, taken from the parsed geometry's ``coords``.
    """
    return list(loads(linestring).coords)

# Step 2: Load and preprocess the triplegs data
def get_tripleg_sequences(triplegs_file):
    """
    Load a triplegs CSV and turn every row's ``geom`` WKT into a coordinate list.

    Args:
        triplegs_file: Path to a CSV that has a ``geom`` column of WKT strings.

    Returns:
        list with one entry per CSV row, each entry being the list of
        coordinate tuples of that row's geometry, in file order.
    """
    tripleg_rows = pd.read_csv(triplegs_file)
    return [
        extract_coordinates_from_linestring(record['geom'])
        for _, record in tripleg_rows.iterrows()
    ]

# Step 3: Implement the GSP algorithm to mine sequential patterns
def gsp_mine_sequential_patterns(sequences, min_support, max_pattern_length):
    """
    Mine frequent sequential patterns with a level-wise, GSP-style search.

    The support of a pattern is the fraction of input sequences that contain
    it as an order-preserving (not necessarily contiguous) subsequence. Every
    sequence contributes at most once to a pattern's support, so support is
    always within [0, 1].

    Args:
        sequences: Sized collection of sequences; each sequence is an iterable
            of hashable items (here: coordinate tuples).
        min_support: Minimum support (fraction of sequences) to keep a pattern.
        max_pattern_length: Longest pattern length to search for.

    Returns:
        dict mapping pattern length to a list of ``(pattern, support)`` pairs.
        For length 1 the pattern is the bare item; for longer lengths it is a
        tuple of items. The level at which the search stopped for lack of
        frequent patterns is present with an empty list.
    """
    num_sequences = len(sequences)

    # Step 3.1: Count, for every item, how many sequences contain it.
    # dict.fromkeys de-duplicates within a sequence while keeping first-seen
    # order, so an item repeated inside one sequence is counted only once.
    item_counts = defaultdict(int)
    for sequence in sequences:
        for item in dict.fromkeys(sequence):
            item_counts[item] += 1

    # Step 3.2: Keep the frequent items (length-1 patterns).
    frequent_patterns = {1: []}
    for item, count in item_counts.items():
        support = count / num_sequences
        if support >= min_support:
            frequent_patterns[1].append((item, support))

    # Candidate generation works on pattern tuples only. The support values
    # must never be mixed into the patterns themselves.
    previous_level = [(item,) for item, _ in frequent_patterns[1]]

    # Step 3.3: Grow patterns one item at a time up to max_pattern_length.
    for length in range(2, max_pattern_length + 1):
        candidates = generate_candidates(previous_level, length)
        candidate_counts = defaultdict(int)

        # Count support for each candidate sequence (once per input sequence).
        for sequence in sequences:
            for candidate in candidates:
                if is_subsequence(candidate, sequence):
                    candidate_counts[candidate] += 1

        # Store frequent patterns of the current length
        frequent_patterns[length] = []
        for candidate, count in candidate_counts.items():
            support = count / num_sequences
            if support >= min_support:
                frequent_patterns[length].append((candidate, support))

        # If no frequent patterns were found for this length, stop
        if len(frequent_patterns[length]) == 0:
            break

        previous_level = [pattern for pattern, _ in frequent_patterns[length]]

    return frequent_patterns

# Helper function to generate candidate sequences from frequent patterns
def generate_candidates(frequent_patterns, length):
    """
    Build candidate patterns of ``length`` items from frequent shorter patterns.

    Two patterns ``p`` and ``q`` are joined when ``p`` without its first item
    equals ``q`` without its last item; the candidate is ``p`` followed by the
    last item of ``q``. For single-item patterns this yields every ordered
    pair, including ``(a, a)`` and both ``(a, b)`` and ``(b, a)``, because
    order matters in a sequence. A candidate is discarded if any pattern
    obtained by deleting one of its items is not frequent, since such a
    candidate cannot be frequent itself.

    Args:
        frequent_patterns: Iterable of pattern tuples (items only, no support
            values), all of the same length ``length - 1``.
        length: Target candidate length. Kept for interface compatibility; the
            join derives the length from the patterns themselves.

    Returns:
        list of candidate tuples, without duplicates.
    """
    patterns = [tuple(pattern) for pattern in frequent_patterns]
    frequent_set = set(patterns)

    # Index patterns by "everything but the last item" so that join partners
    # are found by lookup instead of scanning all pairs.
    by_prefix = defaultdict(list)
    for pattern in patterns:
        by_prefix[pattern[:-1]].append(pattern)

    candidates = []
    for left in patterns:
        for right in by_prefix.get(left[1:], ()):
            candidate = left + right[-1:]
            # Prune: every one-item-shorter sub-pattern must itself be frequent.
            if all(
                candidate[:k] + candidate[k + 1:] in frequent_set
                for k in range(len(candidate))
            ):
                candidates.append(candidate)
    return candidates

# Helper function to check if a pattern is a subsequence of a sequence
def is_subsequence(pattern, sequence):
    """
    Check whether ``pattern`` occurs in ``sequence`` in order, gaps allowed.

    Args:
        pattern: Iterable of items to look for.
        sequence: Iterable of items to search in.

    Returns:
        True if all items of ``pattern`` appear in ``sequence`` in the same
        relative order; False otherwise.
    """
    # A single shared iterator is consumed left to right: each `in` test
    # resumes where the previous match stopped, which enforces the ordering.
    it = iter(sequence)
    return all(item in it for item in pattern)

# Step 4: Export the frequent patterns to a file
def export_frequent_patterns(frequent_patterns, output_file):
    """
    Write the mined patterns to ``output_file`` as a plain-text report.

    For every pattern length the report contains a ``Length-<n> Patterns:``
    header, one ``<pattern>: <support>`` line per pattern (support shown with
    four decimals), and a trailing blank line.

    Args:
        frequent_patterns: dict mapping length to a list of
            ``(pattern, support)`` pairs.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, 'w') as report:
        for length, patterns in frequent_patterns.items():
            section = [f"Length-{length} Patterns:\n"]
            section.extend(f"{pattern}: {support:.4f}\n" for pattern, support in patterns)
            section.append("\n")
            report.writelines(section)

# Step 5: Main function to run the process and mine patterns
def create_and_mine_triplegs(city_key, min_support, max_pattern_length):
    """
    Mine sequential patterns from an already exported triplegs file.

    Reads ``triplegs_<city_key>.csv`` from the working directory, mines
    patterns, prints the resulting dict, and writes it to
    ``frequent_patterns_<city_key>.csv``.

    Args:
        city_key: City identifier used to build the input and output file names.
        min_support: Minimum support passed on to the miner.
        max_pattern_length: Maximum pattern length passed on to the miner.
    """
    # Triplegs file as written by `create_triplegs`, converted to coordinate sequences.
    tripleg_sequences = get_tripleg_sequences(Path(f"triplegs_{city_key}.csv"))

    # Mine and echo the patterns.
    mined = gsp_mine_sequential_patterns(tripleg_sequences, min_support, max_pattern_length)
    print(mined)

    # Persist the patterns next to the notebook.
    export_frequent_patterns(mined, f"frequent_patterns_{city_key}.csv")
    print(f"Frequent patterns have been saved to 'frequent_patterns_{city_key}.csv'.")

# Example usage
# `city_key` is a module-level name: the following cell reads it to locate
# triplegs_<city_key>.csv, so it must be set before that cell runs.
city_key = 'city_b'  # Example city key
# End-to-end mining call, currently disabled:
# create_and_mine_triplegs(city_key, min_support=0.1, max_pattern_length=2)


In [9]:
# Depends on `city_key`, `Path` and `get_tripleg_sequences` from the previous
# cell, and on triplegs_<city_key>.csv already existing in the working directory.
triplegs_file = Path(f"triplegs_{city_key}.csv")
    
# Convert triplegs to sequences of (x, y) pairs
# (`sequences` stays at module level because the next cell writes it to disk.)

sequences = get_tripleg_sequences(triplegs_file)


In [10]:
import csv

def save_sequences_to_csv(sequences, filename):
    """Write one CSV row per sequence, each cell holding a point as ``(x,y)`` text.

    Args:
        sequences: Iterable of sequences; each element of a sequence is
            indexable, and its first two entries are written.
        filename: Destination path; an existing file is overwritten.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='') as out:
        csv.writer(out).writerows(
            [f'({point[0]},{point[1]})' for point in seq]
            for seq in sequences
        )

# Example usage:
# Writes the `sequences` loaded in the previous cell. The output name is
# hard-coded and does not follow `city_key`.
save_sequences_to_csv(sequences, 'sequences_b.csv')
