In [6]:
# Task 1: How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)?

from DbConnector import DbConnector

# Step 1: Connect to the database
# NOTE(review): the credentials are hard-coded in this cell; consider loading
# them from the environment or a config file before sharing the notebook.
db_connector = DbConnector(DATABASE='strava_mongoDB', HOST="localhost", USER="admin", PASSWORD="admin123")

try:
    db = db_connector.db

    # Collections
    users_collection = db['users']
    activities_collection = db['activities']

    # Step 2: Count the number of users
    user_count = users_collection.count_documents({})
    print(f"Total number of users: {user_count}")

    # Step 3: Count the number of activities
    activity_count = activities_collection.count_documents({})
    print(f"Total number of activities: {activity_count}")

    # Step 4: Count the total number of trackpoints across all activities.
    # Trackpoints are embedded as an array in each activity document, so the
    # total is the sum of the array lengths. Summing $size on the server avoids
    # $unwind, which would materialise one intermediate document per trackpoint.
    # $ifNull makes activities without a `trackpoints` field count as zero,
    # which matches how $unwind silently dropped them.
    pipeline = [
        {
            "$group": {
                "_id": None,
                "totalTrackpoints": {
                    "$sum": {"$size": {"$ifNull": ["$trackpoints", []]}}
                },
            }
        }
    ]

    trackpoint_count_result = list(activities_collection.aggregate(pipeline))

    # The $group stage yields no document at all when the collection is empty
    trackpoint_count = trackpoint_count_result[0]['totalTrackpoints'] if trackpoint_count_result else 0
    print(f"Total number of trackpoints: {trackpoint_count}")
finally:
    # Step 5: Close the connection, even if one of the queries above failed
    db_connector.close_connection()


You are connected to the database: strava_mongoDB
-----------------------------------------------

Total number of users: 182
Total number of activities: 16048
Total number of trackpoints: 9681756

-----------------------------------------------
Connection to strava_mongoDB-db is closed


In [7]:
#Task 2: Find the average number of activities per user

from DbConnector import DbConnector

# Step 1: Connect to the database
# NOTE(review): the credentials are hard-coded in this cell; consider loading
# them from the environment or a config file before sharing the notebook.
db_connector = DbConnector(DATABASE='strava_mongoDB', HOST="localhost", USER="admin", PASSWORD="admin123")

try:
    db = db_connector.db

    # Collections
    users_collection = db['users']

    # Step 2: Count each user's activities, then total them over all users.
    # $ifNull treats a user document without an `activity_ids` field as having
    # zero activities; a bare $size would abort the whole aggregation on it.
    pipeline = [
        {
            "$project": {
                "num_activities": {"$size": {"$ifNull": ["$activity_ids", []]}}
            }
        },
        {
            "$group": {
                "_id": None,
                "total_activities": {"$sum": "$num_activities"},  # Activities summed over all users
                "total_users": {"$sum": 1}  # One per user document
            }
        }
    ]

    # Step 3: Run the aggregation pipeline
    result = list(users_collection.aggregate(pipeline))

    # Step 4: Calculate the average number of activities per user.
    # `result` is empty when the users collection has no documents.
    if result and result[0]['total_users'] > 0:
        total_activities = result[0]['total_activities']
        total_users = result[0]['total_users']
        average_activities_per_user = total_activities / total_users
        print(f"Average number of activities per user: {average_activities_per_user:.2f}")
    else:
        print("No users found or no activities data.")
finally:
    # Step 5: Close the connection, even if the aggregation above failed
    db_connector.close_connection()



You are connected to the database: strava_mongoDB
-----------------------------------------------

Average number of activities per user: 88.18

-----------------------------------------------
Connection to strava_mongoDB-db is closed


In [10]:
#Task 3: 
# Find the top 20 users with the highest number of activities. 

# Ensure this part is included at the start of your code
from DbConnector import DbConnector

# Step 1: Use DbConnector to connect to the database
# NOTE: this connection is not closed in this cell; the following task cells
# reuse `users_collection` and `activities_collection` without reconnecting.
db_connector = DbConnector(DATABASE='strava_mongoDB', HOST="localhost", USER="admin", PASSWORD="admin123")

# Collections
db = db_connector.db
users_collection = db['users']
activities_collection = db['activities']

# Step 2: Find the top 20 users with the highest number of activities
TOP_N_USERS = 20

pipeline = [
    {
        "$project": {
            "_id": 1,  # Keep the user id in the result
            # A user document without `activity_ids` counts as zero activities
            # instead of making $size fail the whole aggregation.
            "num_activities": {"$size": {"$ifNull": ["$activity_ids", []]}}
        }
    },
    {
        # Most activities first; `_id` breaks ties so that the cut-off made by
        # $limit is the same on every run when several users share a count.
        "$sort": {"num_activities": -1, "_id": 1}
    },
    {
        "$limit": TOP_N_USERS
    }
]

# Run the aggregation query
top_users = list(users_collection.aggregate(pipeline))

# Display the top users with the highest number of activities
for user in top_users:
    print(f"User ID: {user['_id']}, Number of Activities: {user['num_activities']}")





You are connected to the database: strava_mongoDB
-----------------------------------------------

User ID: 128, Number of Activities: 2102
User ID: 153, Number of Activities: 1793
User ID: 25, Number of Activities: 715
User ID: 163, Number of Activities: 704
User ID: 62, Number of Activities: 691
User ID: 144, Number of Activities: 563
User ID: 41, Number of Activities: 399
User ID: 85, Number of Activities: 364
User ID: 4, Number of Activities: 346
User ID: 140, Number of Activities: 345
User ID: 167, Number of Activities: 320
User ID: 68, Number of Activities: 280
User ID: 17, Number of Activities: 265
User ID: 3, Number of Activities: 261
User ID: 14, Number of Activities: 236
User ID: 126, Number of Activities: 215
User ID: 30, Number of Activities: 210
User ID: 112, Number of Activities: 208
User ID: 11, Number of Activities: 201
User ID: 39, Number of Activities: 198


In [11]:
# Task 4: 
# Find all users who have taken a taxi



# Relies on `activities_collection` and `users_collection` from an earlier cell.

# Step 1: Query the activities collection for activities whose transportation_mode
# is "taxi"; only `_id` is projected because nothing else is needed.
taxi_activities = activities_collection.find({"transportation_mode": "taxi"}, {"_id": 1})

# Step 2: Extract the activity IDs of taxi activities
taxi_activity_ids = [activity["_id"] for activity in taxi_activities]

# Step 3: Find users whose activity_ids array contains any taxi activity ID.
# Only `_id` is projected so the (potentially large) activity_ids arrays are
# not transferred just to print the user id.
taxi_users = users_collection.find({"activity_ids": {"$in": taxi_activity_ids}}, {"_id": 1})

# Step 4: Display the user IDs of users who have taken a taxi
print("Users who have taken a taxi:")
for user in taxi_users:
    print(f"User ID: {user['_id']}")


Users who have taken a taxi:
User ID: 10
User ID: 21
User ID: 52
User ID: 56
User ID: 58
User ID: 62
User ID: 65
User ID: 78
User ID: 80
User ID: 84
User ID: 85
User ID: 98
User ID: 111
User ID: 114
User ID: 126
User ID: 128
User ID: 139
User ID: 153
User ID: 163
User ID: 167
User ID: 175


In [12]:
# Task 5:

# Find all types of transportation modes and count how many activities that are
# tagged with these transportation mode labels. Do not count the rows where
# the mode is null.



# Relies on `activities_collection` from an earlier cell.

# Keep only activities that carry a transportation mode label
labelled_only = {"$match": {"transportation_mode": {"$ne": None}}}

# One output document per distinct label, with the number of activities using it
count_per_mode = {
    "$group": {
        "_id": "$transportation_mode",
        "count": {"$sum": 1},
    }
}

# Most frequently used labels first
most_common_first = {"$sort": {"count": -1}}

pipeline = [labelled_only, count_per_mode, most_common_first]

# Run the aggregation pipeline
transportation_modes = list(activities_collection.aggregate(pipeline))

# Display the results
print("Transportation modes and their activity counts:")
for entry in transportation_modes:
    label, occurrences = entry['_id'], entry['count']
    print(f"Mode: {label}, Count: {occurrences}")




Transportation modes and their activity counts:
Mode: walk, Count: 1008
Mode: bike, Count: 619
Mode: car, Count: 493
Mode: bus, Count: 471
Mode: subway, Count: 190
Mode: taxi, Count: 125
Mode: airplane, Count: 4
Mode: train, Count: 2
Mode: run, Count: 1
Mode: boat, Count: 1


In [13]:
# 6a
# Find the year with the most activities


# Relies on `activities_collection` from an earlier cell.

# Bucket activities by the calendar year of their start timestamp and count
# how many fall into each bucket.
count_per_year = {
    "$group": {
        "_id": {"$year": "$start_date_time"},
        "count": {"$sum": 1},
    }
}
busiest_first = {"$sort": {"count": -1}}
top_year_only = {"$limit": 1}

pipeline = [count_per_year, busiest_first, top_year_only]

# Execute the pipeline
result = list(activities_collection.aggregate(pipeline))

# Display the result; an empty list means the collection holds no activities
if not result:
    print("No activities found.")
else:
    top_year = result[0]
    most_active_year, activity_count = top_year['_id'], top_year['count']
    print(f"The year with the most activities is {most_active_year} with {activity_count} activities.")




The year with the most activities is 2008 with 5895 activities.


In [22]:
# 6b
#Is this also the year with most recorded hours?

# Relies on `activities_collection` from an earlier cell.

# Per-year totals of recorded hours and activity counts, sorted by hours.
# Both rankings are derived from this one aggregation, so the comparison below
# does not depend on a variable left behind by another cell.
MS_PER_HOUR = 1000 * 60 * 60

pipeline = [
    {
        "$addFields": {
            "duration_hours": {
                "$divide": [
                    {"$subtract": ["$end_date_time", "$start_date_time"]},
                    MS_PER_HOUR  # Convert milliseconds to hours
                ]
            }
        }
    },
    {
        # An activity is attributed entirely to the year in which it starts
        "$group": {
            "_id": {"$year": "$start_date_time"},
            "total_hours": {"$sum": "$duration_hours"},
            "activity_count": {"$sum": 1}
        }
    },
    {
        "$sort": {"total_hours": -1}  # Year with the most recorded hours first
    }
]

# Execute the pipeline (one small document per year)
result = list(activities_collection.aggregate(pipeline))

# Display the result
if result:
    top_by_hours = result[0]
    year_with_most_hours = top_by_hours['_id']
    total_hours = top_by_hours['total_hours']
    activity_count_in_most_hours_year = top_by_hours['activity_count']
    print(f"The year with the most recorded hours is {year_with_most_hours} with {total_hours} hours.")

    # Compare with the year with the most activities, taken from the same result set
    year_with_most_activities = max(result, key=lambda row: row['activity_count'])['_id']
    if year_with_most_activities == year_with_most_hours:
        print("Yes, this is also the year with the most activities.")
    else:
        print("No, this is a different year than the year with the most activities.")
else:
    print("No activities found.")




The year with the most recorded hours is 2009 with 11612.423888888889 hours.
No, this is a different year than the year with the most activities.


In [26]:
# 7
# Find the total distance (in km) walked in 2008, by user with id=112.


# Relies on `users_collection` and `activities_collection` from an earlier cell.
from datetime import datetime

# Query parameters, kept in one place so the filter and the messages cannot drift apart
target_user_id = "112"
target_year = 2008
target_mode = "walk"

# Step 1: Retrieve the activity IDs for the user from the users collection
user = users_collection.find_one({"_id": target_user_id})

# Step 2: Check if the user has activity_ids, then filter the activities by year and mode
if user and "activity_ids" in user:
    activity_ids = user["activity_ids"]

    # Step 3: Sum the distance of the user's activities of the given mode that
    # start within the target year. A half-open date range selects the same
    # activities as comparing $year inside $expr, but is a plain field
    # comparison that can use an index on start_date_time.
    # NOTE(review): assumes start_date_time is stored as a BSON date in UTC — confirm.
    pipeline = [
        {
            "$match": {
                "_id": {"$in": activity_ids},  # Only this user's activities
                "transportation_mode": target_mode,
                "start_date_time": {
                    "$gte": datetime(target_year, 1, 1),
                    "$lt": datetime(target_year + 1, 1, 1),
                },
            }
        },
        {
            "$group": {
                "_id": None,
                "total_distance_walked_km": {"$sum": "$total_distance_km"}
            }
        }
    ]

    # Execute the pipeline
    result = list(activities_collection.aggregate(pipeline))

    # Display the result; $group yields nothing when no activity matched
    if result:
        total_distance_walked = result[0]['total_distance_walked_km']
        print(f"The total distance walked by user {target_user_id} in {target_year} is {total_distance_walked} km.")
    else:
        print(f"No walking activities found for user {target_user_id} in {target_year}.")
else:
    print(f"User {target_user_id} does not exist or has no associated activities.")



The total distance walked by user 112 in 2008 is 256.3828245787455 km.


In [29]:
# 8
# Find the top 20 users who have gained the most altitude meters.
# TODO: this task has no implementation in this cell yet.




In [32]:
# 9
# Find all users who have invalid activities, and the number of invalid activities per user 



# Relies on `users_collection` and `activities_collection` from an earlier cell.
# An activity is flagged as invalid when any two consecutive trackpoint
# timestamps are at least five minutes apart.

# Stages that are the same for every user are built once, outside the loop.

# Reduce each activity to the array of its trackpoint timestamps
timestamps_only = {"$project": {"trackpoints": "$trackpoints.date_time"}}

# Difference between the timestamp at position idx and the one just before it
gap_to_previous = {
    "$subtract": [
        {"$arrayElemAt": ["$trackpoints", "$$idx"]},
        {"$arrayElemAt": ["$trackpoints", {"$subtract": ["$$idx", 1]}]},
    ]
}

# True when at least one consecutive pair is too far apart
flag_invalid = {
    "$project": {
        "invalid_activity": {
            "$anyElementTrue": {
                "$map": {
                    "input": {"$range": [1, {"$size": "$trackpoints"}]},
                    "as": "idx",
                    "in": {"$gte": [gap_to_previous, 5 * 60 * 1000]},  # 5 minutes in milliseconds
                }
            }
        }
    }
}

keep_invalid = {"$match": {"invalid_activity": True}}
count_invalid = {"$count": "invalid_activity_count"}

# Collected as {"user_id": ..., "invalid_activity_count": ...} records
user_invalid_activity_counts = []

# One aggregation per user, restricted to that user's own activities
for user_doc in users_collection.find({}, {"_id": 1, "activity_ids": 1}):
    user_id = user_doc["_id"]
    activity_ids = user_doc.get("activity_ids", [])

    pipeline = [
        {"$match": {"_id": {"$in": activity_ids}}},
        timestamps_only,
        flag_invalid,
        keep_invalid,
        count_invalid,
    ]
    result = list(activities_collection.aggregate(pipeline))

    # $count emits nothing when no activity was flagged, so such users are skipped
    if not result:
        continue

    user_invalid_activity_counts.append(
        {"user_id": user_id, "invalid_activity_count": result[0]["invalid_activity_count"]}
    )

# Display the result
for entry in user_invalid_activity_counts:
    print(f"User ID: {entry['user_id']}, Invalid Activity Count: {entry['invalid_activity_count']}")


User ID: 0, Invalid Activity Count: 101
User ID: 1, Invalid Activity Count: 45
User ID: 2, Invalid Activity Count: 98
User ID: 3, Invalid Activity Count: 179
User ID: 4, Invalid Activity Count: 219
User ID: 5, Invalid Activity Count: 45
User ID: 6, Invalid Activity Count: 17
User ID: 7, Invalid Activity Count: 30
User ID: 8, Invalid Activity Count: 16
User ID: 9, Invalid Activity Count: 31
User ID: 10, Invalid Activity Count: 50
User ID: 11, Invalid Activity Count: 32
User ID: 12, Invalid Activity Count: 43
User ID: 13, Invalid Activity Count: 29
User ID: 14, Invalid Activity Count: 118
User ID: 15, Invalid Activity Count: 46
User ID: 16, Invalid Activity Count: 20
User ID: 17, Invalid Activity Count: 129
User ID: 18, Invalid Activity Count: 27
User ID: 19, Invalid Activity Count: 31
User ID: 20, Invalid Activity Count: 20
User ID: 21, Invalid Activity Count: 7
User ID: 22, Invalid Activity Count: 55
User ID: 23, Invalid Activity Count: 11
User ID: 24, Invalid Activity Count: 27
User I

In [36]:
# 10 
# Find the users who have tracked an activity in the Forbidden City of Beijing. 


# Relies on `activities_collection` from an earlier cell.

# Reference coordinates used for the Forbidden City and the tolerance (in
# degrees) applied around them on both axes.
FORBIDDEN_CITY_LAT = 39.916
FORBIDDEN_CITY_LON = 116.397
COORD_TOLERANCE = 0.001

lat_range = [FORBIDDEN_CITY_LAT - COORD_TOLERANCE, FORBIDDEN_CITY_LAT + COORD_TOLERANCE]
lon_range = [FORBIDDEN_CITY_LON - COORD_TOLERANCE, FORBIDDEN_CITY_LON + COORD_TOLERANCE]

# Aggregation pipeline to find unique users with activities in the specified coordinate range
pipeline = [
    {
        # $elemMatch requires a single trackpoint to satisfy both the latitude
        # and the longitude bounds.
        "$match": {
            "trackpoints": {
                "$elemMatch": {
                    "lat": {"$gte": lat_range[0], "$lte": lat_range[1]},
                    "lon": {"$gte": lon_range[0], "$lte": lon_range[1]}
                }
            }
        }
    },
    {
        # Only the activity id is needed from here on; dropping the embedded
        # trackpoints keeps them out of the $lookup and $unwind stages.
        "$project": {"_id": 1}
    },
    {
        # Attach every user whose activity_ids array contains this activity id
        "$lookup": {
            "from": "users",
            "localField": "_id",
            "foreignField": "activity_ids",
            "as": "user_data"
        }
    },
    {
        "$unwind": "$user_data"  # One document per (activity, user) pair
    },
    {
        "$group": {
            "_id": "$user_data._id"  # Collapse to one document per user
        }
    }
]

# Execute the pipeline
result = list(activities_collection.aggregate(pipeline))

# Display results
if result:
    print("Users who have tracked an activity near the Forbidden City:")
    for user in result:
        print(f"User ID: {user['_id']}")
else:
    print("No users have tracked activities near the Forbidden City.")




Users who have tracked an activity near the Forbidden City:
User ID: 131
User ID: 4
User ID: 19
User ID: 18
