In [11]:
# Task 1: How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)?

from DbConnector import DbConnector

# Task 1 body: report how many users, activities and trackpoints the database holds.
#
# Step 1: Connect to the database
# NOTE(review): the credentials are hardcoded in the notebook; consider reading
# them from environment variables before sharing or committing this file.
db_connector = DbConnector(DATABASE='strava_mongoDB', HOST="localhost", USER="admin", PASSWORD="admin123")

# try/finally guarantees the connection is released even if a query below raises.
try:
    db = db_connector.db

    # Collections
    users_collection = db['users']
    activities_collection = db['activities']

    # Step 2: Count the number of users
    user_count = users_collection.count_documents({})
    print(f"Total number of users: {user_count}")

    # Step 3: Count the number of activities
    activity_count = activities_collection.count_documents({})
    print(f"Total number of activities: {activity_count}")

    # Step 4: Count the total number of trackpoints across all activities.
    # Trackpoints are embedded in each activity document, so we add up the
    # length of every `trackpoints` array. Using $size per document avoids
    # $unwind, which would emit one intermediate document per trackpoint
    # just to count them.
    pipeline = [
        {
            "$group": {
                "_id": None,
                "totalTrackpoints": {
                    "$sum": {
                        # $size raises on missing / non-array values, so those
                        # documents are counted as contributing 0 trackpoints.
                        "$cond": [
                            {"$isArray": "$trackpoints"},
                            {"$size": "$trackpoints"},
                            0,
                        ]
                    }
                },
            }
        }
    ]

    trackpoint_count_result = list(activities_collection.aggregate(pipeline))

    # The aggregation yields no document at all when the collection is empty.
    trackpoint_count = trackpoint_count_result[0]['totalTrackpoints'] if trackpoint_count_result else 0
    print(f"Total number of trackpoints: {trackpoint_count}")
finally:
    # Step 5: Close the connection
    db_connector.close_connection()

You are connected to the database: strava_mongoDB
-----------------------------------------------

Total number of users: 182
Total number of activities: 16048
Total number of trackpoints: 9681756

-----------------------------------------------
Connection to strava_mongoDB-db is closed


In [14]:
#Task 2: Find the average number of activities per user

from DbConnector import DbConnector

# Task 2 body: compute the average number of activities per user.
#
# Step 1: Connect to the database
# NOTE(review): the credentials are hardcoded in the notebook; consider reading
# them from environment variables before sharing or committing this file.
db_connector = DbConnector(DATABASE='strava_mongoDB', HOST="localhost", USER="admin", PASSWORD="admin123")

# try/finally guarantees the connection is released even if the aggregation raises.
try:
    db = db_connector.db

    # Collections
    users_collection = db['users']

    # Step 2: Retrieve the total number of activities for each user
    pipeline = [
        {
            "$project": {
                # Length of the user's activity_ids array. $ifNull substitutes an
                # empty array when the field is missing or null, so such users
                # count as having 0 activities instead of making $size fail.
                "num_activities": {"$size": {"$ifNull": ["$activity_ids", []]}}
            }
        },
        {
            "$group": {
                "_id": None,
                "total_activities": {"$sum": "$num_activities"},  # Sum up the total number of activities
                "total_users": {"$sum": 1}  # Count the total number of users
            }
        }
    ]

    # Step 3: Run the aggregation pipeline
    result = list(users_collection.aggregate(pipeline))

    # Step 4: Calculate the average number of activities per user.
    # `result` is empty when the users collection has no documents; the
    # total_users check additionally protects the division below.
    if result and result[0]['total_users'] > 0:
        total_activities = result[0]['total_activities']
        total_users = result[0]['total_users']
        average_activities_per_user = total_activities / total_users
        print(f"Average number of activities per user: {average_activities_per_user:.2f}")
    else:
        print("No users found or no activities data.")
finally:
    # Step 5: Close the connection
    db_connector.close_connection()



You are connected to the database: strava_mongoDB
-----------------------------------------------

Average number of activities per user: 88.18

-----------------------------------------------
Connection to strava_mongoDB-db is closed
yolo


In [16]:
#Task 3: 
# Find the top 20 users with the highest number of activities. 

# Ensure this part is included at the start of your code
from DbConnector import DbConnector

# Task 3 body: list the users with the most activities.
#
# Step 1: Use DbConnector to connect to the database
# NOTE(review): the credentials are hardcoded in the notebook; consider reading
# them from environment variables before sharing or committing this file.
db_connector = DbConnector(DATABASE='strava_mongoDB', HOST="localhost", USER="admin", PASSWORD="admin123")

# Collections
# NOTE: this connection is deliberately left open. The Task 4 and Task 5 cells
# reuse `users_collection` / `activities_collection` without reconnecting, so
# closing it here would break them. Call db_connector.close_connection() once
# the last cell that needs the database has run.
db = db_connector.db
users_collection = db['users']
activities_collection = db['activities']

# How many users to report.
TOP_USER_LIMIT = 20

# Step 2: Find the top users with the highest number of activities
pipeline = [
    {
        "$project": {
            "_id": 1,  # Include user id in the result
            # Number of activities per user; $ifNull treats a missing or null
            # activity_ids field as an empty array so $size cannot fail.
            "num_activities": {"$size": {"$ifNull": ["$activity_ids", []]}}
        }
    },
    {
        # Most activities first; _id breaks ties so the ranking (and which
        # users make the cut at the limit) is the same on every run.
        "$sort": {"num_activities": -1, "_id": 1}
    },
    {
        "$limit": TOP_USER_LIMIT  # Keep only the top users
    }
]

# Run the aggregation query
top_users = list(users_collection.aggregate(pipeline))

# Display the top users with the highest number of activities
for user in top_users:
    print(f"User ID: {user['_id']}, Number of Activities: {user['num_activities']}")





You are connected to the database: strava_mongoDB
-----------------------------------------------

User ID: 128, Number of Activities: 2102
User ID: 153, Number of Activities: 1793
User ID: 25, Number of Activities: 715
User ID: 163, Number of Activities: 704
User ID: 62, Number of Activities: 691
User ID: 144, Number of Activities: 563
User ID: 41, Number of Activities: 399
User ID: 85, Number of Activities: 364
User ID: 4, Number of Activities: 346
User ID: 140, Number of Activities: 345
User ID: 167, Number of Activities: 320
User ID: 68, Number of Activities: 280
User ID: 17, Number of Activities: 265
User ID: 3, Number of Activities: 261
User ID: 14, Number of Activities: 236
User ID: 126, Number of Activities: 215
User ID: 30, Number of Activities: 210
User ID: 112, Number of Activities: 208
User ID: 11, Number of Activities: 201
User ID: 39, Number of Activities: 198
heihei


In [17]:
# Task 4: 
# Find all users who have taken a taxi



# Task 4 body: list every user that has at least one activity labelled "taxi".
# Relies on `activities_collection` and `users_collection` created in an
# earlier cell, with the database connection still open.

# Step 1: Query the activities collection to find activities where transportation_mode is "taxi"
# (only the _id is fetched; nothing else from the activity is needed)
taxi_activities = activities_collection.find({"transportation_mode": "taxi"}, {"_id": 1})

# Step 2: Extract the activity IDs of taxi activities
taxi_activity_ids = [activity["_id"] for activity in taxi_activities]

# Step 3: Query the users collection to find users whose activity_ids include any taxi activity IDs.
# Only _id is projected because that is all we print -- this avoids transferring
# each user's full activity_ids array. Sorting by _id makes the output order explicit.
taxi_users = users_collection.find(
    {"activity_ids": {"$in": taxi_activity_ids}},
    {"_id": 1},
).sort("_id", 1)

# Step 4: Display the user IDs of users who have taken a taxi
print("Users who have taken a taxi:")
for user in taxi_users:
    print(f"User ID: {user['_id']}")


Users who have taken a taxi:
User ID: 10
User ID: 21
User ID: 52
User ID: 56
User ID: 58
User ID: 62
User ID: 65
User ID: 78
User ID: 80
User ID: 84
User ID: 85
User ID: 98
User ID: 111
User ID: 114
User ID: 126
User ID: 128
User ID: 139
User ID: 153
User ID: 163
User ID: 167
User ID: 175


In [18]:
# Task 5:

# Find all types of transportation modes and count how many activities that are
# tagged with these transportation mode labels. Do not count the rows where
# the mode is null.



# Task 5 body: count activities per transportation mode, skipping null modes.
# Relies on `activities_collection` created in an earlier cell.

# Each stage is named on its own so the pipeline reads as a list of steps.

# Keep only activities whose transportation_mode is not null.
exclude_null_modes = {"$match": {"transportation_mode": {"$ne": None}}}

# One output document per mode, carrying the number of activities with that mode.
count_per_mode = {
    "$group": {
        "_id": "$transportation_mode",
        "count": {"$sum": 1},
    }
}

# Optional ordering: most common modes first.
most_common_first = {"$sort": {"count": -1}}

pipeline = [exclude_null_modes, count_per_mode, most_common_first]

# Execute the pipeline and materialise the result as a list.
transportation_modes = list(activities_collection.aggregate(pipeline))

# Report every mode together with its activity count.
print("Transportation modes and their activity counts:")
for mode in transportation_modes:
    print(f"Mode: {mode['_id']}, Count: {mode['count']}")




Transportation modes and their activity counts:
Mode: walk, Count: 1008
Mode: bike, Count: 619
Mode: car, Count: 493
Mode: bus, Count: 471
Mode: subway, Count: 190
Mode: taxi, Count: 125
Mode: airplane, Count: 4
Mode: train, Count: 2
Mode: run, Count: 1
Mode: boat, Count: 1


In [23]:
# 6a
# Find the year with the most activities








NameError: name 'activities' is not defined