## Imports

In [46]:
import base64
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import yaml
import os
import yt_dlp


from dateutil import relativedelta
from github import Github

## microGalaxy GTN Youtube Videos Stats Extraction

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from googleapiclient.discovery import build
import json

# Function to fetch and parse the HTML content
def fetch_and_parse_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

# Function to extract table data and create a DataFrame
def extract_table_data(soup):
    tables = soup.find_all('table')
    
    all_rows = []
    for table in tables:
        headers = [header.text.strip() for header in table.find_all('th')]
        rows = []
        for row in table.find_all('tr')[1:]:
            row_data = [data.text.strip() for data in row.find_all('td')]
            rows.append(row_data)
        all_rows.extend(rows)
    
    df = pd.DataFrame(all_rows, columns=headers)
    return df

# Function to remove 'Hands-on: Hands-on: ' from Lesson column entries
def clean_lesson_column(df):
    df['Lesson'] = df['Lesson'].str.replace('Hands-on: Hands-on: ', '', regex=False)
    df['Lesson'] = df['Lesson'].str.split('\n').str[0]
    return df

# Function to create a boolean column indicating if 'Recordings' is present
def create_recordings_present_column(df):
    df['Recordings_Present'] = df['Recordings'].str.startswith('video\n')
    return df

# Function to get video details for each lesson with channel, title, and order check
def get_video_details(api_key, video_titles, channel_id='UCx3M4_d3M__lXf9OX8I5Kdg', cache_file='video_cache.json'):
    try:
        with open(cache_file, 'r') as file:
            cache = json.load(file)
    except FileNotFoundError:
        cache = {}

    youtube = build('youtube', 'v3', developerKey=api_key)
    video_details = {}

    for title in video_titles:
        if title in cache:
            video_details[title] = cache[title]
        else:
            # Print the query
            query = f'{title} GalaxyProject site:youtube.com'
            print(f'Query: {query}')

            # Execute the YouTube API request
            request = youtube.search().list(
                part='id,snippet',
                q=query,
                type='video',
                maxResults=1
            )
            response = request.execute()

            items = response.get('items', [])
            if items:
                snippet = items[0]['snippet']
                video_id = items[0]['id']['videoId']
                video_url = f'https://www.youtube.com/watch?v={video_id}'
               

                # Check if the video is from the specified channel and has a title exactly matching the Lesson entry
                if snippet['channelId'] == channel_id and title.lower() in snippet['title'].lower():
                    video_details[title] = {
                        'Video_Count': int(snippet.get('statistics', {}).get('viewCount', 0)),
                        'Published_Date': snippet.get('publishedAt', ''),
                        'Video_ID': video_id,
                        'Video_URL': video_url
                    }
                else:
                    # Video doesn't meet the criteria, set Video_Count to None
                    video_details[title] = {
                        'Video_Count': None,
                        'Published_Date': '',
                        'Video_ID': None,
                        'Video_URL': None
                    }

                # Update cache
                cache[title] = video_details[title]

    # Save updated cache to file
    with open(cache_file, 'w') as file:
        json.dump(cache, file)

    return video_details

# Main function to perform the tasks
def main():
    url = 'https://training.galaxyproject.org/training-material/tags/microgalaxy/embed.html'
    api_key = '' #'YOUR_YOUTUBE_API_KEY'

    # Fetch and parse HTML
    soup = fetch_and_parse_html(url)
    
    # Extract table data and create a DataFrame
    df = extract_table_data(soup)
    
    # Clean the 'Lesson' column
    df_cleaned_lesson = clean_lesson_column(df)
    
    # Create a boolean column indicating if 'Recordings' is present
    df_final = create_recordings_present_column(df_cleaned_lesson)
    
    # Get video details for each lesson with channel, title, and order check
    video_titles = df_final['Lesson'].tolist()
    video_details = get_video_details(api_key, video_titles)
    
    # Create new columns 'Video_Count', 'Published_Date', 'Video_ID', and 'Video_URL'
    df_final['Video_Count'] = df_final['Lesson'].apply(lambda title: video_details.get(title, {}).get('Video_Count'))
    df_final['Published_Date'] = df_final['Lesson'].apply(lambda title: video_details.get(title, {}).get('Published_Date'))
    df_final['Video_ID'] = df_final['Lesson'].apply(lambda title: video_details.get(title, {}).get('Video_ID'))
    df_final['Video_URL'] = df_final['Lesson'].apply(lambda title: video_details.get(title, {}).get('Video_URL'))
    
    # Drop duplicates based on the 'Lesson' column
    df_final = df_final.drop_duplicates(subset=['Lesson'])

    # Keep only the desired columns
    df_final = df_final[['Lesson', 'Recordings_Present', 'Video_Count', 'Published_Date', 'Video_ID', 'Video_URL']]
   
    # Display the final DataFrame
    display(df_final)

# Execute the main function
main()


Unnamed: 0,Lesson,Recordings_Present,Video_Count,Published_Date,Video_ID,Video_URL
0,Assembly of metagenomic sequencing data,False,25425,2018-06-08T19:01:37Z,,
1,Genome Assembly of a bacterial genome (MRSA) s...,False,11068,2018-02-03T11:26:24Z,,
2,Making sense of a newly assembled genome,False,4147,2021-09-14T10:00:12Z,,
3,Unicycler Assembly,False,522,2023-01-09T09:18:24Z,,
4,Identifying tuberculosis transmission links: f...,False,4408,2018-11-09T16:01:32Z,,
5,Tree thinking for tuberculosis evolution and e...,False,3841,2020-08-20T12:47:28Z,,
6,Bacterial Genome Annotation,False,6488,2012-03-09T10:52:48Z,,
7,Genome annotation with Prokka,True,7747,2021-01-01T13:27:30Z,,
8,Refining Genome Annotations with Apollo (proka...,True,219,2021-06-25T16:13:31Z,,
9,Comparative gene analysis in unannotated genomes,False,2118,2023-04-05T21:08:21Z,,


## microGalaxy GTN visitors and feedback (Plausible)

In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

# URL of the training page
training_url = "https://training.galaxyproject.org/training-material/tags/microgalaxy/embed.html"

# Send a GET request to the URL
response = requests.get(training_url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Extract lesson links
lesson_links = [a['href'] for a in soup.find_all('a', href=True)]

# Base URL for plausible
plausible_base_url = "https://plausible.galaxyproject.eu"

# Filter only the lesson links that end with "tutorial.html"
lesson_links = [link for link in lesson_links if link.endswith("tutorial.html")]

# Use a set to store unique plausible links
unique_plausible_links = set()

# Generate plausible links for the filtered lesson links and add to the set
for link in lesson_links:
    plausible_link = f"{plausible_base_url}/api/v1/stats/url?site_id=training.galaxyproject.org&period=all&page={quote(link)}&metrics=pageviews,visitors"
    unique_plausible_links.add(plausible_link)

# Replace "YOUR_PLAUSIBLE_API_KEY" with your actual Plausible API key
api_key = ""

# Fetch and print the metrics for each unique plausible link
for link in unique_plausible_links:
    # Properly encode the URL
    encoded_link = quote(link, safe=':/?&=')  # Adding more characters to the safe parameter

    # Make the request with the encoded URL
    response = requests.get(encoded_link, headers={"Authorization": f"Bearer {api_key}"})
    if response.status_code == 200:
        data = response.json()
        print(f"For {link}, Metrics: {data['results']}")
    else:
        print(f"Error fetching Plausible metrics for {link}: {response.status_code}")



Error fetching Plausible metrics for https://plausible.galaxyproject.eu/api/v1/stats/url?site_id=training.galaxyproject.org&period=all&page=/training-material/topics/genome-annotation/tutorials/bacterial-genome-annotation/tutorial.html&metrics=pageviews,visitors: 404
Error fetching Plausible metrics for https://plausible.galaxyproject.eu/api/v1/stats/url?site_id=training.galaxyproject.org&period=all&page=/training-material/topics/proteomics/tutorials/metaquantome-taxonomy/tutorial.html&metrics=pageviews,visitors: 404
Error fetching Plausible metrics for https://plausible.galaxyproject.eu/api/v1/stats/url?site_id=training.galaxyproject.org&period=all&page=/training-material/topics/assembly/tutorials/metagenomics-assembly/tutorial.html&metrics=pageviews,visitors: 404
Error fetching Plausible metrics for https://plausible.galaxyproject.eu/api/v1/stats/url?site_id=training.galaxyproject.org&period=all&page=/training-material/topics/proteomics/tutorials/metaquantome-function/tutorial.html&m

### Generating Plausible links for every training on GTN with microGalaxy tag

In [51]:
# URL of the training page
training_url = "https://training.galaxyproject.org/training-material/tags/microgalaxy/embed.html"

# Send a GET request to the URL
response = requests.get(training_url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Extract lesson links
lesson_links = [a['href'] for a in soup.find_all('a', href=True)]

# Base URL for plausible
plausible_base_url = "https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page="

# Filter only the lesson links that end with "tutorial.html"
lesson_links = [link for link in lesson_links if link.endswith("tutorial.html")]

# Use a set to store unique plausible links
unique_plausible_links = set()

# Generate plausible links for the filtered lesson links and add to the set
for link in lesson_links:
    plausible_link = plausible_base_url + quote(link)
    unique_plausible_links.add(plausible_link)

# Print the unique plausible links
for link in unique_plausible_links:
    print(link)

https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/genome-annotation/tutorials/apollo/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/microbiome/tutorials/mothur-miseq-sop/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/microbiome/tutorials/general-tutorial/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/genome-annotation/tutorials/bacterial-genome-annotation/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/assembly/tutorials/mrsa-illumina/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/proteomics/tutorials/metaquantome-taxonomy/tutorial.html
https://plausible.galaxyproject.eu/training.galax