## Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
import json
from dateutil import relativedelta
from github import Github
import requests
from urllib.parse import quote
import os
import zipfile
from io import BytesIO
from datetime import datetime

## Important Links and APIs

In [15]:
training_url = 'https://training.galaxyproject.org/training-material/tags/microgalaxy/embed.html'
training_material_api = ''
youtube_api_key = '' #'YOUR_YOUTUBE_API_KEY'
plausible_api_key = ""  #'YOUR_PLAUSIBLE_API_KEY'


## microGalaxy GTN Youtube Videos Stats Extraction

### Functions extracting microGalaxy training material names, information and plausible links

In [18]:
# Function to fetch and parse the HTML content
def fetch_and_parse_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

# Function to extract table data and create a DataFrame
def extract_table_data(soup):
    tables = soup.find_all('table')
    
    all_rows = []
    for table in tables:
        headers = [header.text.strip() for header in table.find_all('th')]
        rows = []
        for row in table.find_all('tr')[1:]:
            row_data = [data.text.strip() for data in row.find_all('td')]
            rows.append(row_data)
        all_rows.extend(rows)
    
    df = pd.DataFrame(all_rows, columns=headers)
    return df

# Function to remove 'Hands-on: Hands-on: ' from Lesson column entries
def clean_lesson_column(df):
    df['Lesson'] = df['Lesson'].str.replace('Hands-on: Hands-on: ', '', regex=False)
    df['Lesson'] = df['Lesson'].str.split('\n').str[0]
    return df

# Function to create a boolean column indicating if 'Recordings' is present
def create_recordings_present_column(df):
    df['Recordings_Present'] = df['Recordings'].str.startswith('video\n')
    return df

### Functions to extract youtube stats, checking the GalaxyProject channel, for every microGalaxy training and return the video date and number of views

In [19]:
# Function to get video details for each lesson with channel, title, and order check
def get_video_details(api_key, video_titles, channel_id='UCx3M4_d3M__lXf9OX8I5Kdg', cache_file='video_cache.json'):
    try:
        with open(cache_file, 'r') as file:
            cache = json.load(file)
    except FileNotFoundError:
        cache = {}

    youtube = build('youtube', 'v3', developerKey=api_key)
    video_details = {}

    for title in video_titles:
        if title in cache:
            video_details[title] = cache[title]
        else:
            # Print the query
            query = f'{title} GalaxyProject site:youtube.com'
            print(f'Query: {query}')

            # Execute the YouTube API request
            request = youtube.search().list(
                part='id,snippet',
                q=query,
                type='video',
                maxResults=1
            )
            response = request.execute()

            items = response.get('items', [])
            if items:
                snippet = items[0]['snippet']
                video_id = items[0]['id']['videoId']
                video_url = f'https://www.youtube.com/watch?v={video_id}'
               

                # Check if the video is from the specified channel and has a title exactly matching the Lesson entry
                if snippet['channelId'] == channel_id and title.lower() in snippet['title'].lower():
                    video_details[title] = {
                        'Video_Count': int(snippet.get('statistics', {}).get('viewCount', 0)),
                        'Published_Date': snippet.get('publishedAt', ''),
                        'Video_ID': video_id,
                        'Video_URL': video_url
                    }
                else:
                    # Video doesn't meet the criteria, set Video_Count to None
                    video_details[title] = {
                        'Video_Count': None,
                        'Published_Date': '',
                        'Video_ID': None,
                        'Video_URL': None
                    }

                # Update cache
                cache[title] = video_details[title]

    # Save updated cache to file
    with open(cache_file, 'w') as file:
        json.dump(cache, file)

    return video_details

# Adding youtube videos statistics
def creating_trainings_videos_stats():

    # Fetch and parse HTML
    soup = fetch_and_parse_html(training_url)
    
    # Extract table data and create a DataFrame
    df = extract_table_data(soup)
    
    # Clean the 'Lesson' column
    df_cleaned_lesson = clean_lesson_column(df)
    
    # Create a boolean column indicating if 'Recordings' is present
    df_final = create_recordings_present_column(df_cleaned_lesson)
    
    # Get video details for each lesson with channel, title, and order check
    video_titles = df_final['Lesson'].tolist()
    video_details = get_video_details(youtube_api_key, video_titles)
    
    # Create new columns 'Video_Count', 'Published_Date', 'Video_ID', and 'Video_URL'
    df_final['Video_Count'] = df_final['Lesson'].apply(lambda title: video_details.get(title, {}).get('Video_Count'))
    df_final['Published_Date'] = df_final['Lesson'].apply(lambda title: video_details.get(title, {}).get('Published_Date'))
    df_final['Video_ID'] = df_final['Lesson'].apply(lambda title: video_details.get(title, {}).get('Video_ID'))
    df_final['Video_URL'] = df_final['Lesson'].apply(lambda title: video_details.get(title, {}).get('Video_URL'))
    
    # Drop duplicates based on the 'Lesson' column
    df_final = df_final.drop_duplicates(subset=['Lesson'])

    # Keep only the desired columns
    df_final = df_final[['Lesson', 'Recordings_Present', 'Video_Count', 'Published_Date', 'Video_ID', 'Video_URL']]
   
    # Display the final DataFrame
    display(df_final)

# Creating training videos statisitics table
creating_trainings_videos_stats()


Query: QIIME 2 Cancer Microbiome Intervention external-link GalaxyProject site:youtube.com


Unnamed: 0,Lesson,Recordings_Present,Video_Count,Published_Date,Video_ID,Video_URL
0,Assembly of metagenomic sequencing data,False,,,,
1,Genome Assembly of a bacterial genome (MRSA) s...,False,,,,
2,Making sense of a newly assembled genome,False,,,,
3,Unicycler Assembly,False,,,,
4,Identifying tuberculosis transmission links: f...,False,,,,
5,Tree thinking for tuberculosis evolution and e...,False,,,,
6,Bacterial Genome Annotation,False,,,,
7,Genome annotation with Prokka,True,,,,
8,Refining Genome Annotations with Apollo (proka...,True,,,,
9,Comparative gene analysis in unannotated genomes,False,,,,


## microGalaxy GTN visitors and feedback (Plausible)

In [21]:
# Get the current date
current_date = datetime.now().strftime('%Y-%m-%d')

# Send a GET request to the URL
response = requests.get(training_url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Extract lesson links
lesson_links = [a['href'] for a in soup.find_all('a', href=True)]

# Base URL for plausible
plausible_base_url = "https://plausible.galaxyproject.eu"

# Filter only the lesson links that end with "tutorial.html"
lesson_links = [link for link in lesson_links if link.endswith("tutorial.html")]

# Dictionary to store plausible download links for each lesson
plausible_download_links = {}

# Rest of the code remains unchanged...

# Generate plausible download links for the filtered lesson links using the current date
for link in lesson_links:
    plausible_download_link = f"{plausible_base_url}/training.galaxyproject.org/export?period=all&date={current_date}&filters=%7B%22page%22%3A%22{quote(link)}%22%7D&with_imported=true&interval=month"
    plausible_download_links[link] = plausible_download_link


# Directory to store downloaded zip files
download_directory = "plausible_zips"

# Create the directory if it doesn't exist
os.makedirs(download_directory, exist_ok=True)

# Iterate through each lesson and download the plausible zip file
for lesson, download_link in plausible_download_links.items():
    # Properly encode the URL
    encoded_link = quote(download_link, safe=':/?&=')

    # Specify cache control headers to avoid caching
    headers = {
        "Authorization": f"Bearer {plausible_api_key}",
        "Cache-Control": "no-cache, no-store, must-revalidate",
        "Pragma": "no-cache",
        "Expires": "0"
    }

    # Make the request with the encoded URL and cache control headers
    response = requests.get(encoded_link, headers=headers)
    
    if response.status_code == 200:
        # Generate a unique name for the zip file
        zip_filename = f"{lesson.replace('/', '_')}_{current_date}_plausible.zip"
        zip_filepath = os.path.join(download_directory, zip_filename)
        
        # Save the zip file with the unique filename
        with open(zip_filepath, 'wb') as zip_file:
            zip_file.write(response.content)
        
        # Extract visitors.csv from the zip file
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            # Get the name of the visitors.csv file in the zip
            csv_name = [name for name in zip_ref.namelist() if name.lower().endswith('visitors.csv')][0]
            
            # Generate a new name for the visitors.csv file
            new_csv_name = f"{lesson.replace('/', '_')}_{current_date}_visitors.csv"
            
            # Extract the visitors.csv file with the new name
            zip_ref.extract(csv_name, download_directory)
            
            # Rename the visitors.csv file
            os.rename(os.path.join(download_directory, csv_name), os.path.join(download_directory, new_csv_name))
            
            # Read visitors.csv into a DataFrame
            visitors_df = pd.read_csv(os.path.join(download_directory, new_csv_name))
        
        # Print the number of visitors for the lesson
        print(f"For {lesson}, Number of Visitors: {visitors_df['visitors'].sum()}")
    else:
        print(f"Error fetching Plausible metrics for {lesson}: {response.status_code}")

For /training-material/topics/assembly/tutorials/metagenomics-assembly/tutorial.html, Number of Visitors: 1795696
For /training-material/topics/assembly/tutorials/mrsa-illumina/tutorial.html, Number of Visitors: 1795696
For /training-material/topics/assembly/tutorials/ecoli_comparison/tutorial.html, Number of Visitors: 1795696
For /training-material/topics/assembly/tutorials/unicycler-assembly/tutorial.html, Number of Visitors: 1795696
For /training-material/topics/evolution/tutorials/mtb_transmission/tutorial.html, Number of Visitors: 1795696
For /training-material/topics/evolution/tutorials/mtb_phylogeny/tutorial.html, Number of Visitors: 1795697
For /training-material/topics/genome-annotation/tutorials/bacterial-genome-annotation/tutorial.html, Number of Visitors: 1795697
For /training-material/topics/genome-annotation/tutorials/annotation-with-prokka/tutorial.html, Number of Visitors: 1795697
For /training-material/topics/genome-annotation/tutorials/apollo/tutorial.html, Number of 

### Generating Plausible links for every training on GTN with microGalaxy tag

In [51]:
# URL of the training page
training_url = "https://training.galaxyproject.org/training-material/tags/microgalaxy/embed.html"

# Send a GET request to the URL
response = requests.get(training_url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Extract lesson links
lesson_links = [a['href'] for a in soup.find_all('a', href=True)]

# Base URL for plausible
plausible_base_url = "https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page="

# Filter only the lesson links that end with "tutorial.html"
lesson_links = [link for link in lesson_links if link.endswith("tutorial.html")]

# Use a set to store unique plausible links
unique_plausible_links = set()

# Generate plausible links for the filtered lesson links and add to the set
for link in lesson_links:
    plausible_link = plausible_base_url + quote(link)
    unique_plausible_links.add(plausible_link)

# Print the unique plausible links
for link in unique_plausible_links:
    print(link)

https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/genome-annotation/tutorials/apollo/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/microbiome/tutorials/mothur-miseq-sop/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/microbiome/tutorials/general-tutorial/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/genome-annotation/tutorials/bacterial-genome-annotation/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/assembly/tutorials/mrsa-illumina/tutorial.html
https://plausible.galaxyproject.eu/training.galaxyproject.org?period=all&page=/training-material/topics/proteomics/tutorials/metaquantome-taxonomy/tutorial.html
https://plausible.galaxyproject.eu/training.galax