## UniProt Statistics

In [85]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib.gridspec import GridSpec
from textwrap import wrap


def format_number(num):
    """Abbreviate a count with a K/M/B suffix for compact plot labels.

    Magnitudes of at least 1e3, 1e6 or 1e9 are divided by that amount and
    suffixed with 'K', 'M' or 'B'. The scaled value keeps one decimal place
    while it displays below 100 and none from 100 upwards. Magnitudes under
    1000 are formatted as-is with no suffix.

    Args:
        num: Integer or float to format. Negative values are abbreviated by
            magnitude and keep their sign.

    Returns:
        The abbreviated string, e.g. ``'1.6M'``, ``'853K'`` or ``'42'``.
    """
    units = [('B', 1e9), ('M', 1e6), ('K', 1e3)]
    sign = '-' if num < 0 else ''
    magnitude = abs(num)
    for position, (unit, divisor) in enumerate(units):
        if magnitude < divisor:
            continue
        scaled = magnitude / divisor
        # Choose the precision from the rounded value so 99.96 prints as "100", not "100.0".
        if round(scaled, 1) >= 100:
            text = f"{scaled:.0f}"
        else:
            text = f"{scaled:.1f}"
        # Rounding can reach 1000 (e.g. 999_950 -> "1000K"); roll over to the next larger unit.
        if position > 0 and float(text) >= 1000:
            return f"{sign}1.0{units[position - 1][0]}"
        return f"{sign}{text}{unit}"
    return f"{num}"

def format_tick_label(label, chars_per_line=20):
    """Break a tick label into several lines of limited width.

    Args:
        label: Text of the tick label.
        chars_per_line: Maximum line width handed to ``textwrap.wrap``.

    Returns:
        The wrapped lines joined with newline characters.
    """
    pieces = wrap(label, width=chars_per_line)
    line_break = '\n'
    return line_break.join(pieces)

# Hard-coded counts feeding the four UniProt panels

# Entry counts for the ten organisms drawn in panel (a), listed largest first
species_entries = dict([
    ('Chloroflexota bacterium', 1_649_388),
    ('Gammaproteobacteria bacterium', 1_571_227),
    ('Acidobacteriota bacterium', 1_301_645),
    ('Deltaproteobacteria bacterium', 1_180_560),
    ('Human immunodeficiency virus 1', 1_107_809),
    ('Actinomycetes bacterium', 853_379),
    ('Alphaproteobacteria bacterium', 799_851),
    ('Planctomycetota bacterium', 725_306),
    ('marine sediment metagenome', 668_601),
    ('Verrucomicrobiota bacterium', 640_596),
])

# TrEMBL vs SwissProt counts drawn in panel (b)
# NOTE(review): the variable name says "species" while the panel title says
# "Entries" -- confirm which quantity these two numbers represent.
species_data = dict(TrEMBL=1_312_332, SwissProt=14_652)

# Sequence counts per taxonomic group, drawn in panel (c)
species_sequences = dict(
    Eukaryota=79_048_440,
    Archaea=6_676_975,
    Viruses=5_689_440,
    Bacteria=152_054_072,
    Other=2_427_836,
)

# Counts per protein-existence evidence level, drawn in panel (d)
protein_existence = dict([
    ('Protein level', 391_016),
    ('Transcript level', 1_416_119),
    ('Homology', 88_949_322),
    ('Predicted', 155_138_483),
    ('Uncertain', 1_826),
])

# Global styling: seaborn's whitegrid theme and a 12 pt base font
sns.set(style="whitegrid")
plt.rcParams['font.size'] = 12

# 20x10 inch canvas on a 3-row, 2-column grid with equal row and column ratios
fig = plt.figure(figsize=(20, 10))
gs = GridSpec(nrows=3, ncols=2, figure=fig, width_ratios=[1, 1], height_ratios=[1, 1, 1])

# Panel (a) spans the whole left column; panels (b)-(d) stack top to bottom on the right
ax1 = fig.add_subplot(gs[:, 0])
ax2, ax3, ax4 = (fig.add_subplot(gs[row, 1]) for row in range(3))

fig.suptitle('UniProt Statistics', fontsize=24)

# --- a. Horizontal bar plot: one bar per organism in species_entries ---
species = list(species_entries)
values = [species_entries[name] for name in species]
y_pos = np.arange(len(species))

sns.barplot(x=values, y=species, ax=ax1, color='skyblue')
ax1.set_title('a. Number of Top 10 Entries by Species', fontsize=20)
ax1.set_xlabel('Number of Entries', fontsize=16)
ax1.set_ylabel('')
# Fixed upper limit leaves room to the right of the longest bar for its value label
ax1.set_xlim(0, 1_800_000)
ax1.tick_params(labelsize=14)

# Replace the y tick labels with line-wrapped organism names
formatted_species = list(map(format_tick_label, species))
ax1.set_yticks(y_pos)
ax1.set_yticklabels(formatted_species)

# Write the abbreviated count just past the end of each bar
for i, v in enumerate(values):
    ax1.text(v, i, f' {format_number(v)}', va='center', fontsize=14)

# Function to create a single stacked horizontal bar plot
def create_single_stacked_bar(ax, data, title):
    """Render a mapping of counts as one stacked horizontal bar spanning 0-100 %.

    Each category's share of the total becomes a segment; segments are laid
    out left to right from the largest share to the smallest. Ticks, axis
    labels and spines are removed, and a two-column legend listing every
    category with its percentage and abbreviated count is added to the axes.

    Args:
        ax: Matplotlib axes to draw on.
        data: Mapping of category name -> numeric count.
        title: Title shown above the bar.
    """
    table = pd.DataFrame(list(data.items()), columns=['Category', 'Value'])
    table['Percentage'] = table['Value'] / table['Value'].sum() * 100
    table = table.sort_values('Percentage', ascending=False)

    # Each segment starts where the previous ones end: running total minus its own width
    segment_widths = table['Percentage']
    segment_starts = segment_widths.cumsum() - segment_widths
    palette = sns.color_palette("husl", n_colors=len(table))
    ax.barh(y=0.75, width=segment_widths, left=segment_starts, height=0.3, color=palette)

    ax.set_title(title, fontsize=20)
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 1)

    # Strip ticks and axis labels; the legend carries all the information
    for clear_ticks in (ax.set_yticks, ax.set_xticks):
        clear_ticks([])
    for clear_label in (ax.set_xlabel, ax.set_ylabel):
        clear_label('')

    # Hide the frame around the panel
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    # One legend entry per segment: "name: share% (abbreviated count)"
    legend_labels = [
        f"{category}: {share:.1f}% ({format_number(count)})"
        for category, count, share in zip(table['Category'], table['Value'], table['Percentage'])
    ]
    legend_elements = [plt.Rectangle((0, 0), 1, 1, facecolor=swatch) for swatch in palette]
    ax.legend(legend_elements, legend_labels, loc='center',
              bbox_to_anchor=(0.5, 0.3), ncol=2, fontsize=16)

# --- Panels b-d: one single stacked horizontal bar each, right column top to bottom ---
right_column_panels = (
    (ax2, species_data, 'b. Distribution of Entries: TrEMBL vs SwissProt'),
    (ax3, species_sequences, 'c. Distribution of Species Sequences'),
    (ax4, protein_existence, 'd. Protein Existence Evidence'),
)
for panel_args in right_column_panels:
    create_single_stacked_bar(*panel_args)

# Tighten the spacing, write the figure to disk at 600 dpi, then close the current figure
plt.tight_layout()
plt.savefig('uniprot_stats.png', dpi=600, bbox_inches='tight')
plt.close()

print("Combined plot with new layout has been generated and saved as 'uniprot_stats.png'.")

Combined plot with new layout has been generated and saved as 'uniprot_stats.png'.


In [89]:
# Ratio of the summed species_entries counts (the ten listed organisms)
# to the summed species_sequences counts (all taxonomic groups)
sum(species_entries.values()) / sum(species_sequences.values())

0.04269418544562134

## PDB Statistics

In [136]:
import matplotlib.pyplot as plt
import numpy as np

# Structure counts per experimental method (hard-coded)
labels = ['X-ray', 'EM', 'NMR', 'Multiple methods', 'Neutron', 'Other']
sizes = [187055, 21796, 14326, 236, 82, 37]

# Order the categories from the largest count to the smallest
sorted_indices = np.argsort(sizes)[::-1]
sorted_sizes = [sizes[i] for i in sorted_indices]

# Legend text carries the raw count and the share of the total;
# the total is computed once, outside the comprehension
total_size = sum(sizes)
sorted_labels = [f'{labels[i]} ({sizes[i]:,} - {sizes[i]/total_size*100:.1f}%)' for i in sorted_indices]

# Draw the pie without wedge labels; the legend identifies each wedge
# TODO (from the original author): choose better colors
plt.figure(figsize=(8.5, 6))
plt.pie(sorted_sizes, labels=None, startangle=90,
        colors=plt.cm.tab20.colors, wedgeprops={'edgecolor': 'none'})

# Legend entries are listed in the same descending order as the wedges.
# The categories are experimental methods, so the legend and title say so.
plt.legend(sorted_labels, title="Experimental Method", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1), title_fontsize=18, fontsize=16)

plt.title('Protein Structures by Experimental Method in PDB Data', fontsize=20, ha='center')

# Fit the axes into the left 85% / lower 75% of the figure, then save and close the figure
plt.tight_layout(rect=[0, 0, 0.85, .75])
plt.savefig('PDB_statistics.png', dpi=300, bbox_inches='tight')
plt.close()