In [None]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" look for every figure drawn below.
sns.set_style(style="whitegrid")

In [None]:
# Names given to the columns of the header-less, tab-separated index file.
# NOTE(review): the layout looks like a FASTA `.fai` index — confirm.
cols = ["id", "length", "offset", "line_bases", "line_width"]

# Lazy scan: no rows are read here; data is only materialised when
# `.collect()` is called on the resulting LazyFrame.
index_path = snakemake.input['index']
df = pl.scan_csv(
    index_path,
    separator="\t",
    has_header=False,
    new_columns=cols,
)

In [None]:
# Materialise only the `length` column and hand it over to pandas, then add
# its base-10 logarithm under the label that is later used as the x-axis.
# NOTE: `df` is rebound from the lazy polars frame to a pandas DataFrame here,
# so this cell cannot be re-run without first re-running the cell that
# creates the lazy frame.
lengths = df.select(pl.col("length")).collect().to_pandas()
lengths['$log_{10}(length)$'] = np.log10(lengths['length'])
df = lengths

In [None]:
# Histogram of log10(length) with a log-scaled count axis, written to the
# `fig_hist` output declared by the Snakemake rule.
fig = plt.figure(figsize=(5, 5), dpi=300)
ax = sns.histplot(data=df, x='$log_{10}(length)$', bins=50)
ax.set_ylabel('Number of Proteins')

# Summary statistics of the raw (untransformed) lengths, stacked in the
# upper-left corner; positions are axes-fraction coordinates.
length_stats = [
    (.9, f"Median: {df['length'].median():.2f}"),
    (.85, f"Q1: {df['length'].quantile(0.25):.2f}"),
    (.8, f"Q3: {df['length'].quantile(0.75):.2f}"),
]
for y_frac, label in length_stats:
    ax.text(.1, y_frac, label, transform=ax.transAxes)

ax.set_yscale('log')
fig.savefig(snakemake.output['fig_hist'], bbox_inches='tight', dpi=300, transparent=True)

In [None]:
# Empirical CDF of log10(length), annotated with the share of proteins that
# exceed a length cutoff, written to the `fig_cdf` output of the Snakemake rule.

# Single source of truth for the cutoff: it drives the label, the statistics
# and the position of the vertical marker, so they cannot drift apart.
length_cutoff = 1000
is_long = df['length'] > length_cutoff  # boolean mask, computed once

fig = plt.figure(figsize=(5, 5), dpi=300)
ax = sns.ecdfplot(data=df, x='$log_{10}(length)$')
ax.set_ylabel('Proportion of Proteins')

# Percentage and count of proteins longer than the cutoff.
# The percent sign is spelled "\\%" so the string carries a literal backslash
# for mathtext; the bare "\%" form is an invalid Python escape sequence that
# raises a SyntaxWarning on Python 3.12+.
annotation = (
    f"$Length > {length_cutoff}$ \n"
    f"${100 * is_long.mean():.2f}\\%$ \n"
    f"$n={int(is_long.sum())}$"
)
ax.text(.7, .6, annotation, transform=ax.transAxes)

# Mark the cutoff on the log10-scaled x-axis.
ax.axvline(np.log10(length_cutoff), color='red', linestyle='--')
fig.savefig(snakemake.output['fig_cdf'], bbox_inches='tight', dpi=300, transparent=True)