# Table of Contents
 <p><div class="lev1"><a href="#Imports"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></div><div class="lev1"><a href="#Paths"><span class="toc-item-num">2&nbsp;&nbsp;</span>Paths</a></div><div class="lev1"><a href="#Functions"><span class="toc-item-num">3&nbsp;&nbsp;</span>Functions</a></div><div class="lev1"><a href="#Fooling-around"><span class="toc-item-num">4&nbsp;&nbsp;</span>Fooling around</a></div><div class="lev2"><a href="#How-many-base-pairs-do-we-get-with-various-end-lengths?"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>How many base-pairs do we get with various end lengths?</a></div>

# Imports

In [3]:
from pyfaidx import Fasta

import pandas as pd

# Paths

In [4]:
# Path to the scaffold-level assembly FASTA that the rest of the notebook reads.
# NOTE(review): this is an absolute path on a local mount, so the notebook will not
# run on another machine unless it is edited; consider a configurable data directory.
fas_ = "/run/media/gus/Storage/louise/data/genomes/glossina_fuscipes/assemblies/GfusI1/Glossina-fuscipes-IAEA_SCAFFOLDS_GfusI1.renamed.fa"

# Functions

In [15]:
def make_ends_bed_recs(fasta, window, excluded=None):
    """Yield BED text describing both ends of every sufficiently long scaffold.

    For each record in ``fasta.records`` whose sequence length is at least
    ``2 * window + 1``, one string containing two tab-separated,
    newline-terminated lines is yielded: the left end ``0 .. window`` and the
    right end ``length - window .. length``. The ``+ 1`` in the cut-off means
    the two end windows never touch or overlap.

    Shorter records are skipped: a notice is printed and, when ``excluded``
    is supplied, the record name is appended to it.

    Args:
        fasta: Object exposing a ``records`` mapping of name -> sequence,
            where each sequence supports ``len()`` (e.g. ``pyfaidx.Fasta``).
        window: Number of bases to take from each end of a scaffold.
        excluded: Optional list that collects the names of skipped records.
            It is mutated in place; omit it (or pass ``None``) if the names
            are not needed.

    Yields:
        str: ``"{chrom}\\t0\\t{window}\\n{chrom}\\t{length - window}\\t{length}\\n"``
        for each record that passes the length cut-off.
    """
    min_length = (window * 2) + 1

    for chrom, seq in fasta.records.items():
        seq_length = len(seq)

        if seq_length < min_length:
            # Too short to carry two separate end windows: record it and move on.
            if excluded is not None:
                excluded.append(chrom)
            print(" {chrom} with length {range} was excluded from bed.".format(chrom=chrom,
                                                                                 range=seq_length))
            continue

        # Coordinates are only needed for records that pass the cut-off.
        yield "{chrom}\t{startL}\t{endL}\n{chrom}\t{startR}\t{endR}\n".format(
            chrom=chrom,
            startL=0,
            endL=window,
            startR=seq_length - window,
            endR=seq_length,
        )

# Fooling around

In [16]:
# Open the scaffold FASTA with pyfaidx; later cells read names and lengths from `fas.records`.
fas = Fasta(fas_)

In [17]:
# Number of records (scaffolds) in the FASTA.
len(fas.records)

2395

In [18]:
# for k in list(fas.keys())[:10]:
#     print("{key}: {leng}".format(key=k, leng=len(fas[k][:].seq)))

In [28]:
# Fraction of records that were excluded: 1200 is the len(excluded) result and
# 2395 the len(fas.records) result displayed in this notebook.
# NOTE(review): these literals are copied by hand and go stale if the window or
# the assembly changes.
1200/2395

0.5010438413361169

In [27]:
# NOTE(review): `excluded` is not assigned in any visible cell. It is presumably the
# list that was passed to make_ends_bed_recs in a cell that is no longer present,
# so this cell depends on leftover kernel state — confirm and restore that cell.
len(excluded)

1200

In [10]:
# NOTE(review): the saved output looks stale. This cell's execution count (In [10])
# predates the function definition cell (In [15]), and its 5-element result does
# not match the len(excluded) == 1200 displayed elsewhere in the notebook.
excluded

['Scaffold1311',
 'JFJR01012682',
 'JFJR01013424',
 'JFJR01012647',
 'JFJR01013313']

In [24]:
# Peek at the first record only: print its name and sequence length, then stop.
for name, seq in fas.records.items():
    seq_range = len(seq)
    print("{} {}".format(name, seq_range))
    break

Scaffold607 162067


In [13]:
# NOTE(review): `x` is not defined in any visible cell (leftover kernel state from
# an earlier experiment); unless it is defined elsewhere, this cell fails on a
# fresh kernel and should be removed.
x

'Scaffold607'

## How many base-pairs do we get with various end lengths?

In [29]:
def scaff_lengths(fasta):
    """Return a dict mapping each record name in ``fasta.records`` to its sequence length."""
    return {name: len(seq) for name, seq in fasta.records.items()}

In [30]:
# Per-record sequence lengths, keyed by record name.
lengths = scaff_lengths(fas)

In [31]:
# Sanity check: there should be one length entry per FASTA record.
len(lengths)

2395

In [34]:
# Total assembly size: the sum of all record lengths.
total_bp = sum(lengths.values())
total_bp

374774708

In [38]:
# Total length of the records that are long enough for a 10,000-base end window,
# using the same cut-off as make_ends_bed_recs (length >= 2 * window + 1).
window_10k = 10000
win10K_bp = sum(
    len(seq) for seq in fas.records.values() if len(seq) >= window_10k * 2 + 1
)
win10K_bp

370538898

In [40]:
# Fraction of the total assembly length contained in records that meet the
# 10,000-base window cut-off (length >= 2 * 10000 + 1).
win10K_bp / total_bp

0.9886977164958528