In [30]:
import pandas as pd
from collections import Counter

Loading DataFrame

In [2]:
DF_FILE_NAME = '../data/STR_Arabidopsis.dataframe.annotated.tsv'
header = ['GENE', 'START', 'END', 'ACCESSION', 'DEPTH', 'UNIT', 'UNIT_N', 'REF', 'ALTVAR1', 'LEN_ALTVAR1', 'ALTVAR2', 'LEN_ALTVAR2']

STRs = pd.read_csv(DF_FILE_NAME, sep = '\t', na_filter = False,
                     header = None, index_col = None, 
                     names = header)

STRs = STRs.reset_index().rename(columns = {'level_0' : 'CHR'}).drop(['level_1'], axis = 1)

Loading in metadata from SRA (https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP056687) and table at http://1001genomes.org/accessions.html

In [3]:
AccList = pd.read_csv("../data/AccListFrom1001Arabidopsis.csv")
AccList.rename(columns={'tg_ecotypeid': 'Ecotype'}, inplace=True)

SraRunTable = pd.read_csv("../data/SraRunTable.txt", sep = "\t")
SraRunTable.rename(columns={'ecotype_s': 'Ecotype'}, inplace=True)

AccListSraRunTable = pd.merge(AccList, SraRunTable)

Adding 'CHR_START' column as all positions are not unique

In [4]:
start_chrom = []
for i in STRs[['START', 'CHR']].drop_duplicates().itertuples():
    start_chrom.append(i.CHR + '_' + str(i.START))
    
frame = STRs[['START', 'CHR']].drop_duplicates()
frame['CHR_START'] = start_chrom

STRs = pd.merge(frame, STRs)

Filtering for depth

In [34]:
STRs_depth5 = STRs[STRs['DEPTH']>=5]

How many do we lose?

In [6]:
n_all = len(set(STRs.CHR_START))
n_depth5 = len(set(STRs_depth5.CHR_START))
print "Number of STRs with no depth filtering:", n_all
print "Number of STRs with depth > 5:", n_depth5
print "Loci not analyzed further:", n_all - n_depth5

Number of STRs with no depth filtering: 19160
Number of STRs with depth > 5: 18835
Loci not analyzed further: 325


Defining STR dosage as the length of both alleles of the STR loci

In [35]:
STRs_depth5['STR_DOSAGE'] = STRs_depth5.LEN_ALTVAR1 + STRs_depth5.LEN_ALTVAR2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Writing to disk

In [51]:
STRs_depth5.to_csv('../data/STRs_depth5.tsv', sep = '\t')

Filtering for major allele frequency <= 0.9

In [48]:
grouped = STRs_depth5.groupby('CHR_START')

mafs = []
passed = []
for i in grouped:
    data = Counter(i[1].LEN_ALTVAR1)
    maf = max(data.values()) / float(sum(data.values()))
    mafs.append(maf)
    if maf <= 0.9:
        passed.append(i[0])
        
STRs_depth5_chr_start_maf09 = STRs_depth5[STRs_depth5['CHR_START'].isin(passed)]

In [59]:
print len(set(STRs_depth5.CHR_START))

18835


Frame with non-variable repeats by this measure

In [60]:
STRs_depth5_chr_start_maf09_nv = STRs_depth5[~STRs_depth5['CHR_START'].isin(passed)]
STRs_depth5_chr_start_maf09_nv.to_csv('../data/STRs_depth5_chr_start_maf09_nv.tsv', sep = '\t', index=None)              

Writing frame for all further analyses

In [65]:
STRs_depth5_chr_start_maf09.to_csv('../data/STRs_depth5_chr_start_maf09.tsv', sep = '\t', index=None)