# Accessing the LLS dataset

In [1]:
import biu

## Initializing and examining the structure
The phenotypes data file is not stored in a standard location, so we must provide the path to our own local copy of it through the `localCopy` argument.

In [2]:
# Path to the local copy of the phenotypes file, handed to LLS under the "phen" key.
phenLocalCopy = "/home/tgehrmann/repos/VAR/phen218.txt"
lls = biu.db.LLS(localCopy={"phen": phenLocalCopy})

# Printing the object gives an overview of the objects and files it manages.
print(lls)

LLS object
 Where: /exports/molepi/LLSSEQ
 Version: current
 Objects:
  * [ ] phenotypes
  * [ ] vcf[1]
  * [ ] vcf[2]
  * [ ] vcf[3]
  * [ ] vcf[4]
  * [ ] vcf[5]
  * [ ] vcf[6]
  * [ ] vcf[7]
  * [ ] vcf[8]
  * [ ] vcf[9]
  * [ ] vcf[10]
  * [ ] vcf[11]
  * [ ] vcf[12]
  * [ ] vcf[13]
  * [ ] vcf[14]
  * [ ] vcf[15]
  * [ ] vcf[16]
  * [ ] vcf[17]
  * [ ] vcf[18]
  * [ ] vcf[19]
  * [ ] vcf[20]
  * [ ] vcf[21]
  * [ ] vcf[22]
  * [ ] vcf[M]
  * [ ] vcf[X]
  * [ ] vcf[Y]
 Files:
  * [X] vcf_1 : /exports/molepi/LLSSEQ/tbx/merged.chr1.vcf.bgz
  * [X] vcf_1_tbi : /exports/molepi/LLSSEQ/tbx/merged.chr1.vcf.bgz.tbi
  * [X] vcf_2 : /exports/molepi/LLSSEQ/tbx/merged.chr2.vcf.bgz
  * [X] vcf_2_tbi : /exports/molepi/LLSSEQ/tbx/merged.chr2.vcf.bgz.tbi
  * [X] vcf_3 : /exports/molepi/LLSSEQ/tbx/merged.chr3.vcf.bgz
  * [X] vcf_3_tbi : /exports/molepi/LLSSEQ/tbx/merged.chr3.vcf.bgz.tbi
  * [X] vcf_4 : /exports/molepi/LLSSEQ/tbx/merged.chr4.vcf.bgz
  * [X] vcf_4_tbi : /exports/molepi/LLSSEQ/tbx/mer

D: Could not make Symbolic link for 'phen'. Rewriting internal location.


## Querying the structure

With the exception of the list of participants, the data held in LLS are tabix-indexed VCF files. We can therefore use the `query` and `queryRegions` methods of the VCF structures.


### Examining the list of participants
The `phenotypes` object of the `lls` structure describes the list of participants. The cell below selects the male participants who are older than 100 years.

In [3]:
# Build the two row conditions separately, then keep the rows satisfying both:
# participants recorded as MALE whose age exceeds 100.
phenotypes = lls.phenotypes
isMale = phenotypes["sex"] == "MALE"
isOver100 = phenotypes["age"] > 100
phenotypes[isMale & isOver100]

D: Initializing the TSVResourceManager object NOW


Unnamed: 0,cgID,cgID2,LLnr,labnr,age,sex
1,GS00354-DNA_B01,GS000002814-ASM,005.2.01.0,6,100.974675,MALE
52,GS00354-DNA_F07,GS000002795-ASM,112.2.02.0,695,102.193018,MALE
188,GS00398-DNA_C02,GS000003087-ASM,429.2.01.0,2822,100.517454,MALE
196,GS00398-DNA_B01,GS000002935-ASM,441.2.02.0,2740,101.141684,MALE
211,GS00398-DNA_C03,GS000003081-ASM,468.2.02.0,2895,100.183436,MALE


### Querying the variants

In [4]:
# Query chromosome 1 between positions 1000000 and 1001000, restricted to SNPs,
# and print each returned record on its own line.
snpRecords = lls.query(1, 1000000, 1001000, types=['snp'])
for snp in snpRecords:
    print(snp)

D: Initializing the VCFResourceManager object NOW
D: VCF Input source is tabixed file.


Record(CHROM=1, POS=1000760, REF=G, ALT=[A])
Record(CHROM=1, POS=1000797, REF=C, ALT=[T])
Record(CHROM=1, POS=1000854, REF=A, ALT=[C])
Record(CHROM=1, POS=1000857, REF=G, ALT=[T])
Record(CHROM=1, POS=1000894, REF=A, ALT=[T])
Record(CHROM=1, POS=1000902, REF=G, ALT=[A])
Record(CHROM=1, POS=1000910, REF=C, ALT=[T])
Record(CHROM=1, POS=1000987, REF=T, ALT=[C])
Record(CHROM=1, POS=1000990, REF=T, ALT=[G])


D: VCF Input source is list of Records.


In [5]:
# Each region is a (chromosome, start, end) tuple; the same window is queried
# on chromosomes 1 and 2, and every SNP record found is printed.
regions = [(1, 1000000, 1001000), (2, 1000000, 1001000)]
for snp in lls.queryRegions(regions, types=['snp']):
    print(snp)

Record(CHROM=1, POS=1000760, REF=G, ALT=[A])
Record(CHROM=1, POS=1000797, REF=C, ALT=[T])
Record(CHROM=1, POS=1000854, REF=A, ALT=[C])
Record(CHROM=1, POS=1000857, REF=G, ALT=[T])
Record(CHROM=1, POS=1000894, REF=A, ALT=[T])
Record(CHROM=1, POS=1000902, REF=G, ALT=[A])
Record(CHROM=1, POS=1000910, REF=C, ALT=[T])
Record(CHROM=1, POS=1000987, REF=T, ALT=[C])
Record(CHROM=1, POS=1000990, REF=T, ALT=[G])
Record(CHROM=2, POS=1000024, REF=T, ALT=[C])
Record(CHROM=2, POS=1000029, REF=G, ALT=[A])
Record(CHROM=2, POS=1000127, REF=C, ALT=[T])
Record(CHROM=2, POS=1000192, REF=A, ALT=[T])
Record(CHROM=2, POS=1000283, REF=C, ALT=[T])
Record(CHROM=2, POS=1000738, REF=A, ALT=[G])
Record(CHROM=2, POS=1000918, REF=T, ALT=[A])


D: VCF Input source is list of Records.
D: Initializing the VCFResourceManager object NOW
D: VCF Input source is tabixed file.
D: VCF Input source is list of Records.


In [6]:
# Same two regions as a per-variant summary table instead of raw records
# (requested via extract="summary").
regions = [(1, 1000000, 1001000), (2, 1000000, 1001000)]
lls.queryRegions(
    regions,
    types=['snp'],
    extract="summary",
)

D: VCF Input source is list of Records.


0 1 ['0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '0/0', '

D: VCF Input source is list of Records.


Unnamed: 0,id,RR,R,RA,A,AA,O
0,1-1000760-G-A,221,0,1,0,0,0
1,1-1000797-C-T,218,0,1,0,0,3
2,1-1000854-A-C,214,0,1,0,0,7
3,1-1000857-G-T,212,0,1,0,0,9
4,1-1000894-A-T,214,0,7,0,0,1
5,1-1000902-G-A,221,0,1,0,0,0
6,1-1000910-C-T,172,0,39,0,11,0
7,1-1000987-T-C,221,0,1,0,0,0
8,1-1000990-T-G,221,0,1,0,0,0
9,2-1000024-T-C,212,0,10,0,0,0


### Querying the variants based on specific samples
The sample names provided in the VCF files of LLS are not the same as in the phenotypes file.
The names in the VCF have `'_240_37-ASM'` added at the end. To restrict a query to specific participants, we must therefore append this postfix to their `cgID` before passing the names as `sampleFilters`.

In [7]:
# Select only the male participants older than 100 years
relSamples = lls.phenotypes[(lls.phenotypes["sex"] == "MALE") & (lls.phenotypes["age"] > 100)]
# Attach the VCF postfix to each phenotype sample ID (vectorized string
# concatenation instead of a per-element lambda). The stray bare
# `sampleFilters` expression was dropped: only the last expression of a cell
# is displayed, so it had no effect.
sampleFilters = (relSamples["cgID"] + '_240_37-ASM').values

# Query SNPs in only these individuals
lls.queryRegions([(1, 1000000, 1001000), (2, 1000000, 1001000)], types=['snp'],
                 extract="summary", sampleFilters=sampleFilters)


0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['0/0', './.', '0/0', '0/0', '0/0']
0 1 ['0/0', './.', '0/0', '0/0', '0/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['0/0', '1/0', '0/0', '1/1', '0/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['1/0', '1/1', '0/0', '1/0', '1/0']
0 1 ['0/0', '0/0', '1/0', '0/0', '1/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['0/0', '1/0', '0/0', '0/0', '0/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']
0 1 ['0/0', '0/0', '0/0', '0/0', '0/0']


D: VCF Input source is list of Records.
D: VCF Input source is list of Records.


Unnamed: 0,id,RR,R,RA,A,AA,O
0,1-1000760-G-A,5,0,0,0,0,0
1,1-1000797-C-T,5,0,0,0,0,0
2,1-1000854-A-C,4,0,0,0,0,1
3,1-1000857-G-T,4,0,0,0,0,1
4,1-1000894-A-T,5,0,0,0,0,0
5,1-1000902-G-A,5,0,0,0,0,0
6,1-1000910-C-T,3,0,1,0,1,0
7,1-1000987-T-C,5,0,0,0,0,0
8,1-1000990-T-G,5,0,0,0,0,0
9,2-1000024-T-C,5,0,0,0,0,0


### Looking up who has a specific variant

In [11]:
# Look up the samples reported for allele 'T' at chromosome 1, position 1000910,
# then strip the VCF-specific postfix so the names match the phenotype cgIDs.
carriers = lls.whoHas(1, 1000910, 'T')
[ sample.replace('_240_37-ASM', '') for sample in carriers ]

D: VCF Input source is list of Records.


['GS00354-DNA_A02',
 'GS00354-DNA_A10',
 'GS00354-DNA_B07',
 'GS00354-DNA_C07',
 'GS00354-DNA_D04',
 'GS00354-DNA_D06',
 'GS00354-DNA_F06',
 'GS00354-DNA_G01',
 'GS00354-DNA_G02',
 'GS00354-DNA_G04',
 'GS00354-DNA_G11',
 'GS00354-DNA_H02',
 'GS00354-DNA_H05',
 'GS00354-DNA_H09',
 'GS00398-DNA_A02',
 'GS00398-DNA_A05',
 'GS00398-DNA_B02',
 'GS00398-DNA_B03',
 'GS00398-DNA_C02',
 'GS00398-DNA_C06',
 'GS00398-DNA_D01',
 'GS00398-DNA_E05',
 'GS00398-DNA_F02',
 'GS00398-DNA_F04',
 'GS00398-DNA_G01',
 'GS00398-DNA_G04',
 'GS00398-DNA_H03',
 'GS00456-DNA_A01',
 'GS00456-DNA_A05',
 'GS00456-DNA_B02',
 'GS00456-DNA_B07',
 'GS00456-DNA_D01',
 'GS00456-DNA_D08',
 'GS00456-DNA_E08',
 'GS00456-DNA_F08',
 'GS00456-DNA_F10',
 'GS00456-DNA_H01',
 'GS00456-DNA_H02',
 'GS00456-DNA_H03']

In [10]:
# Display the phenotypes table held by the LLS object (the list of participants).
# NOTE(review): this renders the whole table; consider showing only a preview
# (e.g. `.head()`) — confirm that `phenotypes` supports it before changing.
lls.phenotypes

Unnamed: 0,cgID,cgID2,LLnr,labnr,age,sex
0,GS00354-DNA_A01,GS000002813-ASM,004.2.01.0,5,96.689938,MALE
1,GS00354-DNA_B01,GS000002814-ASM,005.2.01.0,6,100.974675,MALE
2,GS00354-DNA_D01,GS000002816-ASM,010.2.02.0,48,91.162218,FEMALE
3,GS00354-DNA_E01,GS000002817-ASM,013.2.01.0,49,94.034223,MALE
4,GS00354-DNA_C01,GS000002815-ASM,014.2.02.0,43,93.190965,MALE
5,GS00354-DNA_F01,GS000002818-ASM,018.2.01.0,53,95.263518,MALE
6,GS00354-DNA_A02,GS000002754-ASM,019.2.01.0,130,96.851472,MALE
7,GS00354-DNA_H01,GS000002820-ASM,023.2.02.0,99,94.264203,FEMALE
8,GS00354-DNA_G01,GS000002819-ASM,024.2.01.0,98,100.205339,FEMALE
9,GS00354-DNA_B02,GS000002755-ASM,029.2.01.0,180,97.199179,FEMALE
