# Running arraylib on example data

First we import the LibraryExperiment class from the package.

In [1]:
from pysudoku_package.libraryexperiment import LibraryExperiment


We generate an experiment object by instantiating the LibraryExperiment class with our chosen parameters.

Required parameters:
* `input_dir`: path to directory holding the input fastq files
* `exp_design`: path to a csv file describing the experimental design (values must be separated by commas). The experimental design file 
       should have the columns Filename, Poolname and Pooldimension (see the example in tests/test_data/full_exp_design.csv).
  * Filename should contain all the unique input fastq filenames.
  * Poolname should indicate to which pool a given file belongs. Multiple files per poolname are allowed.
  * Pooldimension indicates the pooling dimension a pool belongs to. All pools sharing the same pooling dimension should have the same string in the Pooldimension column.
  

An example of what an exp_design file could look like:

| Filename          | Poolname        | Pooldimension  |
| :---------------: | :-------------: | :------------: |
| column1.fastq     | column1         | columns        |
| column2.fastq     | column2         | columns        |
| row1.fastq        | row1            | rows           |
| row2.fastq        | row2            | rows           |
| platerow1.fastq   | platerow1       | platerows      |
| platerow2.fastq   | platerow2       | platerows      |
| platecol1.fastq   | platecol1       | platecols      |
| platecol2.fastq   | platecol2       | platecols      |

* `gb_ref`: path to genbank reference file
* `bowtie_ref`: path to bowtie index files, ending with the basename of your index (if the basename of your index is UTI89 and you store your bowtie2 references in the directory `bowtie_ref`, it should be `bowtie_ref/UTI89`). Please visit https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer for a manual on how to create bowtie2 indices.
* `tn_seq`: transposon sequence (e.g. AGATGTGTATAAGAGACAG)
* `bar_upstream`: upstream sequence of barcode (e.g. CGAGGTCTCT)
* `bar_downstream`: downstream sequence of barcode (e.g. CGTACGCTGC)

Optional parameters:
* `map_quality`: minimum bowtie2 alignment quality score for a read to be included
* `seq_quality`: minimum phred score required for each base for a read to be included
* `tn_mismatches`: number of transposon mismatches allowed
* `filter_thr`: threshold for local filter (e.g. a threshold of 0.05 would filter out all reads < 0.05 of the maximum read count for a given mutant)
* `global_filter_thr`: threshold for global filter (all reads below `global_filter_thr` will be set to 0)
* `min_counts`: minimum counts of a barcode to be included in analysis
* `use_barcodes`: whether to perform deconvolution only based on barcodes without genomic alignment


You can also refer to the API to get a detailed description of the input parameter choices.

In [3]:
# Configure the experiment; keyword arguments are grouped by purpose.
experiment = LibraryExperiment(
    # input fastq directory and experimental design table
    input_dir="input",
    exp_design="full_exp_design.csv",
    # genbank reference and bowtie2 index (path ends with the index basename)
    gb_ref="gb_ref",
    bowtie_ref="bowtie_ref/UTI89",
    # transposon and barcode detection
    tn_seq="AGATGTGTATAAGAGACAG",
    tn_mismatches=2,
    bar_upstream="CGAGGTCTCT",
    bar_downstream="CGTACGCTGC",
    use_barcodes=False,
    # alignment and sequencing quality thresholds
    map_quality=30,
    seq_quality=10,
    # local filter, global filter and minimum barcode counts
    filter_thr=0.05,
    global_filter_thr=5,
    min_counts=5,
    # other settings
    cores=4,
)


## Read trimming and genomic alignment

First we detect the transposon sequence in the reads in our input fastq files in input_dir.

In [4]:
# Detect the transposon sequence in the input reads and trim the sequences.
experiment.get_genomic_seq()

In [5]:
# Align the trimmed sequences to the reference using bowtie2.
# The alignment summary is printed as cell output.
experiment.align_genomic_seq()

624483 reads; of these:
  624483 (100.00%) were unpaired; of these:
    0 (0.00%) aligned 0 times
    624483 (100.00%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
100.00% overall alignment rate


In [6]:
# Assemble the count matrix from the aligned reads.
# The number of mutants in the matrix is reported in the cell output.
experiment.write_count_matrix()

Generating count matrix with 7000 mutants
Generated count matrix!


In [10]:
# Display the count matrix (annotation columns followed by one column per pool).
experiment.count_mat

Unnamed: 0,Feature,Orientation,Barcode,Reference,1,10,11,12,2,3,...,PC07,PC08,PR01,PR02,PR03,PR04,PR05,PR06,PR07,PR08
0,49,-,,NC_007946.1,0,0,10,0,0,0,...,0,0,0,14,0,0,0,0,0,0
1,50,-,,NC_007946.1,0,0,0,0,0,0,...,0,13,0,0,0,0,0,0,0,13
2,51,-,,NC_007946.1,0,0,0,0,0,0,...,0,0,0,0,0,11,0,0,0,0
3,53,-,,NC_007946.1,0,0,0,0,13,0,...,10,0,10,0,0,0,0,0,0,0
4,54,+,,NC_007946.1,0,13,0,0,0,0,...,0,14,0,0,0,0,0,14,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,7045,-,,NC_007946.1,0,10,0,0,0,14,...,0,0,13,0,0,0,0,0,11,0
6996,7046,+,,NC_007946.1,0,10,0,0,0,0,...,0,0,0,13,0,0,12,11,0,0
6997,7047,-,,NC_007946.1,0,0,0,0,0,0,...,0,0,0,14,12,0,0,0,11,0
6998,7048,+,,NC_007946.1,14,0,0,0,10,11,...,0,0,0,0,0,0,13,0,0,14


In [11]:
# Display the normalized count matrix (same layout, floating-point values).
experiment.normalized_count_mat

Unnamed: 0,Feature,Orientation,Barcode,Reference,1,10,11,12,2,3,...,PC07,PC08,PR01,PR02,PR03,PR04,PR05,PR06,PR07,PR08
0,49,-,,NC_007946.1,0.00000,0.000000,734.322221,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,732.945919,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,50,-,,NC_007946.1,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00000,675.465032,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,664.349959
2,51,-,,NC_007946.1,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,581.856652,0.000000,0.000000,0.000000,0.000000
3,53,-,,NC_007946.1,0.00000,0.000000,0.000000,0.0,980.022616,0.000000,...,496.15480,0.000000,509.735957,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,54,+,,NC_007946.1,0.00000,981.798958,0.000000,0.0,0.000000,0.000000,...,0.00000,727.423880,0.000000,0.000000,0.000000,0.000000,0.000000,721.277692,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,7045,-,,NC_007946.1,0.00000,755.229968,0.000000,0.0,0.000000,1003.152766,...,0.00000,0.000000,662.656744,0.000000,0.000000,0.000000,0.000000,0.000000,549.862534,0.000000
6996,7046,+,,NC_007946.1,0.00000,755.229968,0.000000,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,680.592639,0.000000,0.000000,629.954328,566.718187,0.000000,0.000000
6997,7047,-,,NC_007946.1,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,732.945919,635.930048,0.000000,0.000000,0.000000,549.862534,0.000000
6998,7048,+,,NC_007946.1,1108.64745,0.000000,0.000000,0.0,753.863551,788.191459,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,682.450522,0.000000,0.000000,715.453802


In [9]:
# Display the filtered count matrix.
# NOTE(review): presumably the result of applying filter_thr / global_filter_thr / min_counts - confirm against the API.
experiment.filtered_count_mat

Unnamed: 0,Feature,Orientation,Barcode,Reference,1,10,11,12,2,3,...,PC07,PC08,PR01,PR02,PR03,PR04,PR05,PR06,PR07,PR08
0,49,-,,NC_007946.1,0.00000,0.000000,734.322221,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,732.945919,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,50,-,,NC_007946.1,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00000,675.465032,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,664.349959
2,51,-,,NC_007946.1,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,581.856652,0.000000,0.000000,0.000000,0.000000
3,53,-,,NC_007946.1,0.00000,0.000000,0.000000,0.0,980.022616,0.000000,...,496.15480,0.000000,509.735957,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,54,+,,NC_007946.1,0.00000,981.798958,0.000000,0.0,0.000000,0.000000,...,0.00000,727.423880,0.000000,0.000000,0.000000,0.000000,0.000000,721.277692,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,7045,-,,NC_007946.1,0.00000,755.229968,0.000000,0.0,0.000000,1003.152766,...,0.00000,0.000000,662.656744,0.000000,0.000000,0.000000,0.000000,0.000000,549.862534,0.000000
6996,7046,+,,NC_007946.1,0.00000,755.229968,0.000000,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,680.592639,0.000000,0.000000,629.954328,566.718187,0.000000,0.000000
6997,7047,-,,NC_007946.1,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,732.945919,635.930048,0.000000,0.000000,0.000000,549.862534,0.000000
6998,7048,+,,NC_007946.1,1108.64745,0.000000,0.000000,0.0,753.863551,788.191459,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,682.450522,0.000000,0.000000,715.453802


In [12]:
# Display the raw (integer) count matrix.
experiment.raw_count_mat

Unnamed: 0,Feature,Orientation,Barcode,Reference,1,10,11,12,2,3,...,PC07,PC08,PR01,PR02,PR03,PR04,PR05,PR06,PR07,PR08
0,49,-,,NC_007946.1,0,0,10,0,0,0,...,0,0,0,14,0,0,0,0,0,0
1,50,-,,NC_007946.1,0,0,0,0,0,0,...,0,13,0,0,0,0,0,0,0,13
2,51,-,,NC_007946.1,0,0,0,0,0,0,...,0,0,0,0,0,11,0,0,0,0
3,53,-,,NC_007946.1,0,0,0,0,13,0,...,10,0,10,0,0,0,0,0,0,0
4,54,+,,NC_007946.1,0,13,0,0,0,0,...,0,14,0,0,0,0,0,14,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,7045,-,,NC_007946.1,0,10,0,0,0,14,...,0,0,13,0,0,0,0,0,11,0
6996,7046,+,,NC_007946.1,0,10,0,0,0,0,...,0,0,0,13,0,0,12,11,0,0
6997,7047,-,,NC_007946.1,0,0,0,0,0,0,...,0,0,0,14,12,0,0,0,11,0
6998,7048,+,,NC_007946.1,14,0,0,0,10,11,...,0,0,0,0,0,0,13,0,0,14


In [33]:
# Select the columns listed in experiment.pools from the normalized count matrix
# and show them as a plain NumPy array (presumably experiment.pools holds the pool column names - confirm).
experiment.normalized_count_mat[experiment.pools].values

array([[   0.        ,    0.        ,  734.32222059, ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,  664.34995912],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       ...,
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
         549.86253437,    0.        ],
       [1108.64745011,    0.        ,    0.        , ...,    0.        ,
           0.        ,  715.45380213],
       [   0.        ,    0.        ,  734.32222059, ...,    0.        ,
           0.        ,    0.        ]])

In [27]:
# Infer the most likely locations and write the location summary.
experiment.deconvolve()

In [29]:
# Show the experiment object's attribute dictionary to inspect its current state.
vars(experiment)

{'cores': 4,
 'map_quality': 30,
 'seq_quality': 10,
 'gb_ref': 'gb_ref',
 'bowtie_ref': 'bowtie_ref/UTI89',
 'tn_seq': 'AGATGTGTATAAGAGACAG',
 'tn_mismatches': 2,
 'input_dir': 'input',
 'exp_design': 'full_exp_design.csv',
 'use_barcodes': False,
 'filter_thr': 0.05,
 'global_filter_thr': 5,
 'min_counts': 5,
 'bowtie_res': 'temp/alignment_result.csv',
 'alignment': 'temp/alignment.sam',
 'file2pool': {'A': 'A',
  'B': 'B',
  'C': 'C',
  'D': 'D',
  'E': 'E',
  'F': 'F',
  'G': 'G',
  'H': 'H',
  '1': '1',
  '2': '2',
  '3': '3',
  '4': '4',
  '5': '5',
  '6': '6',
  '7': '7',
  '8': '8',
  '9': '9',
  '10': '10',
  '11': '11',
  '12': '12',
  'PC01': 'PC01',
  'PC02': 'PC02',
  'PC03': 'PC03',
  'PC04': 'PC04',
  'PC05': 'PC05',
  'PC06': 'PC06',
  'PC07': 'PC07',
  'PC08': 'PC08',
  'PR01': 'PR01',
  'PR02': 'PR02',
  'PR03': 'PR03',
  'PR04': 'PR04',
  'PR05': 'PR05',
  'PR06': 'PR06',
  'PR07': 'PR07',
  'PR08': 'PR08'},
 'pool_dims': {'row': ['A', 'B', 'C', 'D', 'E', 'F', 'G', '