# Tandem Repeat analysis
#### Written by Jigar N. Bandaria
In this notebook we continue the analysis on the file generated from the notebook 'sgRNA analysis 3'.

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
!ls

Analysis.odt  dup_uni			      SgRNA analysis.ipynb
Bam_files     PAMs			      sgRNA anslysis 2.1-Copy1.ipynb
chrom_only    sgRNA analysis 3.1-Copy1.ipynb


In [4]:
# Read the tab-separated, header-less table of duplicated sequences.
# The three columns are named explicitly because the file carries no header row.
dups = pd.read_csv(
    '/home/user/Desktop/nt_17/dup_uni/dup_with_one_chrom.fa',
    sep='\t',
    header=None,
    names=['Sequence', 'Chromosome', 'Position'],
)
print(dups.shape[0])  # number of rows loaded
dups.head()


6380937


Unnamed: 0,Sequence,Chromosome,Position
0,TTCAGCTTCCAGCTCCC,chr10,123165-123182(+)
1,TTCAGCTTCCAGCTCCC,chr10,123185-123202(+)
2,CTCAGGGTGGAGGCTCA,chr10,125637-125654(-)
3,GCTCAGGGTGGAGGCTC,chr10,125638-125655(-)
4,CTGGGCTGAGCTCAGGG,chr10,125647-125664(-)


In [5]:
# Split the combined table into one tab-separated file per chromosome so that
# later cells can work on a single chromosome at a time.
file_path = "/home/user/Desktop/nt_17/chrom_only/"

# Every chromosome is listed exactly once, so each output file is written once.
chr_num = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
           'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20',
           'chr21','chr22','chrX','chrY']
for x in chr_num:
    # Rows of `dups` whose Chromosome column equals the current chromosome name.
    dup_chr = dups[dups.Chromosome == x]
    filename = file_path + x + "_only.fa"
    # No header row and no index column: the output keeps the three-column input layout.
    dup_chr.to_csv(filename, header=False, index=False, sep='\t')

Let's load one of the files and do some preliminary analysis.

In [8]:
# Load the chr22-only table written by the per-chromosome split.
tmp1 = pd.read_csv(
    'chrom_only/chr22_only.fa',
    sep='\t',
    header=None,
    names=['Sequence', 'Chromosome', 'Position'],
)
# One row per distinct sequence, holding the non-null count of the other columns.
tmp2 = tmp1.groupby('Sequence').count()

In [9]:
# Total number of sequence rows in the chr22 file.
print(tmp1.shape[0])
# Number of distinct sequences (hotspots) those rows group into.
print(tmp2.shape[0])

183043
66408


In [10]:
# Identify the sequences that occur fewer than 4 times.
tmp3 = tmp1.Sequence.value_counts()   # occurrences of each distinct sequence
print(tmp3.shape[0])                  # number of distinct sequences
tmp4 = tmp3[tmp3 < 4].index.tolist()  # sequences seen fewer than 4 times
print(len(tmp4))                      # how many such rare sequences there are

66408
58341


In [11]:
# Keep only the rows whose sequence is NOT in the rare list `tmp4`.
# Boolean indexing removes those rows directly, instead of blanking them to NaN
# with mask() and deleting them afterwards; dropna() is kept so that rows with a
# missing field are still discarded, and nothing is modified in place.
tmp5 = tmp1[~tmp1.Sequence.isin(tmp4)].dropna(how='any')

In [12]:
# Rows remaining once the sequences occurring fewer than 4 times are removed.
tmp5.shape[0]

49912

Based on the above analysis, we repeat it for every chromosome and create a file for each chromosome that contains only the sequences that occur at least 4 times.

In [14]:
# Apply the chr22 filtering to every chromosome: read <chrom>_only.fa, keep the
# sequences that occur at least 4 times, and write them to <chrom>_4more.fa.
# `file_path` is the directory defined in the earlier per-chromosome split cell.
# For a quick trial run, shrink chr_num to a single entry such as ['chr22'].
chr_num = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
           'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20',
           'chr21','chr22','chrX','chrY']
for x in chr_num:
    input_file = file_path + x + "_only.fa"
    output_file = file_path + x + "_4more.fa"
    tmp1 = pd.read_csv(input_file, header=None, sep='\t', names=['Sequence','Chromosome','Position'])

    # Occurrences of each sequence on this chromosome, then those seen fewer than 4 times.
    tmp3 = tmp1.Sequence.value_counts()
    tmp4 = tmp3.index[tmp3 < 4].tolist()
    # Boolean indexing drops the rare sequences directly rather than blanking rows
    # to NaN with mask(); dropna() still discards rows with a missing field.
    tmp5 = tmp1[~tmp1.Sequence.isin(tmp4)].dropna(how='any')
    # No header row and no index column, matching the layout of the input file.
    tmp5.to_csv(output_file, header=False, index=False, sep='\t')

Below are the statistics for each chromosome. They show how many sequences are present on each chromosome ("Reads"), and how many hotspots, i.e. distinct sequences, they can form ("Duplicates").

In [15]:
# Per-chromosome summary of the filtered files: number of rows and number of
# distinct sequences in each <chrom>_4more.fa.
print("Name : Reads : Duplicates")
for x in chr_num:
    filename = file_path + x + "_4more.fa"
    tmp1 = pd.read_csv(
        filename,
        sep='\t',
        header=None,
        names=['Sequence', 'Chromosome', 'Position'],
    )
    n_reads = len(tmp1)                  # rows in the filtered file
    n_distinct = tmp1.Sequence.nunique() # distinct sequences, missing values ignored
    print("{0} : {1} : {2}".format(x, n_reads, n_distinct))



Name : Reads : Duplicates
chr1 : 168852 : 22919
chr2 : 95715 : 14935
chr3 : 17838 : 2054
chr4 : 41252 : 4405
chr5 : 65095 : 9281
chr6 : 32262 : 4602
chr7 : 108754 : 17362
chr8 : 64918 : 7933
chr9 : 271391 : 51593
chr10 : 118367 : 20675
chr11 : 33668 : 5162
chr12 : 28501 : 3468
chr13 : 27791 : 3377
chr14 : 19806 : 3161
chr15 : 129763 : 22458
chr16 : 103819 : 17726
chr17 : 98052 : 16520
chr18 : 19003 : 2308
chr19 : 95626 : 13207
chr20 : 18169 : 2570
chr21 : 11412 : 1627
chr22 : 49912 : 8067
chrX : 82399 : 13193
chrY : 154979 : 28816


Below we look at chr9 and count how many distinct sequences are repeated 4 times, 5 times, and so on.

In [16]:
# Load the filtered chr9 table and summarise how often its sequences repeat.
x = "chr9"
filename = file_path + x + "_4more.fa"
tmp1 = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['Sequence', 'Chromosome', 'Position'],
)
# Occurrences of each distinct sequence on chr9.
per_sequence = tmp1.Sequence.value_counts()
print("{0} : {1} : {2}".format(x, len(tmp1), len(per_sequence)))

# For each repeat count (4, 5, ...), the number of distinct sequences with that count.
per_sequence.value_counts().sort_index()

chr9 : 271391 : 51593


4      20337
5      15549
6       7965
7       4509
8       1648
9        289
10       393
11       462
12       107
13        75
14        49
15        21
16        32
17        15
18         9
19        12
20        12
21         9
22        10
23         9
24         5
25         6
26        14
27         4
28         8
29         5
30         1
31         4
32         1
33         2
34         1
35         1
36         1
37         1
38         3
39         1
40         2
41         2
42         1
43         1
44         4
45         4
46         3
61         2
80         2
82         1
112        1
Name: Sequence, dtype: int64

Next step: concatenate all the chr*_with_overlap.fa files for further analysis.