
# Tandem Repeat analysis
#### Written by Jigar N. Bandaria
In this notebook I will confirm my analysis. I only want to save sgRNA sequences that have NGG at the 3' end. The sequences should also be repeated at least 4 times within a 10 kbp window, and be present on only one chromosome.

In [2]:
import pandas as pd
import numpy as np


Here I use Bowtie to first align all the sequences that we had saved from the previous notebook.

In [3]:
%%bash
# Align every sequence in no_rev_comp.fasta against the GRCh37 index with Bowtie 1.1.2
# and redirect the output to no_rev_comp_grc37.sam in the current working directory.
# Flags, per the Bowtie manual (confirm against the installed version):
#   -t  print timing information      -S  write SAM output
#   -a  report all valid alignments   -v 0  allow zero mismatches
#   -y  try hard to find alignments   -f  the input reads are FASTA
/home/user/Desktop/Bioinformatics/bowtie-1.1.2/./bowtie -t -S -a -v 0 -y -f  /home/user/Desktop/Bioinformatics/genome_index/GRCh37 /home/user/Desktop/nt_17/no_rev_comp.fasta > no_rev_comp_grc37.sam

Time loading forward index: 00:00:09
Time for 0-mismatch search: 00:00:06
# reads processed: 70795
# reads with at least one reported alignment: 70795 (100.00%)
# reads that failed to align: 0 (0.00%)
Reported 729062 alignments to 1 output stream(s)
Time searching: 00:00:15
Overall time: 00:00:15


In [3]:
# The sequences were aligned against the human genome above; load that SAM output here.
# Only five of the SAM fields are kept, and the first 27 lines of the file are skipped
# (presumably the SAM header - verify if a different index is used).
sam_columns = [0, 1, 2, 3, 9]
sam_names = ['seqid', 'strand', 'chr', 'start', 'Sequence']
unmatched_seq = pd.read_table(
    '/home/user/Desktop/nt_17/no_rev_comp_grc37.sam',
    skiprows=27,
    header=None,
    usecols=sam_columns,
    names=sam_names,
)
unmatched_seq.head()

Unnamed: 0,seqid,strand,chr,start,Sequence
0,seq0,0,chr13,112957261,GGACCATTCCTTCAGGA
1,seq0,0,chr13,112941860,GGACCATTCCTTCAGGA
2,seq0,0,chr13,112934077,GGACCATTCCTTCAGGA
3,seq0,0,chr13,112963854,GGACCATTCCTTCAGGA
4,seq0,0,chr13,112954065,GGACCATTCCTTCAGGA


In order to confirm that the sequences are followed by NGG, or preceded by CCN (if they are on the opposite strand), I used bedtools to slop those sequences. This is all included in the bash script 'compare_ngg.sh'.

In [7]:
%%bash
# Run the helper script which, according to the notebook text, uses bedtools to slop
# the aligned sequences so that the flanking PAM bases can be inspected.
./compare_ngg.sh

In [4]:
# Read the slopped sequences (two columns: position label and sequence)
# and convert the sequences to upper case.
slopped_seq = pd.read_table(
    '/home/user/Desktop/nt_17/slop_fasta.fa',
    header=None,
    names=['chr_p', 'seq23'],
)
slopped_seq['seq23'] = slopped_seq['seq23'].str.upper()
slopped_seq.head()

Unnamed: 0,chr_p,seq23
0,chr13:112957257-112957280(+),CATGGACCATTCCTTCAGGATGG
1,chr13:112941856-112941879(+),CATGGACCATTCCTTCAGGATGG
2,chr13:112934073-112934096(+),CATGGACCATTCCTTCAGGATGG
3,chr13:112963850-112963873(+),CATGGACCATTCCTTCAGGATGG
4,chr13:112954061-112954084(+),CATGGACCATTCCTTCAGGATGG


In [5]:
# The Bowtie table and the slopped table are later placed side by side row by row,
# so they must hold exactly the same number of records. Print both lengths and
# stop the notebook if they disagree instead of relying on a visual comparison.
print (len(unmatched_seq),len(slopped_seq))
assert len(unmatched_seq) == len(slopped_seq), 'Bowtie and bedtools outputs differ in length'

729062 729062


In [6]:
# Store the three bases at each end of the slopped sequence as PAM candidates:
# NGG5 holds the first three bases, NGG3 the last three.
slopped_seq['NGG5'] = slopped_seq['seq23'].str.slice(0, 3)
slopped_seq['NGG3'] = slopped_seq['seq23'].str.slice(-3)
slopped_seq.tail(10)

Unnamed: 0,chr_p,seq23,NGG5,NGG3
729052,chr10:37467845-37467868(+),GTGGGTGCTCTGGAGACTACTGG,GTG,TGG
729053,chr21:18929000-18929023(-),CCAGGTGCTCTGGAGACTACATC,CCA,ATC
729054,chr17:80108900-80108923(-),TCTGGGTGTGCGCCAGGTTCTGG,TCT,TGG
729055,chr17:80107712-80107735(-),TCCGGGTGTGCGCCAGGTTCTGG,TCC,TGG
729056,chr17:80107748-80107771(-),TCTGGGTGTGCGCCAGGTTCTGG,TCT,TGG
729057,chr17:80109350-80109373(-),TCTGGGTGTGCGCCAGGTTCTGG,TCT,TGG
729058,chr10:393010-393033(-),GCGATGTGTCCGGGTGAGTGTGG,GCG,TGG
729059,chr10:393417-393440(-),GCAATGTGTCCGGGTGAGTGTGG,GCA,TGG
729060,chr10:393096-393119(-),GCGATGTGTCCGGGTGAGTGTGG,GCG,TGG
729061,chr10:393053-393076(-),GCGATGTGTCCGGGTGAGTGTGG,GCG,TGG


In [7]:
# Put the Bowtie table and the bedtools table side by side (column-wise),
# so that each alignment row carries its slopped sequence and PAM columns.
frames_to_join = [unmatched_seq, slopped_seq]
slop_both = pd.concat(frames_to_join, axis=1)
slop_both.head()

Unnamed: 0,seqid,strand,chr,start,Sequence,chr_p,seq23,NGG5,NGG3
0,seq0,0,chr13,112957261,GGACCATTCCTTCAGGA,chr13:112957257-112957280(+),CATGGACCATTCCTTCAGGATGG,CAT,TGG
1,seq0,0,chr13,112941860,GGACCATTCCTTCAGGA,chr13:112941856-112941879(+),CATGGACCATTCCTTCAGGATGG,CAT,TGG
2,seq0,0,chr13,112934077,GGACCATTCCTTCAGGA,chr13:112934073-112934096(+),CATGGACCATTCCTTCAGGATGG,CAT,TGG
3,seq0,0,chr13,112963854,GGACCATTCCTTCAGGA,chr13:112963850-112963873(+),CATGGACCATTCCTTCAGGATGG,CAT,TGG
4,seq0,0,chr13,112954065,GGACCATTCCTTCAGGA,chr13:112954061-112954084(+),CATGGACCATTCCTTCAGGATGG,CAT,TGG


Below I will perform an analysis to make sure that the sequences are present on only one chromosome.

In [8]:
# For every sgRNA id, collect the distinct chromosomes it aligns to;
# this shows whether any sequence is found on multiple chromosomes.
grp1 = slop_both['chr'].groupby(slop_both['seqid']).unique()
grp1


seqid
seq0                                  [chr13]
seq1                                   [chr8]
seq10                           [chr3, chr15]
seq100                     [chr3, chr6, chr4]
seq1000                               [chr17]
seq10000                              [chr17]
seq10001                               [chr4]
seq10002                 [chr11, chr13, chr1]
seq10003    [chr17, chr10, chr1, chr18, chr4]
seq10004                               [chr5]
seq10005                               [chr3]
seq10006                        [chr9, chr22]
seq10007                        [chr19, chr2]
seq10008                              [chr21]
seq10009                [chr21, chr19, chr16]
seq1001                                [chr4]
seq10010                              [chr12]
seq10011                              [chr13]
seq10012                               [chr2]
seq10013                              [chr10]
seq10014                              [chr19]
seq10015                    

In [10]:
# Number of entries in grp1, i.e. one per distinct seqid.
len(grp1)

70795

In [None]:
# Count how many seqids share each chromosome entry of grp1.
# NOTE(review): the counted values appear to be arrays of chromosome names (see the
# grp1 output); how value_counts handles such entries may depend on the pandas
# version - confirm before re-running.
grp2 = grp1.value_counts()


In [12]:
import pickle

# Save the chromosome counts to disk so they can be reloaded later.
with open('grp2.pkl', 'wb') as pkl_file:
    pickle.dump(grp2, pkl_file)

In [13]:
# Display the number of seqids counted for each chromosome entry.
grp2

[chr1]                                    3755
[chr2]                                    3015
[chr19]                                   2977
[chr7]                                    2966
[chr10]                                   2831
[chr8]                                    2439
[chr17]                                   2196
[chr4]                                    2045
[chr6]                                    1994
[chr13]                                   1966
[chr5]                                    1949
[chr9]                                    1924
[chr16]                                   1814
[chr12]                                   1747
[chrX]                                    1706
[chr11]                                   1640
[chr18]                                   1388
[chr22]                                   1226
[chr20]                                   1207
[chr3]                                     997
[chr21]                                    879
[chr14]      

There are some sequences that are present on multiple chromosomes. This is not because of an error in our previous analysis; it is because Bowtie returns all the homologous sequences in the genome, whether they are followed by a PAM or not. Once we remove the alignments from the Bowtie analysis that are not followed by NGG, we should observe that only chromosome-specific sequences remain.

In [14]:
grp2.value_counts()
# Counts how often each count value occurs in grp2.
# The entries that occur only once presumably belong to sequences that are present
# on more than one chromosome. Need to check whether, on those other chromosomes,
# they are followed by an NGG at the 3' end.

1       26468
879         1
1207        1
2977        1
1747        1
1924        1
2196        1
997         1
2439        1
3015        1
1814        1
2966        1
1640        1
1966        1
361         1
585         1
1226        1
1706        1
3755        1
1994        1
1388        1
1949        1
2045        1
2831        1
720         1
Name: chr, dtype: int64

In [15]:
# Boolean mask, one entry per seqid, that is True when the sequence is
# listed on a number of chromosomes other than exactly one.
mask1 = list(map(lambda chrom_list: len(chrom_list) != 1, grp1))

In [18]:
# Keep only the grp1 entries flagged by mask1, i.e. the seqids whose chromosome
# list does not have exactly one element.
more_than_1 = grp1[mask1]

In [20]:
# Preview the first five multi-chromosome entries.
more_than_1.head()

seqid
seq10                           [chr3, chr15]
seq100                     [chr3, chr6, chr4]
seq10002                 [chr11, chr13, chr1]
seq10003    [chr17, chr10, chr1, chr18, chr4]
seq10006                        [chr9, chr22]
Name: chr, dtype: object

Below is a dataframe showing the sequences that are present on more than one chromosome.

In [24]:
# Restrict the alignments to the seqids that were found on more than one chromosome.
multi_chr_ids = more_than_1.index
remain_seq_1 = slop_both[slop_both['seqid'].isin(multi_chr_ids)]
print(remain_seq_1.shape)
# Spot-check a single sequence, ordered by chromosome.
remain_seq_1[remain_seq_1['seqid'] == 'seq55055'].sort_values(['chr'])

(330250, 9)


Unnamed: 0,seqid,strand,chr,start,Sequence,chr_p,seq23,NGG5,NGG3
631513,seq55055,16,chr1,152327464,GATGACTGACTTGAGCC,chr1:152327460-152327483(-),TCTGGCTCAAGTCAGTCATCTGG,TCT,TGG
631514,seq55055,16,chr1,152328376,GATGACTGACTTGAGCC,chr1:152328372-152328395(-),TCTGGCTCAAGTCAGTCATCTGG,TCT,TGG
631515,seq55055,16,chr1,152327920,GATGACTGACTTGAGCC,chr1:152327916-152327939(-),TCTGGCTCAAGTCAGTCATCTGG,TCT,TGG
631517,seq55055,16,chr1,152327233,GATGACTGACTTGAGCC,chr1:152327229-152327252(-),TCTGGCTCAAGTCAGTCATCTGG,TCT,TGG
631512,seq55055,0,chr14,65118519,GGCTCAAGTCAGTCATC,chr14:65118515-65118538(+),TTGGGCTCAAGTCAGTCATCCTC,TTG,CTC
631511,seq55055,0,chr19,42218255,GGCTCAAGTCAGTCATC,chr19:42218251-42218274(+),TCTGGCTCAAGTCAGTCATCATC,TCT,ATC
631516,seq55055,16,chr19,43027546,GATGACTGACTTGAGCC,chr19:43027542-43027565(-),ACTGGCTCAAGTCAGTCATCATC,ACT,ATC


Below I check which alignments are followed by NGG and remove the ones that are not.

In [28]:
# After some checking, the filter criterion is the NGG3 column:
# keep a row only when its last three bases form an NGG PAM.
ngg_pams = ['AGG', 'TGG', 'CGG', 'GGG']
remain_seq_2 = remain_seq_1[remain_seq_1['NGG3'].isin(ngg_pams)]
len(remain_seq_2)

203532

In [29]:
# Preview the alignments that passed the NGG3 filter.
remain_seq_2.head()

Unnamed: 0,seqid,strand,chr,start,Sequence,chr_p,seq23,NGG5,NGG3
809,seq3,0,chr13,112956894,TTCTTCAGGATGGGCCC,chr13:112956890-112956913(+),TGTTTCTTCAGGATGGGCCCAGG,TGT,AGG
810,seq3,0,chr13,112964439,TTCTTCAGGATGGGCCC,chr13:112964435-112964458(+),TGTTTCTTCAGGATGGGCCCAGG,TGT,AGG
811,seq3,0,chr13,112951863,TTCTTCAGGATGGGCCC,chr13:112951859-112951882(+),TGTTTCTTCAGGATGGGCCCAGG,TGT,AGG
812,seq3,0,chr13,112934832,TTCTTCAGGATGGGCCC,chr13:112934828-112934851(+),TGTTTCTTCAGGATGGGCCCAGG,TGT,AGG
813,seq3,0,chr13,112962638,TTCTTCAGGATGGGCCC,chr13:112962634-112962657(+),TGTTTCTTCAGGATGGGCCCAGG,TGT,AGG


Rechecking to make sure that each sequence is now present on only one chromosome.

In [33]:
# Repeat the grouping on the PAM-filtered alignments.
# Check 1: distinct chromosomes per seqid.
check_1 = remain_seq_2['chr'].groupby(remain_seq_2['seqid']).unique()
check_1.head(10)

seqid
seq10        [chr3]
seq100       [chr3]
seq10002    [chr11]
seq10003    [chr17]
seq10006    [chr22]
seq10007    [chr19]
seq10009    [chr19]
seq10017     [chr2]
seq10018    [chr19]
seq10022    [chr16]
Name: chr, dtype: object

In [34]:
# Fail loudly if any seqid still maps to more than one chromosome after the PAM
# filter, rather than relying on a visual scan of the table displayed below.
assert (check_1.map(len) == 1).all(), 'some sequences still align to several chromosomes'
check_1.value_counts()

[chr19]    2025
[chr1]     1953
[chr2]     1867
[chr10]    1687
[chr7]     1674
[chr8]     1442
[chr17]    1307
[chr6]     1213
[chr16]    1143
[chr4]     1140
[chr9]     1137
[chr12]    1081
[chr5]     1077
[chr13]    1033
[chr11]     978
[chr20]     923
[chrX]      853
[chr18]     801
[chr22]     778
[chr3]      613
[chr21]     598
[chr14]     481
[chr15]     369
[chrY]      295
Name: chr, dtype: int64

#### Now there are no sequences that are present on more than one chromosome. This means that the alignments that were found on other chromosomes were not followed by PAM sequences. Hence, all the sequences in the unmatched sequences are fine. 

Below is the final analysis that will be saved.

In [38]:
# Final analysis on ALL the alignments in slop_both.
# The full set still includes matches (also on other chromosomes) that do not
# have NGG at the 3' end; those rows are removed here.
# First add another column, seq20: the last 20 bases of seq23.
slop_both['seq20'] = slop_both['seq23'].str.slice(start=-20)

print ('Initial ALL Alignments : {}'.format(len(slop_both)))

ngg_pams = ['AGG','TGG','CGG','GGG']
has_ngg_pam = slop_both['NGG3'].isin(ngg_pams)
true_alignments = slop_both[has_ngg_pam]
print ('Remianing sequences after removing non-NGG : {}'.format(len(true_alignments)))

Initial ALL Alignments : 729062
Remianing sequences after removing non-NGG : 554634


In [41]:
# Number of distinct seqids left after the PAM filter. nunique() yields the same
# figure as counting the groupby groups, without aggregating every column of the
# frame just to measure the length of the result.
print('Total sequences based on seq_id groupby : {}'.format(true_alignments['seqid'].nunique()))


Total sequences based on seq_id groupby : 70795


In [44]:
# Show the last ten PAM-filtered alignments, including the new seq20 column.
true_alignments.tail(10)

Unnamed: 0,seqid,strand,chr,start,Sequence,chr_p,seq23,NGG5,NGG3,seq20
729051,seq70792,0,chr10,37456123,GGTGCTCTGGAGACTAC,chr10:37456119-37456142(+),GTGGGTGCTCTGGAGACTACTGG,GTG,TGG,GGTGCTCTGGAGACTACTGG
729052,seq70792,0,chr10,37467849,GGTGCTCTGGAGACTAC,chr10:37467845-37467868(+),GTGGGTGCTCTGGAGACTACTGG,GTG,TGG,GGTGCTCTGGAGACTACTGG
729054,seq70793,16,chr17,80108904,GAACCTGGCGCACACCC,chr17:80108900-80108923(-),TCTGGGTGTGCGCCAGGTTCTGG,TCT,TGG,GGGTGTGCGCCAGGTTCTGG
729055,seq70793,16,chr17,80107716,GAACCTGGCGCACACCC,chr17:80107712-80107735(-),TCCGGGTGTGCGCCAGGTTCTGG,TCC,TGG,GGGTGTGCGCCAGGTTCTGG
729056,seq70793,16,chr17,80107752,GAACCTGGCGCACACCC,chr17:80107748-80107771(-),TCTGGGTGTGCGCCAGGTTCTGG,TCT,TGG,GGGTGTGCGCCAGGTTCTGG
729057,seq70793,16,chr17,80109354,GAACCTGGCGCACACCC,chr17:80109350-80109373(-),TCTGGGTGTGCGCCAGGTTCTGG,TCT,TGG,GGGTGTGCGCCAGGTTCTGG
729058,seq70794,16,chr10,393014,CACTCACCCGGACACAT,chr10:393010-393033(-),GCGATGTGTCCGGGTGAGTGTGG,GCG,TGG,ATGTGTCCGGGTGAGTGTGG
729059,seq70794,16,chr10,393421,CACTCACCCGGACACAT,chr10:393417-393440(-),GCAATGTGTCCGGGTGAGTGTGG,GCA,TGG,ATGTGTCCGGGTGAGTGTGG
729060,seq70794,16,chr10,393100,CACTCACCCGGACACAT,chr10:393096-393119(-),GCGATGTGTCCGGGTGAGTGTGG,GCG,TGG,ATGTGTCCGGGTGAGTGTGG
729061,seq70794,16,chr10,393057,CACTCACCCGGACACAT,chr10:393053-393076(-),GCGATGTGTCCGGGTGAGTGTGG,GCG,TGG,ATGTGTCCGGGTGAGTGTGG


### Below is the final file. The data in this is not grouped, and hence this file can be used for further statistical analysis and plotting.

In [45]:
# Write the ungrouped, PAM-filtered alignments to CSV with a header row and no index column.
true_csv_path = '/home/user/Desktop/nt_17/true_all_cols_17.csv'
true_alignments.to_csv(true_csv_path, index=False, header=True)

Below I group the data to create a table that is easier to read; however, this makes it more difficult to use for analysis.

In [46]:
# Creating a pivot table indexed by the six columns listed below.
# NOTE(review): no aggfunc is passed, so pandas applies its default aggregation to
# the remaining columns; presumably only `start` survives because it is the only
# numeric column left - confirm.
true_alignments.pivot_table(index=['chr','seqid','Sequence','seq20','strand','chr_p'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,start
chr,seqid,Sequence,seq20,strand,chr_p,Unnamed: 6_level_1
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011583-1011606(+),1011587
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011613-1011636(+),1011617
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011643-1011666(+),1011647
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011673-1011696(+),1011677
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011881-1011904(+),1011885
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011911-1011934(+),1011915
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011941-1011964(+),1011945
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011971-1011994(+),1011975
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1012001-1012024(+),1012005
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1012031-1012054(+),1012035


In [47]:
# Count the alignments for every (chr, seqid) pair: `len` is applied to each of the
# remaining columns, so every column of coun_temp holds the size of the group.
coun_temp = true_alignments.pivot_table(index=['chr','seqid'],aggfunc=[len])

In [None]:
# Order the per-(chr, seqid) counts by seqid (index level 1). A later cell reads
# these counts positionally, so the row order matters.
# sort_index(level=...) is the supported replacement for DataFrame.sortlevel, which
# was deprecated and later removed from pandas; the resulting order is the same.
coun_temp = coun_temp.sort_index(level=1)
coun_temp.head(10)

This output below is fine, except it would be nice to have another column with a count of the number of times each sequence is repeated.

In [50]:
# Build the six-level pivot table again and keep it in a variable this time.
pivot_keys = ['chr','seqid','Sequence','seq20','strand','chr_p']
true_alignments_table = true_alignments.pivot_table(index=pivot_keys)
true_alignments_table.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,start
chr,seqid,Sequence,seq20,strand,chr_p,Unnamed: 6_level_1
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011583-1011606(+),1011587
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011613-1011636(+),1011617
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011643-1011666(+),1011647
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011673-1011696(+),1011677
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011881-1011904(+),1011885
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011911-1011934(+),1011915
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011941-1011964(+),1011945
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011971-1011994(+),1011975
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1012001-1012024(+),1012005
chr1,seq1002,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1012031-1012054(+),1012035


In the cells below I modify the MultiIndex in pandas to include a level for the counts.

In [51]:
# Inspect the unique values stored in index level 5 (chr_p, the last of the six index columns).
true_alignments_table.index.levels[5]

Index(['chr10:1000689-1000712(-)', 'chr10:1000690-1000713(-)',
       'chr10:1000706-1000729(-)', 'chr10:1000715-1000738(-)',
       'chr10:1000716-1000739(-)', 'chr10:1000732-1000755(-)',
       'chr10:1000741-1000764(-)', 'chr10:1000742-1000765(-)',
       'chr10:1000758-1000781(-)', 'chr10:1000767-1000790(-)',
       ...
       'chrY:58917620-58917643(-)', 'chrY:58917636-58917659(-)',
       'chrY:58993054-58993077(+)', 'chrY:58993055-58993078(+)',
       'chrY:58995233-58995256(+)', 'chrY:58996054-58996077(+)',
       'chrY:58996055-58996078(+)', 'chrY:58996122-58996145(+)',
       'chrY:58996123-58996146(+)', 'chrY:58996739-58996762(+)'],
      dtype='object', name='chr_p', length=554634)

In [52]:
# Inspect the integer positions that map every row onto the values of index level 4 (strand).
# NOTE(review): `.labels` is the older pandas spelling; newer releases expose the same
# data as `.codes` - confirm against the installed pandas version.
true_alignments_table.index.labels[4]

FrozenNDArray([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...], dtype='int8')

In [53]:
# Shorter name for the pivot table. This binds the same object (no copy is made),
# so in-place changes made through ta1 are also visible via true_alignments_table.
ta1= true_alignments_table

In [54]:
# Rebuild the index of ta1 with one extra level inserted right after seqid.
# The extra level takes its values from coun_temp (the NGG3 column of the `len`
# aggregation) and reuses the seqid positions (labels[1]) as its own positions, so
# every row picks the value stored at the position of its seqid.
# NOTE(review): this presumably relies on coun_temp holding exactly one row per
# seqid, in the same order as ta1.index.levels[1] - confirm.
# NOTE(review): `.labels` / `labels=` are the older pandas spelling; newer releases
# use `codes` instead - confirm against the installed pandas version.
levels_ta1=[ta1.index.levels[0],ta1.index.levels[1],[coun_temp.len.NGG3.values],ta1.index.levels[2],ta1.index.levels[3],ta1.index.levels[4],ta1.index.levels[5]]
labels_ta1 = [ta1.index.labels[0],ta1.index.labels[1],ta1.index.labels[1],ta1.index.labels[2],ta1.index.labels[3],ta1.index.labels[4],ta1.index.labels[5]]

ta1_index = pd.MultiIndex(levels=levels_ta1,labels=labels_ta1)

In [55]:
# Replace the index of the table with the seven-level index, modifying the table in place.
# ta1 was bound to true_alignments_table by plain assignment, so that name refers to
# the same modified object.
ta1.set_index(ta1_index,inplace=True)

In [59]:
# Show the last 800 rows of the re-indexed table.
ta1.tail(800)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,start
chr,seqid,Unnamed: 2_level_1,Sequence,seq20,strand,chr_p,Unnamed: 7_level_1
chrY,seq55600,4,GACGCAGCAGACCTTTA,GACGCAGCAGACCTTTACGG,0,chrY:22460080-22460103(+),22460084
chrY,seq55600,4,GACGCAGCAGACCTTTA,GACGCAGCAGACCTTTACGG,0,chrY:22466805-22466828(+),22466809
chrY,seq55600,4,GACGCAGCAGACCTTTA,GACGCAGCAGACCTTTACGG,0,chrY:22471164-22471187(+),22471168
chrY,seq55678,4,AGTGCCCAGGAGTGGAA,AGTGCCCAGGAGTGGAATGG,0,chrY:28795346-28795369(+),28795350
chrY,seq55678,4,AGTGCCCAGGAGTGGAA,AGTGCCCAGGAGTGGAATGG,0,chrY:28804733-28804756(+),28804737
chrY,seq55678,4,AGTGCCCAGGAGTGGAA,AGTGCCCAGGAGTGGAATGG,0,chrY:28813322-28813345(+),28813326
chrY,seq55678,4,AGTGCCCAGGAGTGGAA,AGTGCCCAGGAGTGGAATGG,0,chrY:28817158-28817181(+),28817162
chrY,seq55697,4,AGTAAACGGACAAGAGA,AGTAAACGGACAAGAGAAGG,0,chrY:20870976-20870999(+),20870980
chrY,seq55697,4,AGTAAACGGACAAGAGA,AGTAAACGGACAAGAGAGGG,0,chrY:20861151-20861174(+),20861155
chrY,seq55697,4,TCTCTTGTCCGTTTACT,AGTAAACGGACAAGAGAAGG,16,chrY:20773174-20773197(-),20773178


In [60]:
# Name all seven index levels; the inserted level is called 'count'.
ta1_names = ['chr', 'seqid', 'count', 'Sequence', 'seq20', 'strand', 'chr_p']
ta1.index.set_names(ta1_names, inplace=True)

## Below is the final output with the counts

In [61]:
# Display the grouped table with its named index levels, including 'count'.
ta1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,start
chr,seqid,count,Sequence,seq20,strand,chr_p,Unnamed: 7_level_1
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011583-1011606(+),1011587
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011613-1011636(+),1011617
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011643-1011666(+),1011647
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011673-1011696(+),1011677
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011881-1011904(+),1011885
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011911-1011934(+),1011915
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011941-1011964(+),1011945
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1011971-1011994(+),1011975
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1012001-1012024(+),1012005
chr1,seq1002,33,TACGGGGACTCCGTGGG,TACGGGGACTCCGTGGGGGG,0,chr1:1012031-1012054(+),1012035


In [63]:
# Export the grouped table, with its header row and all columns, to an Excel workbook.
grouped_xlsx_path = '/home/user/Desktop/nt_17/true_grouped_17.xlsx'
ta1.to_excel(grouped_xlsx_path, columns=None, header=True)