## This notebook will remove misc items from ANR table

In [1]:
import os
import pandas as pd
import statistics as st
import matplotlib.pyplot as plt

### Remove triple mutants
* use the df as the left
* merge all the other ones into this one.

In [2]:
exclude = ['_wt','_sy']

#read in full dataframe, rename column names
cp5_q30 = (pd
           .read_csv('/home/rtu/random/Cindy/ANR/tsv/CP5_q30_lib/main_variants_counts.tsv', sep = '\t')
           .rename(columns = {'Unnamed: 0':'mut','count':'CP5_q30_lib_count'}))

#take out any items that are in exclude
mut = cp5_q30.query('mut not in @exclude')

# add a column called mut2 with the number of mutations
mut['mut2'] = mut['mut'].apply(lambda x: len(x.split(',')))

left = pd.concat([cp5_q30.query('mut in @exclude')
                  ,(mut
                    .query('mut2<3') # look for mutations less than 3
                    .drop(columns = ['mut2'])
                   )
                 ])
left

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,mut,CP5_q30_lib_count
0,_wt,201290
1,c.23A>G (p.Tyr8Cys),1266
2,c.2G>C (p.Gly1Ala),1252
3,c.4G>C (p.Glu2Gln),1188
4,c.3G>A (p.=),1080
...,...,...
5300,"c.1G>A (p.Gly1Arg), c.38C>T (p.Pro13Leu)",1
5301,"c.27C>G (p.=), c.45C>A (p.Tyr15Ter)",1
5304,"c.8T>A (p.Leu3His), c.60C>A (p.Ser20Arg)",1
5306,"c.19G>A (p.Val7Ile), c.63C>G (p.Asp21Glu)",1


### Iterate through paths with `_lib` and extract `main_synonymous_counts.tsv`

In [5]:
os.chdir('/home/rtu/random/Cindy/ANR/191016/tsv')

anrDict = dict()
for i in os.listdir():
    if i.endswith('_lib'):
        os.chdir(f'/home/rtu/random/Cindy/ANR/191016/tsv/{i}')
        df = pd.read_csv('main_variants_counts.tsv',sep = '\t').rename(columns = {'Unnamed: 0':'mut','count':f'{i}_count'})
        anrDict.update({i:df})
        os.chdir('..')

#### Concatenate all the files together.

In [6]:
df = left.copy()
for i in anrDict.values():
    df = pd.merge(left = df,
                  right = i,
                  on = 'mut',
                  how = 'left',
                  )
df

Unnamed: 0,mut,CP5_q30_lib_count,3Dano_lib_count,3dmso_lib_count,4Dano_lib_count,4dmso_lib_count,5Dano_lib_count,5dmso_lib_count
0,_wt,201290,254186.0,370684.0,314164.0,883024.0,1042908.0,1376413.0
1,c.23A>G (p.Tyr8Cys),1266,144.0,253.0,136.0,240.0,275.0,486.0
2,c.2G>C (p.Gly1Ala),1252,50.0,60.0,84.0,91.0,157.0,161.0
3,c.4G>C (p.Glu2Gln),1188,20.0,89.0,25.0,84.0,95.0,90.0
4,c.3G>A (p.=),1080,152.0,263.0,207.0,299.0,327.0,528.0
...,...,...,...,...,...,...,...,...
3794,"c.1G>A (p.Gly1Arg), c.38C>T (p.Pro13Leu)",1,,1.0,,,,
3795,"c.27C>G (p.=), c.45C>A (p.Tyr15Ter)",1,,,,,,
3796,"c.8T>A (p.Leu3His), c.60C>A (p.Ser20Arg)",1,,,,,,
3797,"c.19G>A (p.Val7Ile), c.63C>G (p.Asp21Glu)",1,,,,,,


### Seperate and rename the `mut` column into wildtype, position, and mutation
* wiltype = wt
* position = ps
* mutation = mt
* single mutation = 1
* double mutation = 2
example: c.4G>C (p.Glu2Gln) <br>
strip 'c.', 4 is pos G wt, C is mt, bracket is AA

In [7]:
# split mutation column and get the number of mutations
df['lenMut'] = df['mut'].apply(lambda x: len(x.split(', ')))
df['listMut'] = df['mut'].apply(lambda x: x.replace('p.','').split(', '))

In [8]:
df # see dataframe

Unnamed: 0,mut,CP5_q30_lib_count,3Dano_lib_count,3dmso_lib_count,4Dano_lib_count,4dmso_lib_count,5Dano_lib_count,5dmso_lib_count,lenMut,listMut
0,_wt,201290,254186.0,370684.0,314164.0,883024.0,1042908.0,1376413.0,1,[_wt]
1,c.23A>G (p.Tyr8Cys),1266,144.0,253.0,136.0,240.0,275.0,486.0,1,[c.23A>G (Tyr8Cys)]
2,c.2G>C (p.Gly1Ala),1252,50.0,60.0,84.0,91.0,157.0,161.0,1,[c.2G>C (Gly1Ala)]
3,c.4G>C (p.Glu2Gln),1188,20.0,89.0,25.0,84.0,95.0,90.0,1,[c.4G>C (Glu2Gln)]
4,c.3G>A (p.=),1080,152.0,263.0,207.0,299.0,327.0,528.0,1,[c.3G>A (=)]
...,...,...,...,...,...,...,...,...,...,...
3794,"c.1G>A (p.Gly1Arg), c.38C>T (p.Pro13Leu)",1,,1.0,,,,,2,"[c.1G>A (Gly1Arg), c.38C>T (Pro13Leu)]"
3795,"c.27C>G (p.=), c.45C>A (p.Tyr15Ter)",1,,,,,,,2,"[c.27C>G (=), c.45C>A (Tyr15Ter)]"
3796,"c.8T>A (p.Leu3His), c.60C>A (p.Ser20Arg)",1,,,,,,,2,"[c.8T>A (Leu3His), c.60C>A (Ser20Arg)]"
3797,"c.19G>A (p.Val7Ile), c.63C>G (p.Asp21Glu)",1,,,,,,,2,"[c.19G>A (Val7Ile), c.63C>G (Asp21Glu)]"


In [9]:
# amino acid code plus some
aaCode = {'Ala':'A','Cys':'C','Asp':'D','Glu':'E','Phe':'F',
          'Gly':'G','His':'H','Ile':'I','Lys':'K','Leu':'L',
          'Met':'M','Asn':'N','Pro':'P','Gln':'Q','Arg':'R',
          'Ser':'S','Thr':'T','Val':'V','Trp':'W','Tyr':'Y',
          '_wt':'wt','_sy':'sy','_w':'wt','_s':'sy','Ter':'*'}

#### Write a function for parsing the list of strings, returns a list of tuples
* If it has double mutation, returns a list of tuple with index 0 and 1 corresponding to mutation 1 and 2.

In [18]:
def getStartEndAATuple(x):
    '''
    x is iterable list
    returns list of tuples
    '''
    aList = []
    for i in x:
        
        v = i.split('(')
        
        for value in v:
            value = value.strip(')')
            
            try:
                startVal = value[:3] # get triple AA wt
                endVal = value[-3:]  # get triple AA mut
                posVal = value.strip(startVal).strip(endVal) # get position of mut
                startVal = aaCode[startVal] #translate
                endVal = aaCode[endVal]
            except:
                startVal = 'NaN'
                posVal = 'NaN' #exception for no number
                endVal = 'NaN'


        mytup = (startVal, posVal, endVal)
        aList.append(mytup)

    return(aList)

In [19]:
# Test
df['listMut'].apply(lambda x: getStartEndAATuple(x))

0                             [(wt, , wt)]
1                              [(Y, 8, C)]
2                              [(G, 1, A)]
3                              [(E, 2, Q)]
4                        [(NaN, NaN, NaN)]
                       ...                
3794               [(G, 1, R), (P, 13, L)]
3795         [(NaN, NaN, NaN), (Y, 15, *)]
3796               [(L, 3, H), (S, 20, R)]
3797               [(V, 7, I), (D, 21, E)]
3798    [(NaN, NaN, NaN), (NaN, NaN, NaN)]
Name: listMut, Length: 3799, dtype: object

In [20]:
# Create columns with wt/ps/mt with translation
df['wt1'] = df['listMut'].apply(lambda x: getStartEndAATuple(x)[0][0])
df['ps1'] = df['listMut'].apply(lambda x: getStartEndAATuple(x)[0][1])
df['mt1'] = df['listMut'].apply(lambda x: getStartEndAATuple(x)[0][2])

In [26]:
# write function to skip the mutations that are only single, return empty string
def skipSingleMutations(alist):
    '''
    Takes a list, looks at the second mutation tuple
    '''
    try:
        second = alist[1]
    except:
        second = ('NaN','NaN','NaN')
    return(second)

In [27]:
# Create second set of columns for double mutations
df['wt2'] = df['listMut'].apply(lambda x: skipSingleMutations(getStartEndAATuple(x))[0])
df['ps2'] = df['listMut'].apply(lambda x: skipSingleMutations(getStartEndAATuple(x))[1])
df['mt2'] = df['listMut'].apply(lambda x: skipSingleMutations(getStartEndAATuple(x))[2])

In [29]:
df['ps1'] = df.ps1.replace(to_replace=['',' '],value = 'NaN') # replace the empty string with None

### Add in columns for base pair change

In [30]:
df.head()

Unnamed: 0,mut,CP5_q30_lib_count,3Dano_lib_count,3dmso_lib_count,4Dano_lib_count,4dmso_lib_count,5Dano_lib_count,5dmso_lib_count,lenMut,listMut,wt1,ps1,mt1,wt2,ps2,mt2
0,_wt,201290,254186.0,370684.0,314164.0,883024.0,1042908.0,1376413.0,1,[_wt],wt,,wt,,,
1,c.23A>G (p.Tyr8Cys),1266,144.0,253.0,136.0,240.0,275.0,486.0,1,[c.23A>G (Tyr8Cys)],Y,8.0,C,,,
2,c.2G>C (p.Gly1Ala),1252,50.0,60.0,84.0,91.0,157.0,161.0,1,[c.2G>C (Gly1Ala)],G,1.0,A,,,
3,c.4G>C (p.Glu2Gln),1188,20.0,89.0,25.0,84.0,95.0,90.0,1,[c.4G>C (Glu2Gln)],E,2.0,Q,,,
4,c.3G>A (p.=),1080,152.0,263.0,207.0,299.0,327.0,528.0,1,[c.3G>A (=)],,,,,,


In [54]:
def bp(alist):
    '''
    takes a list, at each index, splits and return wt, pos, mutation a a tuple
    '''
    storageList = list()
    for x in alist:
        v = x.split(' (')[0].strip('c.') # split by parenthesis, remove 'c.'
        if v == '_wt':
            startV,endV,posV = 'wt','wt','NaN'
        else:
            try:
                startV =v[-3]
                endV = v[-1]
                posV =  v.strip(f'{startV}>{endV}')

            except:
                startV,endV,posV = 'NaN','NaN','NaN'
                
        mytup = (startV, posV, endV)
        storageList.append(mytup)
    return(storageList)

In [55]:
df['listMut'].apply(lambda x: bp(x)[0])

0       (wt, NaN, wt)
1          (A, 23, G)
2           (G, 2, C)
3           (G, 4, C)
4           (G, 3, A)
            ...      
3794        (G, 1, A)
3795       (C, 27, G)
3796        (T, 8, A)
3797       (G, 19, A)
3798       (C, 27, A)
Name: listMut, Length: 3799, dtype: object

In [56]:
df['listMut'].apply(lambda x: skipSingleMutations(bp(x)))

0       (NaN, NaN, NaN)
1       (NaN, NaN, NaN)
2       (NaN, NaN, NaN)
3       (NaN, NaN, NaN)
4       (NaN, NaN, NaN)
             ...       
3794         (C, 38, T)
3795         (C, 45, A)
3796         (C, 60, A)
3797         (C, 63, G)
3798         (C, 54, T)
Name: listMut, Length: 3799, dtype: object

In [57]:
# create columns
df['wt1_b'] = df['listMut'].apply(lambda x: bp(x)[0][0]) # wildtype variant, mutation1
df['ps1_b'] = df['listMut'].apply(lambda x: bp(x)[0][1]) # position variant, mutation1
df['mt1_b'] = df['listMut'].apply(lambda x: bp(x)[0][2]) # mutation variant, mutation1

df['wt2_b'] = df['listMut'].apply(lambda x: skipSingleMutations(bp(x))[0]) # mutation2 wildtype base
df['ps2_b'] = df['listMut'].apply(lambda x: skipSingleMutations(bp(x))[1]) # mutation2 position of base
df['mt2_b'] = df['listMut'].apply(lambda x: skipSingleMutations(bp(x))[2]) # mutation2 mutation base

In [59]:
# Final Dataframe
(
    df
    .sort_values(by = 'lenMut',ascending = False)
    .sort_values(by = 'CP5_q30_lib_count',ascending = False)
    .drop(columns = ['lenMut','listMut'])
    .fillna(value = 'NaN')
)

Unnamed: 0,mut,CP5_q30_lib_count,3Dano_lib_count,3dmso_lib_count,4Dano_lib_count,4dmso_lib_count,5Dano_lib_count,5dmso_lib_count,wt1,ps1,mt1,wt2,ps2,mt2,wt1_b,ps1_b,mt1_b,wt2_b,ps2_b,mt2_b
0,_wt,201290,254186,370684,314164,883024,1.04291e+06,1.37641e+06,wt,,wt,,,,wt,,wt,,,
1,c.23A>G (p.Tyr8Cys),1266,144,253,136,240,275,486,Y,8,C,,,,A,23,G,,,
2,c.2G>C (p.Gly1Ala),1252,50,60,84,91,157,161,G,1,A,,,,G,2,C,,,
3,c.4G>C (p.Glu2Gln),1188,20,89,25,84,95,90,E,2,Q,,,,G,4,C,,,
4,c.3G>A (p.=),1080,152,263,207,299,327,528,,,,,,,G,3,A,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3792,"c.1G>T (p.Gly1Trp), c.14A>G (p.Glu5Gly)",1,,,,,,,G,1,W,E,5,G,G,1,T,A,14,G
3793,"c.33C>T (p.=), c.57T>G (p.His19Gln)",1,,,,,,,,,,H,19,Q,C,33,T,T,57,G
3794,"c.1G>A (p.Gly1Arg), c.38C>T (p.Pro13Leu)",1,,1,,,,,G,1,R,P,13,L,G,1,A,C,38,T
3795,"c.27C>G (p.=), c.45C>A (p.Tyr15Ter)",1,,,,,,,,,,Y,15,*,C,27,G,C,45,A


In [60]:
# write to tsv
# Final Dataframe
(
    df
    .sort_values(by = 'lenMut',ascending = False)
    .sort_values(by = 'CP5_q30_lib_count',ascending = False)
    .drop(columns = ['lenMut','listMut'])
    .fillna(value = 'NaN')
    .to_csv('~/random/Cindy/ANR/main_variants_counts_combined.tsv',sep = '\t',header = True, index = False)
)