## This notebook will remove misc items from ANR table

In [1]:
import os
import pandas as pd
import statistics as st
import matplotlib.pyplot as plt

### Remove triple mutants
* use the df as the left
* merge all the other ones into this one.

In [2]:
exclude = ['_wt','_sy']

#read in full dataframe, rename column names
cp5_q30 = (pd
           .read_csv('/home/rtu/random/Cindy/ANR/tsv/CP5_q30_lib/main_synonymous_counts.tsv', sep = '\t')
           .rename(columns = {'Unnamed: 0':'mut','count':'CP5_q30_lib_count'}))

#take out any items that are in exclude
mut = cp5_q30.query('mut not in @exclude')

# add a column called mut2 with the number of mutations
mut['mut2'] = mut['mut'].apply(lambda x: len(x.split(',')))

left = pd.concat([cp5_q30.query('mut in @exclude')
                  ,(mut
                    .query('mut2<3') # look for mutations less than 3
                    .drop(columns = ['mut2'])
                   )
                 ])
left

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,mut,CP5_q30_lib_count
0,_wt,201290
1,_sy,23820
2,p.Asp4Glu,1553
3,p.His19Gln,1385
4,p.Tyr8Cys,1296
...,...,...
2930,"p.Glu2Asp, p.Tyr8Cys",1
2931,"p.Leu3Arg, p.Leu10Pro",1
2932,"p.Leu3Phe, p.Leu10His",1
2933,"p.Gly1Arg, p.Asp4Glu",1


### Iterate through paths with `_lib` and extract `main_synonymous_counts.tsv`

In [3]:
os.chdir('/home/rtu/random/Cindy/ANR/191016/tsv')

anrDict = dict()
for i in os.listdir():
    if i.endswith('_lib'):
        os.chdir(f'/home/rtu/random/Cindy/ANR/191016/tsv/{i}')
        df = pd.read_csv('main_synonymous_counts.tsv',sep = '\t').rename(columns = {'Unnamed: 0':'mut','count':f'{i}_count'})
        anrDict.update({i:df})
        os.chdir('..')

#### Concatenate all the files together.

In [4]:
df = left.copy()
for i in anrDict.values():
    df = pd.merge(left = df,
                  right = i,
                  on = 'mut',
                  how = 'left',
                  )
df

Unnamed: 0,mut,CP5_q30_lib_count,3Dano_lib_count,3dmso_lib_count,4Dano_lib_count,4dmso_lib_count,5Dano_lib_count,5dmso_lib_count
0,_wt,201290,254186.0,370684.0,314164.0,883024.0,1042908.0,1376413.0
1,_sy,23820,9633.0,18269.0,13088.0,23297.0,11719.0,20164.0
2,p.Asp4Glu,1553,1027.0,6795.0,2424.0,1903.0,17752.0,1227.0
3,p.His19Gln,1385,68.0,88.0,74.0,207.0,251.0,306.0
4,p.Tyr8Cys,1296,148.0,262.0,137.0,244.0,279.0,492.0
...,...,...,...,...,...,...,...,...
2271,"p.Glu2Asp, p.Tyr8Cys",1,,,,,,
2272,"p.Leu3Arg, p.Leu10Pro",1,,,,,,
2273,"p.Leu3Phe, p.Leu10His",1,,,,,,
2274,"p.Gly1Arg, p.Asp4Glu",1,,2.0,,,10.0,2.0


### Seperate and rename the `mut` column into wildtype, position, and mutation
* wiltype = wt
* position = ps
* mutation = mt
* single mutation = 1
* double mutation = 2

In [5]:
# split mutation column and get the number of mutations
df['lenMut'] = df['mut'].apply(lambda x: len(x.split(',')))
df['listMut'] = df['mut'].apply(lambda x: x.replace('p.','').split(', '))

In [7]:
df # see dataframe

Unnamed: 0,mut,CP5_q30_lib_count,3Dano_lib_count,3dmso_lib_count,4Dano_lib_count,4dmso_lib_count,5Dano_lib_count,5dmso_lib_count,lenMut,listMut
0,_wt,201290,254186.0,370684.0,314164.0,883024.0,1042908.0,1376413.0,1,[_wt]
1,_sy,23820,9633.0,18269.0,13088.0,23297.0,11719.0,20164.0,1,[_sy]
2,p.Asp4Glu,1553,1027.0,6795.0,2424.0,1903.0,17752.0,1227.0,1,[Asp4Glu]
3,p.His19Gln,1385,68.0,88.0,74.0,207.0,251.0,306.0,1,[His19Gln]
4,p.Tyr8Cys,1296,148.0,262.0,137.0,244.0,279.0,492.0,1,[Tyr8Cys]
...,...,...,...,...,...,...,...,...,...,...
2271,"p.Glu2Asp, p.Tyr8Cys",1,,,,,,,2,"[Glu2Asp, Tyr8Cys]"
2272,"p.Leu3Arg, p.Leu10Pro",1,,,,,,,2,"[Leu3Arg, Leu10Pro]"
2273,"p.Leu3Phe, p.Leu10His",1,,,,,,,2,"[Leu3Phe, Leu10His]"
2274,"p.Gly1Arg, p.Asp4Glu",1,,2.0,,,10.0,2.0,2,"[Gly1Arg, Asp4Glu]"


In [10]:
# amino acid code plus some
aaCode = {'Ala':'A','Cys':'C','Asp':'D','Glu':'E','Phe':'F',
          'Gly':'G','His':'H','Ile':'I','Lys':'K','Leu':'L',
          'Met':'M','Asn':'N','Pro':'P','Gln':'Q','Arg':'R',
          'Ser':'S','Thr':'T','Val':'V','Trp':'W','Tyr':'Y',
          '_wt':'wt','_sy':'sy','_w':'wt','_s':'sy','Ter':'*'}

#### Write a function for parsing the list of strings, returns a list of tuples
* If it has double mutation, returns a list of tuple with index 0 and 1 corresponding to mutation 1 and 2.

In [22]:
def getStartEndAATuple(x):
    '''
    x is iterable list
    returns list of tuples
    '''
    aList = []
    for i in x:
        startVal = i[:3] # get triple AA wt
        endVal = i[-3:]  # get triple AA mut
        try:
            posVal = i.strip(startVal).strip(endVal) # get position of mut
        except:
            posVal = None #exception for no number
        startVal = aaCode[startVal] #translate
        endVal = aaCode[endVal]

        mytup = (startVal, posVal, endVal)
        aList.append(mytup)

    return(aList)

In [23]:
# Test
df['listMut'].apply(lambda x: getStartEndAATuple(x))

0                  [(wt, , wt)]
1                  [(sy, , sy)]
2                   [(D, 4, E)]
3                  [(H, 19, Q)]
4                   [(Y, 8, C)]
                 ...           
2271     [(E, 2, D), (Y, 8, C)]
2272    [(L, 3, R), (L, 10, P)]
2273    [(L, 3, F), (L, 10, H)]
2274     [(G, 1, R), (D, 4, E)]
2275     [(E, 2, D), (L, 6, H)]
Name: listMut, Length: 2276, dtype: object

In [32]:
# Create columns with wt/ps/mt with translation
df['wt1'] = df['listMut'].apply(lambda x: getStartEndAATuple(x)[0][0])
df['ps1'] = df['listMut'].apply(lambda x: getStartEndAATuple(x)[0][1])
df['mt1'] = df['listMut'].apply(lambda x: getStartEndAATuple(x)[0][2])

In [47]:
# write function to skip the mutations that are only single, return empty string
def skipSingleMutations(alist):
    '''
    Takes a list, looks at the second mutation tuple
    '''
    try:
        second = alist[1]
    except:
        second = (None,None,None)
    return(second)

In [48]:
# Create second set of columns for double mutations
df['wt2'] = df['listMut'].apply(lambda x: skipSingleMutations(getStartEndAATuple(x))[0])
df['ps2'] = df['listMut'].apply(lambda x: skipSingleMutations(getStartEndAATuple(x))[1])
df['mt2'] = df['listMut'].apply(lambda x: skipSingleMutations(getStartEndAATuple(x))[2])

In [56]:
df['ps1'] = df.ps1.replace(to_replace=['',' '],value = 'NaN') # replace the empty string with None

In [57]:
# Final Dataframe
(
    df
    .sort_values(by = 'lenMut',ascending = False)
    .sort_values(by = 'CP5_q30_lib_count',ascending = False)
    .drop(columns = ['lenMut','listMut'])
    .fillna(value = 'NaN')
)

Unnamed: 0,mut,CP5_q30_lib_count,3Dano_lib_count,3dmso_lib_count,4Dano_lib_count,4dmso_lib_count,5Dano_lib_count,5dmso_lib_count,wt1,ps1,mt1,wt2,ps2,mt2
0,_wt,201290,254186,370684,314164,883024,1.04291e+06,1.37641e+06,wt,,wt,,,
1,_sy,23820,9633,18269,13088,23297,11719,20164,sy,,sy,,,
2,p.Asp4Glu,1553,1027,6795,2424,1903,17752,1227,D,4,E,,,
3,p.His19Gln,1385,68,88,74,207,251,306,H,19,Q,,,
4,p.Tyr8Cys,1296,148,262,137,244,279,492,Y,8,C,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,"p.Glu5Gly, p.Leu6Arg",1,,,,,,,E,5,G,L,6,R
1974,"p.Leu3His, p.Ile18Val",1,,1,,,,1,L,3,H,I,18,V
1973,"p.Leu6Arg, p.Ser20Asn",1,,,,,,,L,6,R,S,20,N
1972,"p.Tyr15Cys, p.Asp21Ala",1,1,,,,,,Y,15,C,D,21,A


In [58]:
# write to tsv
# Final Dataframe
(
    df
    .sort_values(by = 'lenMut',ascending = False)
    .sort_values(by = 'CP5_q30_lib_count',ascending = False)
    .drop(columns = ['lenMut','listMut'])
    .fillna(value = 'NaN')
    .to_csv('~/random/Cindy/ANR/main_synonymous_counts_combined.tsv',sep = '\t',header = True, index = False)
)