In [1]:
import pandas as pd, numpy as np
from statistics import mode, StatisticsError
from collections import defaultdict
from operator import itemgetter

In [2]:
train_data = pd.read_csv("train.csv")
train_update = pd.read_csv("train_updates_20220929.csv")
nrow_update = train_update.shape[0]
nrow_train = train_data.shape[0]
update_proseq = train_update.iloc[:,1]

# By judging whether the protein sequence is NaN, find the rows to be deleted
row_ind_delete = list(train_update.iloc[list(update_proseq!=update_proseq),0])

# Find the rows that are to be modified
row_ind_replace = list(train_update.iloc[list(update_proseq==update_proseq),0])

# Now construct a list of rows to substitude the target rows
substitution = train_update.iloc[list(update_proseq==update_proseq),:]
substitution = [list(substitution.iloc[i,:]) for i in range(substitution.shape[0])]

# Now find the rows that should be kept, instead be being deleted
row_ind_remain = list(range(nrow_train))
row_ind_remain = list(set(row_ind_remain)-set(row_ind_delete))

# Finally, acquire the modified train data set
modified_train_data = train_data
for row in substitution:
    row_ind = row[0]
    modified_train_data.iloc[row_ind,:] = row
modified_train_data = modified_train_data.iloc[row_ind_remain,:]



In [4]:
train = modified_train_data
print('Train shape:', train.shape )
## Visually show the data is fixed. Interesting that there are still the same number of unique data sources after the "bad data" was removed.
print(train['pH'].min(), train['pH'].max(), train['data_source'].nunique())
train.head()

Train shape: (28981, 5)
1.99 11.0 324


Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5


In [5]:
train['x'] = train.protein_sequence.str.len()
vc = train.x.value_counts()
vc.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['x'] = train.protein_sequence.str.len()


164    748
231    318
455    245
155    243
148    241
Name: x, dtype: int64

In [6]:
# INSERTION DELETION THRESHOLD
D_THRESHOLD = 1
# MIN GROUP SIZE
MIN_GROUP_SIZE = 5

def max_item_count(seq):
    d = defaultdict(int)
    for item in seq:
        d[item] += 1
    return max(d.items(), key=itemgetter(1))

def get_wildtype(proteins, is_retry=False):
    if not is_retry:
        ## try to get the mode, the simpler algorithm
        wildtype = []
        try:
            for i in range(len(proteins.iloc[0])):
                wildtype.append(mode([p[i] for p in proteins]))
            return ''.join(wildtype)
        except StatisticsError:
            pass
    ## Either failed mode above, or this is a retry because the resulting wildtype didn't actually fit enough proteins
    ##
    ## Two sequences with single mutation from the same wildtype are no more than 2 points different.
    ## Therefore, at least 1/3rd length consecutive string must match. Find max counts of starts, middles, and ends
    ## This technically isn't a guaranteed or precise algorithm, but it is fast and effective,
    ##   based on comparison with more precise grouping methods.
    k = len(proteins.iloc[0])//3
    starts = [p[:k] for p in proteins]
    middles = [p[k:2*k] for p in proteins]
    ends = [p[-k:] for p in proteins]
    ## get the most common substring, and the count of that substring
    start = max_item_count(starts)
    middle = max_item_count(middles)
    end = max_item_count(ends)
    ## reduce the proteins to the ones that match the most common substring
    if (start[1] >= middle[1]) and (start[1] >= end[1]) and (start[1] >= MIN_GROUP_SIZE):
        proteins = [p for p in proteins if p[:k] == start[0]]
        assert(start[1] == len(proteins))
    elif (middle[1] >= end[1]) and (middle[1] >= MIN_GROUP_SIZE):
        proteins = [p for p in proteins if p[k:2*k] == middle[0]]
        assert(middle[1] == len(proteins))
    elif end[1] >= MIN_GROUP_SIZE:
        proteins = [p for p in proteins if p[-k:] == end[0]]
        assert(end[1] == len(proteins))
    else:
        return ''
    ## use the reduced list to find the entire wildtype
    wildtype = []
    try:
        for i in range(len(proteins[0])):
            wildtype.append(mode([p[i] for p in proteins]))
        return ''.join(wildtype)
    except StatisticsError:
        return ""

In [7]:
train['group'] = -1
train['wildtype'] = ''
grp = 0

for k in range(len(vc)):
    if vc.iloc[k] < MIN_GROUP_SIZE:
        break
    c = vc.index[k]
    #print(f'rows={vc.iloc[k]}, k:{k}, protein length:{c}')
    is_retry = False
    # SUBSET OF TRAIN DATA WITH SAME PROTEIN LENGTH (not enough deletions to matter for step 1, finding the wildtype)
    tmp = train.loc[(train.x==c)&(train.group==-1)]

    ## It is possible that the same length protein string might have multiple wildtypes in the raw data, keep searching until we've found all of them
    while len(tmp) >= MIN_GROUP_SIZE:
        if len(tmp)<=1: break
        # Ignore Levenstein distance, which is overkill
        # Directly attempt to find wildtype
        # Drop duplicates for wildtype guesstimation
        proteins = tmp.protein_sequence.drop_duplicates()

        # Create most likely wildtype
        wildtype = get_wildtype(proteins, is_retry=is_retry)
        if wildtype == '':
            break

        # SUBSET OF TRAIN DATA WITH SAME PROTEIN LENGTH PLUS MINUS D_THRESHOLD
        tmp = train.loc[(train.x>=c-D_THRESHOLD)&(train.x<=c+D_THRESHOLD)&(train.group==-1)]
        for idx in tmp.index:
            p = train.loc[idx, 'protein_sequence']
            half = c//2
            ## Use fast method to guess that it is only a single point mutation away. Later we double check and actually count number of mutations.
            if (wildtype[:half] == p[:half]) or (wildtype[-half:] == p[-half:]):
                train.loc[idx,'group'] = grp
                train.loc[idx,'wildtype'] = wildtype
        if len(train.loc[train.group==grp]) >= MIN_GROUP_SIZE:
            #print(f"{train.loc[(train.group==grp)].shape[0]}: Group {grp} results")
            grp += 1
            is_retry = False
        else:
            train.loc[idx,'group'] = -1
            train.loc[idx,'wildtype'] = ''
            ## to avoid an infinite loop, break out if we've already failed last time
            if is_retry:
                break
            is_retry = True

        # Get ready for next loop
        tmp = train.loc[(train.x==c)&(train.group==-1)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['group'] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['wildtype'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [8]:
def argsort(seq, reverse=False):
    # http://stackoverflow.com/questions/3071415/efficient-method-to-calculate-the-rank-vector-of-a-list-in-python
    return sorted(range(len(seq)), key=seq.__getitem__, reverse=reverse)

groups = [0] * grp
for k in range(grp):
    groups[k] = len(train.loc[train.group==k])

groupCount = 0
rowCount = 0
for k in argsort(groups, reverse=True):
    if train.loc[train.group==k].shape[0] == 0:
        continue
    proteins = train.loc[train.group==k, "protein_sequence"]
    wildtype = train.loc[train.group==k, "wildtype"].values[0]

    ## no insertions in the dataset, that I've found.
    ## Handle deletions by adding a '-' symbol in the correct place
    for i in range(len(proteins)):
        if len(proteins.iloc[i]) < len(proteins.iloc[0]):
            if proteins.iloc[i] == wildtype[:-1]:
                proteins.iloc[i] = proteins.iloc[i] + "-"
            else:
                for j in range(len(proteins.iloc[i])):
                    if proteins.iloc[i][j] != wildtype[j]:
                        proteins.iloc[i] = proteins.iloc[i][:j-1] + "-" + proteins.iloc[i][j:]
                        break
        assert(len(proteins.iloc[i]) == len(proteins.iloc[0]))

    ## In very rare cases, the simplified logic to group proteins will group a protein that is NOT a single mutation away from the wildtype.
    ## Ungroup those proteins.
    ungroup = []
    for p in proteins:
        mut = 0
        for j in range(len(wildtype)):
            if p[j] != wildtype[j]:
                mut += 1
        if mut > 1:
            if p not in ungroup:
                ungroup.append(p)
    for p in ungroup:
        train.loc[train.protein_sequence==p, 'group'] = -1
        train.loc[train.protein_sequence==p, 'wildtype'] = ''
    ## Remove entire group if it is now smaller than the min group size
    if train.loc[train.group==k].shape[0] < MIN_GROUP_SIZE:
        train.loc[train.group==k, 'wildtype'] = ''
        train.loc[train.group==k, 'group'] = -1
        continue

    ## Print a line for every group, and a bunch of stats for the first few groups
    #print(f'{k}: {train.loc[train.group==k].shape[0]}')
    #if groupCount < 5:
        #display( train.loc[train.group==k] )
        #for c in train.columns:
            #print(c, train.loc[train.group==k, c].nunique() + train.loc[train.group==k, c].isnull().values.any())
        #print(wildtype)
        #print("")
    groupCount += 1
    rowCount += train.loc[train.group==k].shape[0]

print(groupCount, rowCount)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

76 4171


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [10]:
## Re-number groups from largest to smallest
groups = [0] * grp
for k in range(grp):
    groups[k] = len(train.loc[train.group==k])

n = 10000
for k in argsort(groups, reverse=True):
    train.loc[train.group==k, "group"] = n
    n += 1

train.loc[train.group>=10000, "group"] = train.loc[train.group>=10000, "group"] - 10000
train.loc[train.group==-1, "group"] = 1000
train = train.sort_values(axis=0, by=['group'], kind='mergesort').reset_index(drop=True)
train.loc[train.group==1000, "group"] = -1

# train = train.drop('x',axis=1)
train_wildtype_groups = train.loc[train.wildtype != '']
train_no_wildtype = train.loc[train.wildtype == '']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [11]:
train_wildtype_groups.to_csv('train_wildtype_groups.csv',index=False)
train_wildtype_groups.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm,x,group,wildtype
0,18020,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,2.0,10.1021/bi00535a054,38.1,164,0,MNIFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
1,18021,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,4.2,,53.3,164,0,MNIFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
2,18022,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,2.0,10.1038/334406a0,38.1,164,0,MNIFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
3,18023,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,6.5,10.1038/334406a0,62.9,164,0,MNIFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
4,18060,MNCFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,2.0,,41.9,164,0,MNIFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...


In [17]:
# To see wether there are sequences of different lengths in one group
data = pd.read_csv('train_wildtype_groups.csv')
data["protein_length"] = data.protein_sequence.str.len()
groups = data.group

for group_ind in groups.unique():
    one_group = np.array(data.protein_length)[groups==group_ind]
    
    lengths = list(set(one_group))
    if len(lengths) >= 2:
        print(group_ind)
        print([(list(one_group).count(i),i) for i in lengths])

4
[(1, 154), (179, 155)]
23
[(1, 267), (52, 268)]
36
[(1, 158), (27, 159)]
