In [1]:
import pandas as pd, numpy as np, re
from pathlib import Path

In [2]:
in_path = Path('/d/data/plasmo/nat_out/v4')
out_path = in_path / 'utr_by_genes.tsv'
genes_file = in_path / 'full_genes.txt'
snp_file = in_path / 'nat_v3.tsv'

In [3]:
genes_df = pd.read_csv(genes_file, sep='\t')

In [4]:
#Run this section to make the UTR blocks instead.
#there has been a change about how negative strand should be the other way.
#block for checking UTR lengths
def getStart(row):
    p = '(.+?)[.][.](.+?)[(]([+-])[)]'    
    loc_split = row['loc'].split(':')
    position = re.search(p, loc_split[1]) #position is the search object
    gene_start = int(position.group(1).strip().replace(',', ''))
    return gene_start

start_positions = np.sort(genes_df.apply(getStart, axis=1).squeeze().values)

def getEnd(row):
    p = '(.+?)[.][.](.+?)[(]([+-])[)]'    
    loc_split = row['loc'].split(':')
    position = re.search(p, loc_split[1]) #position is the search object
    gene_end = int(position.group(2).strip().replace(',', ''))
    return gene_end

end_positions = np.sort(genes_df.apply(getEnd, axis=1).squeeze().values)


def getUtrLengthPos(start):
    dists = end_positions - start
    negatives = dists[np.where(dists < 0)]
    try:
        res = np.max(negatives, axis=None)
    except:
        res = 0
    return res * -1

def getUtrLengthNeg(end):
    dists = start_positions - end
    positives = dists[np.where(dists > 0)]
    try:
        res = np.min(positives, axis=None)
    except:
        res = end + 2000
    return res

def getLocation(row):
    col_names = ['id', 'loc', 'name', 'symbol']
    p = '(.+?)[.][.](.+?)[(]([+-])[)]'
    utr_max = 500 #include a 500bp UTR from the beginning and the end, if possible
    
    loc_split = row[col_names[1]].split(':')
    chr = loc_split[0]
    position = re.search(p, loc_split[1]) #position is the search object
    gene_start = int(position.group(1).strip().replace(',', ''))
    gene_end = int(position.group(2).strip().replace(',', ''))
    strand = position.group(3).strip()
    
    if strand == '+':
        utr_len = min(utr_max, getUtrLengthPos(gene_start))
        start = max(0, gene_start - utr_len)
        end = gene_start
    elif strand == '-':
        utr_len = min(utr_max, getUtrLengthNeg(gene_end))
        start = gene_end
        end = gene_end + utr_len
        
    return pd.Series([row[col_names[0]], chr, start, end, row[col_names[2]], row[col_names[3]]])

genes_mod_df = genes_df.apply(getLocation, axis=1) #modified version containing decomposed positions
genes_mod_df.columns = ['id', 'chr', 'start', 'stop', 'name', 'symbol']
utr_out_path = in_path / 'formatted_utrs.tsv'
genes_mod_df.to_csv(utr_out_path, sep='\t', index=False)

In [5]:
snps_df = pd.read_csv(snp_file, sep='\t', index_col=0, header=0)
snps_df['POS'] = snps_df['POS'].apply(int)

In [6]:
#reindex the snps_df
dfs = {}
for e in set(snps_df.index):
    tmp = snps_df.loc[e]
    idx = pd.Float64Index(tmp['POS'], dtype='float64')
    tmp.set_index(idx, inplace=True)
    dfs[e] = tmp

In [7]:
#retreives a chunk of the data given the whole thing and one gene
def getChunk(row, snps_df):
    chr = row['chr']
    start = min(row['start'], 0)
    stop = row['stop']
    chunk = snps_df[chr].loc[row['start']:row['stop']]
    chunk['CHROM'] = row['id'] + '_UTR:' + chr
#     chunk['POS'] = range(1, len(list(chunk.index)) + 1) #this is for popnet, but we want to go back to original index
    
    chunk.set_index(['CHROM', 'POS'], inplace=True)
    return chunk

result = genes_mod_df.apply(getChunk, args=(dfs,), axis=1)
result_df = pd.concat(result.tolist())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [8]:
result_df.reset_index(inplace=True)
# # result_df.drop(['index'], axis=1, inplace=True)
result_df['POS'] = result_df['POS'].apply(int)
print(result_df)

                                CHROM     POS PD0459-Cx PD0460-C PD0461-C  \
0       PF3D7_0102100_UTR:Pf3D7_01_v3   94705         T        T        T   
1       PF3D7_0102500_UTR:Pf3D7_01_v3  115936         C        C        C   
2       PF3D7_0102800_UTR:Pf3D7_01_v3  128470         C        C        C   
3       PF3D7_0102800_UTR:Pf3D7_01_v3  128492         G        G        G   
4       PF3D7_0102900_UTR:Pf3D7_01_v3  131236         T        T        T   
...                               ...     ...       ...      ...      ...   
3208  PF3D7_API03600_UTR:Pf3D7_API_v3   10463         G        G        G   
3209  PF3D7_API03600_UTR:Pf3D7_API_v3   10508         T        T        C   
3210  PF3D7_API04000_UTR:Pf3D7_API_v3   13298         A        A        A   
3211         mal_rna_11_UTR:Pf_M76611    2175         T        T        T   
3212          mal_rna_9_UTR:Pf_M76611    1692         G        G        G   

     PD0462-C PD0464-C PD0466-C PD0467-C PD0468-C  ... QE0469-C QE0470-C  \

In [9]:
result_df.to_csv(out_path, sep='\t', index=False)