In [1]:
import pandas as pd, numpy as np, re
from pathlib import Path

In [2]:
# Input/output locations; every file lives under the same analysis directory.
in_path = Path('/d/data/plasmo/nat_out/v4')
genes_file = in_path.joinpath('full_genes.txt')    # gene table read below
snp_file = in_path.joinpath('nat_v3.tsv')          # SNP table read below
out_path = in_path.joinpath('snps_by_genes.tsv')   # final result is written here

In [4]:
# Tab-separated gene table; getLocation() expects columns id, loc, name, symbol.
genes_df = pd.read_csv(genes_file, delimiter='\t')

In [5]:
# Helper that decomposes a gene's location string into chromosome, start and end.
# The row is expected to carry the columns id, loc, name and symbol.
def getLocation(row):
    """Split one gene row's 'loc' field into its components.

    'loc' is read as '<chromosome>:<start>..<end>(<strand>)', where the
    strand is '+' or '-' and the coordinates may contain thousands commas
    and surrounding whitespace.

    Returns an unlabelled pd.Series holding, in order:
    id, chromosome, start (int), end (int), name, symbol.
    """
    span_pattern = '(.+?)[.][.](.+?)[(][+-][)]'

    def as_int(text):
        # Coordinates may look like ' 94,100 ' -> 94100.
        return int(text.strip().replace(',', ''))

    pieces = row['loc'].split(':')
    chrom = pieces[0]
    span = re.search(span_pattern, pieces[1])
    first = as_int(span.group(1))
    last = as_int(span.group(2))

    return pd.Series([row['id'], chrom, first, last, row['name'], row['symbol']])

# Decompose every gene's location, label the resulting columns and keep a
# copy of the formatted table on disk next to the inputs.
genes_mod_df = genes_df.apply(getLocation, axis='columns')
genes_mod_df.columns = ['id', 'chr', 'start', 'stop', 'name', 'symbol']

genes_out_file = in_path.joinpath('formatted_genes.tsv')
genes_mod_df.to_csv(genes_out_file, index=False, sep='\t')


In [6]:
# SNP table: the first column becomes the index and the first row is the header.
snps_df = pd.read_csv(snp_file, sep='\t', header=0, index_col=0)
# Force positions to integers.
snps_df['POS'] = snps_df['POS'].map(int)

In [7]:
# Split snps_df into one table per value of its index (the chromosome key
# used by getChunk) and re-index each table by position, so that a gene's
# SNPs can later be selected with a label slice .loc[start:stop].
#
# - groupby always yields DataFrames, even for a key with a single row
#   (snps_df.loc[key] would collapse that case to a Series).
# - pd.Index(<float64 values>) replaces pd.Float64Index, which was removed
#   in pandas 2.0; on older pandas it still produces a float index.
# - set_index() is called without inplace=True so no slice of snps_df is
#   mutated; the 'POS' column itself is kept because getChunk needs it.
dfs = {
    chrom: chrom_snps.set_index(pd.Index(chrom_snps['POS'].astype('float64')))
    for chrom, chrom_snps in snps_df.groupby(level=0, sort=False)
}

In [8]:
#retrieves the SNP rows that fall inside one gene
def getChunk(row, snps_df):
    """Return the SNPs located within one gene.

    Parameters
    ----------
    row : mapping-like (e.g. one row of genes_mod_df)
        Must provide 'id', 'chr', 'start' and 'stop'.
    snps_df : dict-like
        Maps a chromosome name to a DataFrame that is indexed by position
        and still contains a 'POS' column.

    Returns
    -------
    pd.DataFrame
        The rows whose index label lies in [start, stop] (a .loc label
        slice, so both ends are included), indexed by ('CHROM', 'POS')
        where CHROM is '<gene id>:<chromosome>'. POS keeps the original
        coordinate; it is not renumbered per gene.

    Raises
    ------
    KeyError
        If the gene's chromosome has no entry in snps_df.
    """
    chrom = row['chr']
    # Work on an explicit copy: adding a column to a .loc slice is what
    # triggers pandas' SettingWithCopyWarning, and the copy guarantees the
    # per-chromosome table is never modified.
    chunk = snps_df[chrom].loc[row['start']:row['stop']].copy()
    chunk['CHROM'] = row['id'] + ':' + chrom
    return chunk.set_index(['CHROM', 'POS'])

# One SNP chunk per gene (a Series of DataFrames), then all chunks stacked
# into a single table.
result = genes_mod_df.apply(lambda gene: getChunk(gene, dfs), axis=1)
result_df = pd.concat(list(result))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [9]:
# Move the index levels back into ordinary columns.
# NOTE: not re-runnable -- a second reset_index() would add an extra
# 'index' column, so re-run the previous cell first if this one is repeated.
result_df = result_df.reset_index()
result_df['POS'] = result_df['POS'].map(int)
print(result_df)

                             CHROM    POS PD0459-Cx PD0460-C PD0461-C  \
0        PF3D7_0102100:Pf3D7_01_v3  95231         A        A        A   
1        PF3D7_0102200:Pf3D7_01_v3  98866         G        G        T   
2        PF3D7_0102200:Pf3D7_01_v3  98978         C        C        T   
3        PF3D7_0102200:Pf3D7_01_v3  99226         C        C        C   
4        PF3D7_0102200:Pf3D7_01_v3  99273         A        A        A   
...                            ...    ...       ...      ...      ...   
30707     malmito_rna_24:Pf_M76611    247         T        T        T   
30708     malmito_rna_26:Pf_M76611   1776         T        T        T   
30709   malmito_rna_LSUC:Pf_M76611    204         C        C        C   
30710   malmito_rna_LSUF:Pf_M76611   1572         C        C        C   
30711  malmito_rna_RNA12:Pf_M76611   4913         G        G        G   

      PD0462-C PD0464-C PD0466-C PD0467-C PD0468-C  ... QE0469-C QE0470-C  \
0            A        A        A        A     

In [10]:
# Write the per-gene SNP table as TSV, without the row-number index.
result_df.to_csv(out_path, index=False, sep='\t')