# Overview
This is an exploratory search for cis-eQTLs of ESR1 in liver tissue, using data from "A resource for integrated genomic analysis of the human liver" (Zhou et al., 2022). Data came from: https://github.com/zhouLabNCSU/Liver_project_resources



In [10]:
import pandas as pd 
import os

# Peek at the first ten lines of the file to see whether it has a header row.
with open("UNC_cis-eQTL_zhou_2022.gff", "r") as gff_file:
    preview = [gff_file.readline().strip() for _ in range(10)]
    print("\n".join(preview))
# Result: only "##" pragmas and a "# file:" comment come before the records,
# so the file has no column names. The standard GFF column names are used.


##gff-version 3
##Note: See http://song.sourceforge.net

# file: UNC_cis-eQTL.gff
chr1	UNC_eQTL	cis_eQTL_segment	1104153	1172777	-1	+	.	ID=237973_UNC_eQTL;Name=hsa-mir-6723
chr1	UNC_eQTL	Gene_body	566454	567996	-1	+	.	Parent=237973_UNC_eQTL
chr1	UNC_eQTL	UNC_cis_eQTL	1104154	1104154	4.306	+	.	Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=rs111420185;Alias=rs111420185
chr1	UNC_eQTL	UNC_cis_eQTL	1119426	1119426	3.67	+	.	Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=rs74586415;Alias=rs74586415
chr1	UNC_eQTL	UNC_cis_eQTL	1119636	1119636	3.668	+	.	Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=rs75418966;Alias=rs75418966
chr1	UNC_eQTL	UNC_cis_eQTL	1171379	1171379	3.735	+	.	Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=rs115111053;Alias=rs115111053


In [18]:

# Read the GFF as a headerless, tab-separated table. comment='#' makes pandas
# ignore everything from a '#' to the end of a line, which removes the pragma
# and comment lines at the top of the file.
data_path = os.path.join(os.getcwd(), "UNC_cis-eQTL_zhou_2022.gff")
df = pd.read_csv(data_path, header=None, comment='#', sep='\t')

# The file carries no header, so name the nine GFF columns by hand.
df.columns = [
    "seqid",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]
df.shape


(732462, 9)

In [25]:
#looking for gene names now: print the raw attributes string of the first four rows
# (df keeps the default 0..n-1 index from read_csv, so head(4) is rows 0-3)
for attributes in df['attributes'].head(4):
    print(attributes)

ID=237973_UNC_eQTL;Name=hsa-mir-6723
Parent=237973_UNC_eQTL
Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=rs111420185;Alias=rs111420185
Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=rs74586415;Alias=rs74586415


In [21]:
# Preview the first five rows with the assigned column names.
df.head(n=5)

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,chr1,UNC_eQTL,cis_eQTL_segment,1104153,1172777,-1.0,+,.,ID=237973_UNC_eQTL;Name=hsa-mir-6723
1,chr1,UNC_eQTL,Gene_body,566454,567996,-1.0,+,.,Parent=237973_UNC_eQTL
2,chr1,UNC_eQTL,UNC_cis_eQTL,1104154,1104154,4.306,+,.,Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=...
3,chr1,UNC_eQTL,UNC_cis_eQTL,1119426,1119426,3.67,+,.,Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=...
4,chr1,UNC_eQTL,UNC_cis_eQTL,1119636,1119636,3.668,+,.,Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=...


In [None]:
#We need to extract the eQTL gene names and rsIDs from the attributes column, which packs five fields (ID, Parent, Name, Note, Alias) into one string.
import re

# Attribute tags pulled out of the packed column, in the order they are returned.
_ATTRIBUTE_TAGS = ("ID", "Parent", "Name", "Note", "Alias")

# One pre-compiled pattern per tag. Each pattern is anchored to the start of
# the string or to a ';' separator so that a tag only matches as a whole tag
# (e.g. "ID=" is not found inside "geneID=").
_ATTRIBUTE_PATTERNS = tuple(
    re.compile(r"(?:^|;)\s*" + tag + r"=([^;]+)") for tag in _ATTRIBUTE_TAGS
)


def extract_attribute(string):
    '''
    Splits one GFF attributes string into its ID, Parent, Name, Note and Alias values.

    parameters:
    string: str
        The GFF attribute column as a string, e.g.
        "Parent=237973_UNC_eQTL;Name=hsa-mir-6723;Note=rs111420185;Alias=rs111420185".
        Anything that is not a str (such as the NaN pandas uses for an empty
        cell) is treated as having no attributes.

    returns:
    pd.Series
        A pandas series of five values in the order id, parent, name, note,
        alias. A tag that is absent, or present with an empty value, yields
        None. The series keeps the default 0..4 index.

    notes:
    Only the first occurrence of each tag is used.
    Returning a Series per row is convenient with Series.apply but is slow on
    very large frames; it was acceptable for the ~730,000 rows loaded here.
    '''
    if not isinstance(string, str):
        # Empty attribute cells arrive as NaN (a float); report every tag as missing.
        return pd.Series([None] * len(_ATTRIBUTE_PATTERNS))

    matches = (pattern.search(string) for pattern in _ATTRIBUTE_PATTERNS)
    return pd.Series([found.group(1) if found else None for found in matches])

# Unpack the attributes string of every row into five new columns.
df[['id','parent','name','note','alias']] = df['attributes'].apply(extract_attribute)

# Confirm the new columns exist and look sensible on the first rows.
print(df.columns)
print(df.head(3))


In [None]:
#now we filter for eQTLs of ESR1 and known ESR1 downstream genes

