In [2]:
from pycirclize import Circos
from pycirclize.utils import ColorCycler, load_eukaryote_example_dataset
import numpy as np
np.random.seed(0)

# For Gene Amplifications

In [3]:
import pandas as pd

file_path = './amp.csv'

# Read only the peak location, significance and annotation columns.
df = pd.read_csv(
    file_path,
    usecols=['Chromosome', 'Wide peak start', 'Wide peak end', 'q values', 'Description'],
)

# Significance score: -log10 of the q value.
df["negLogQ"] = -np.log10(df["q values"])

def extract_numeric_part(chromosome):
    """Return an integer sort key for a chromosome name such as ``'chr7'``.

    The first three characters (the ``chr`` prefix) are discarded and the
    remainder is converted to an integer. The sex chromosomes have no numeric
    suffix, so ``X`` and ``Y`` are ranked after the autosomes as 23 and 24
    instead of raising.

    Parameters
    ----------
    chromosome : str
        Chromosome name with a three-character prefix, e.g. ``'chr12'``.

    Returns
    -------
    int
        Numeric key suitable for sorting chromosomes.

    Raises
    ------
    ValueError
        If the suffix is neither an integer nor ``X``/``Y``.
    """
    label = chromosome[3:]
    sex_chromosome_rank = {"X": 23, "Y": 24}
    rank = sex_chromosome_rank.get(label.upper())
    if rank is not None:
        return rank
    return int(label)

# Attach a numeric chromosome key, order the peaks by it, and switch to the
# shorter Chr/Start/End column names used by the rest of the notebook.
df = (
    df.assign(Chromosome_num=df['Chromosome'].apply(extract_numeric_part))
    .sort_values(by='Chromosome_num')
    .rename(columns={'Chromosome': 'Chr',
                     'Wide peak start': 'Start',
                     'Wide peak end': 'End'})
)

print(df["Chr"].value_counts())
df

Chr
chr1     8
chr19    7
chr12    7
chr8     6
chr2     5
chr17    4
chr3     3
chr20    3
chr6     3
chr18    3
chr14    2
chr11    2
chr10    2
chr7     2
chr5     2
chr4     2
chr15    1
chr22    1
Name: count, dtype: int64


Unnamed: 0,Chr,Start,End,q values,Description,negLogQ,Chromosome_num
0,chr1,39109153,41523062,1.280000e-21,1p34.2(MYCL1),20.892790,1
1,chr1,148646436,149486905,5.190000e-18,1q21.2(MCL1),17.284833,1
2,chr1,232526850,233617929,8.670000e-07,1q42.3(IRF2BP2),6.061981,1
3,chr1,154088696,154329131,3.880000e-09,---,8.411168,1
4,chr1,36338557,36480225,2.120000e-11,1p34.3,10.673664,1
...,...,...,...,...,...,...,...
58,chr19,20823633,21917935,3.310000e-05,---,4.480172,19
61,chr20,45338887,45497265,4.886900e-04,ZMYND8,3.310967,20
59,chr20,61411961,62435964,9.720000e-06,20q13.33,5.012334,20
60,chr20,29703246,29808748,3.474400e-04,BCL2L1,3.459120,20


In [5]:
def check_chromosome_presence(df):
    """Print, for chr1-chr22, chrX and chrY, whether ``df['Chr']`` contains it.

    One line is printed per chromosome name, in that fixed order.
    """
    expected_names = [f"chr{number}" for number in range(1, 23)] + ["chrX", "chrY"]

    # Pull the column values once; membership is tested against this array.
    observed = df['Chr'].values
    for name in expected_names:
        if name not in observed:
            print(f"No row with chromosome {name} is present.")
            continue
        print(f"At least one row with chromosome {name} is present.")

# Report which of the expected chromosome names occur in the amplification table.
check_chromosome_presence(df)

At least one row with chromosome chr1 is present.
At least one row with chromosome chr2 is present.
At least one row with chromosome chr3 is present.
At least one row with chromosome chr4 is present.
At least one row with chromosome chr5 is present.
At least one row with chromosome chr6 is present.
At least one row with chromosome chr7 is present.
At least one row with chromosome chr8 is present.
No row with chromosome chr9 is present.
At least one row with chromosome chr10 is present.
At least one row with chromosome chr11 is present.
At least one row with chromosome chr12 is present.
No row with chromosome chr13 is present.
At least one row with chromosome chr14 is present.
At least one row with chromosome chr15 is present.
No row with chromosome chr16 is present.
At least one row with chromosome chr17 is present.
At least one row with chromosome chr18 is present.
At least one row with chromosome chr19 is present.
At least one row with chromosome chr20 is present.
No row with chromos

In [6]:
# Save the chromosome-sorted amplification table; index=False omits the row index.
output_file_path = './amp_sorted.csv'
df.to_csv(output_file_path, index=False)

# For Gene Deletions

In [8]:

file_path = './del.csv'

# Load the deletion peaks, add the -log10(q) score and a numeric chromosome
# key, shorten the column names, then order the rows by chromosome number.
del_df = (
    pd.read_csv(
        file_path,
        usecols=['Chromosome', 'Wide peak start', 'Wide peak end', 'q values', 'Description'],
    )
    .assign(
        negLogQ=lambda frame: -np.log10(frame["q values"]),
        Chromosome_num=lambda frame: frame['Chromosome'].apply(extract_numeric_part),
    )
    .rename(columns={'Chromosome': 'Chr',
                     'Wide peak start': 'Start',
                     'Wide peak end': 'End'})
    .sort_values(by='Chromosome_num')
)
print(del_df["Chr"].value_counts())
del_df.head()

Chr
chr5     5
chr1     4
chr19    4
chr16    4
chr3     3
chr6     3
chr17    3
chr9     3
chr13    2
chr21    2
chr15    2
chr11    2
chr2     2
chr10    2
chr8     2
chr4     2
chr12    1
chr14    1
chr18    1
chr7     1
chr22    1
Name: count, dtype: int64


Unnamed: 0,Chr,Start,End,q values,Description,negLogQ,Chromosome_num
0,chr1,16666761,28781645,1.38e-18,1p36.11,17.860121,1
1,chr1,1,23549970,7.78e-11,---,10.10902,1
2,chr1,56202184,118877972,0.10736,---,0.969157,1
3,chr1,204477622,220287854,0.17853,---,0.748289,1
4,chr2,234758443,242951149,8.74e-07,2q37.3,6.058489,2


In [9]:
# Report which of the expected chromosome names occur in the deletion table.
check_chromosome_presence(del_df)

At least one row with chromosome chr1 is present.
At least one row with chromosome chr2 is present.
At least one row with chromosome chr3 is present.
At least one row with chromosome chr4 is present.
At least one row with chromosome chr5 is present.
At least one row with chromosome chr6 is present.
At least one row with chromosome chr7 is present.
At least one row with chromosome chr8 is present.
At least one row with chromosome chr9 is present.
At least one row with chromosome chr10 is present.
At least one row with chromosome chr11 is present.
At least one row with chromosome chr12 is present.
At least one row with chromosome chr13 is present.
At least one row with chromosome chr14 is present.
At least one row with chromosome chr15 is present.
At least one row with chromosome chr16 is present.
At least one row with chromosome chr17 is present.
At least one row with chromosome chr18 is present.
At least one row with chromosome chr19 is present.
No row with chromosome chr20 is present.

In [10]:
# Save the chromosome-sorted deletion table; index=False omits the row index.
output_file_path = './del_sorted.csv'
del_df.to_csv(output_file_path, index=False)

# Create GFF files for arrow circos plot with gene description

In [11]:
# Label every peak with its alteration type, then stack the two tables
# into a single frame with a fresh 0..n-1 index.
df["type"], del_df["type"] = "amp", "del"
frames = [df, del_df]
result = pd.concat(frames, ignore_index=True)
result


Unnamed: 0,Chr,Start,End,q values,Description,negLogQ,Chromosome_num,type
0,chr1,39109153,41523062,1.280000e-21,1p34.2(MYCL1),20.892790,1,amp
1,chr1,148646436,149486905,5.190000e-18,1q21.2(MCL1),17.284833,1,amp
2,chr1,232526850,233617929,8.670000e-07,1q42.3(IRF2BP2),6.061981,1,amp
3,chr1,154088696,154329131,3.880000e-09,---,8.411168,1,amp
4,chr1,36338557,36480225,2.120000e-11,1p34.3,10.673664,1,amp
...,...,...,...,...,...,...,...,...
108,chr19,1,306931,3.030000e-56,19p13.3,55.518557,19,del
109,chr19,45712960,55025072,4.274900e-02,---,1.369074,19,del
110,chr21,41501909,46944323,4.164300e-02,---,1.380458,21,del
111,chr21,1,23298229,1.032700e-04,---,3.986026,21,del


In [12]:
# create gff file

import pandas as pd

# Encode the alteration type as a GFF strand symbol: 'amp' rows get '+',
# every other row (the 'del' rows) gets '-'.
result['Strand'] = result['type'].apply(lambda x: '+' if x == 'amp' else '-')

# Persist the merged table; later cells reload it from this CSV.
result.to_csv("./final_gene.csv")

# Assemble the nine GFF columns in order. Scalar values are broadcast to
# every row; the score column carries -log10(q).
gff_df = pd.DataFrame({
    "seqname": "ovarian",
    "source": "python",
    "feature": result["type"],
    "start": result["Start"],
    "end": result["End"],
    "score": result["negLogQ"],
    "strand": result['Strand'],
    "frame": ".",
    "attribute": "ID=" + result["Chr"] + ";comments=" + result["Chr"] + "_" + result["Description"]
})

# Serialise as headerless, tab-separated text.
gff_content = gff_df.to_csv(sep='\t', index=False, header=False)
gff_file_path = './gene_GFF.txt'
# newline='' disables text-mode newline translation. to_csv() has already
# terminated each row with the platform line separator, so translating again
# would produce '\r\r\n' line endings on Windows.
with open(gff_file_path, 'w', newline='') as file:
    file.write(gff_content)

gff_file_path

'./gene_GFF.txt'

# Still need to save a txt file for the comparison circos plot

In [9]:
import pandas as pd

# Reload the merged table. The first CSV column is the row index that
# to_csv() wrote, so read it back as the index rather than as data.
df = pd.read_csv("final_gene.csv", index_col=0)

# Drop the columns this track does not use, by name so that a change in
# column order cannot silently remove the wrong ones.
df = df.drop(columns=["Description", "Chromosome_num", "Strand"])

# Length of each peak.
df['dist'] = df['End'] - df['Start']

# Per-chromosome statistics, broadcast back onto every row of that chromosome:
# the longest peak, the smallest start and the largest end.
df['max_distance'] = df.groupby('Chr')['dist'].transform('max')
df['min_start'] = df.groupby('Chr')['Start'].transform('min')
df['max_end'] = df.groupby('Chr')['End'].transform('max')

# Rescale coordinates within each chromosome: the span [min_start, max_end]
# is mapped linearly onto [0, max_distance].
df['n_start'] = ((df['Start'] - df['min_start']) * df['max_distance']) / (df['max_end'] - df['min_start'])
df['n_end'] = ((df['End'] - df['min_start']) * df['max_distance']) / (df['max_end'] - df['min_start'])


# Nine GFF columns, one sequence per chromosome. Rescaled coordinates are
# truncated to integers and the start is clamped to at least 1.
com_df = pd.DataFrame({
    "seqname": df["Chr"],
    "source": "ovarian",
    "feature": df["type"],
    "start": df["n_start"].astype(int).clip(lower=1),
    "end": df["n_end"].astype(int),
    "score": df["negLogQ"],
    "strand": "+",
    "frame": ".",
    "attribute": "ID=" + df["Chr"] + ";comments=" + df["Chr"] 
})

# Serialise as headerless, tab-separated text.
gff_content = com_df.to_csv(sep='\t', index=False, header=False)
gff_file_path = './compare_pre_GFF.txt'
# newline='' disables text-mode newline translation. to_csv() has already
# terminated each row with the platform line separator, so translating again
# would produce '\r\r\n' line endings on Windows.
with open(gff_file_path, 'w', newline='') as file:
    file.write(gff_content)

gff_file_path


'./compare_pre_GFF.txt'

In [7]:
import pandas as pd

# Reload the merged table and discard its first column (the saved row index).
df = pd.read_csv("final_gene.csv")
df = df.drop(columns=df.columns[0])

df.head()


Unnamed: 0,Chr,Start,End,q values,Description,negLogQ,Chromosome_num,type,Strand
0,chr1,39109153,41523062,1.28e-21,1p34.2(MYCL1),20.89279,1,amp,+
1,chr1,148646436,149486905,5.19e-18,1q21.2(MCL1),17.284833,1,amp,+
2,chr1,232526850,233617929,8.67e-07,1q42.3(IRF2BP2),6.061981,1,amp,+
3,chr1,154088696,154329131,3.88e-09,---,8.411168,1,amp,+
4,chr1,36338557,36480225,2.12e-11,1p34.3,10.673664,1,amp,+


## Now create a txt file that stores all the genes with descriptions

When making the amp/del comparison graph, we need to paste our txt data into a self-made GFF file and add the region information, following comparasion.ipynb.

In [9]:
import pandas as pd

# Reload the merged table. The first CSV column is the row index that
# to_csv() wrote, so read it back as the index rather than as data.
# All remaining columns are kept because Description is needed below.
df = pd.read_csv("final_gene.csv", index_col=0)

# Length of each peak.
df['dist'] = df['End'] - df['Start']

# Per-chromosome statistics, broadcast back onto every row of that chromosome:
# the longest peak, the smallest start and the largest end.
df['max_distance'] = df.groupby('Chr')['dist'].transform('max')
df['min_start'] = df.groupby('Chr')['Start'].transform('min')
df['max_end'] = df.groupby('Chr')['End'].transform('max')

# Rescale coordinates within each chromosome: the span [min_start, max_end]
# is mapped linearly onto [0, max_distance].
df['n_start'] = ((df['Start'] - df['min_start']) * df['max_distance']) / (df['max_end'] - df['min_start'])
df['n_end'] = ((df['End'] - df['min_start']) * df['max_distance']) / (df['max_end'] - df['min_start'])


# Nine GFF columns, one sequence per chromosome. Rescaled coordinates are
# truncated to integers and the start is clamped to at least 1. The attribute
# column carries the chromosome name joined to the peak description.
com_df = pd.DataFrame({
    "seqname": df["Chr"],
    "source": "ovarian",
    "feature": df["type"],
    "start": df["n_start"].astype(int).clip(lower=1),
    "end": df["n_end"].astype(int),
    "score": df["negLogQ"],
    "strand": "+",
    "frame": ".",
    "attribute": "ID=" + df["Chr"] + ";comments=" + df["Chr"] + "_" + df["Description"]
})

# Serialise as headerless, tab-separated text.
gff_content = com_df.to_csv(sep='\t', index=False, header=False)
gff_file_path = './compare_with_description_GFF.txt'
# newline='' disables text-mode newline translation. to_csv() has already
# terminated each row with the platform line separator, so translating again
# would produce '\r\r\n' line endings on Windows.
with open(gff_file_path, 'w', newline='') as file:
    file.write(gff_content)

gff_file_path


'./compare_with_description_GFF.txt'

# Remember to manually copy the txt file data into a self-made .gff file, and add the following region line:

ovarian	python	region	1	60942	.	+	.	ID=NC_000902.1:1..60942;Dbxref=taxon:97081;gbkey=Src;genome=proviral;mol_type=genomic DNA;nat-host=Escherichia coli O157:H7;synonym=Bacteriophage VT2-Sa

## We still need the coordinate range across all chromosomes, so find the overall min and max

In [11]:
# Overall coordinate span across every row of the merged table:
# the smallest Start and the largest End.
print(result["Start"].min(), result["End"].max())

1 247249719


In [14]:
# Largest peak start coordinate among the chr1 rows of the merged table.
is_chr1 = result["Chr"] == "chr1"
chr1 = result.loc[is_chr1]
chr1["Start"].max()

241440102