### Find Oncogenes
Uses the Cancer Gene Census (CGC) database to identify known oncogenes in the dataset.  
Only oncogenes classified as Tier 1 (meaning that there is substantial evidence of their role in driving cancer) are considered in this analysis.  
More information on the database:  
https://cancer.sanger.ac.uk/census


The paper linked below also identified cancer genes that are differentially expressed during axolotl regeneration; those genes are added to the list manually.  
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3591270/


In [None]:
#Import necessary packages
import os
import pandas as pd
import numpy as np

In [None]:
#Moves into the sibling Data directory unless the working directory is
#already Data or a folder beneath it
cwd = os.getcwd()
#Compares whole path components: a plain substring test would also match
#unrelated folder names such as 'Database' or 'MyData' and skip the chdir
if 'Data' not in os.path.normpath(cwd).split(os.sep):
    os.chdir('../Data')
    cwd = os.getcwd()
print(cwd)

In [None]:
#Loads the Cancer Gene Census table and keeps the Tier 1 oncogene rows
cgc = pd.read_csv('CancerGeneCensus.csv')
#Blank out missing values so the string search below never sees NaN
cgc = cgc.fillna('')
onc_idx = cgc['Role in Cancer'].str.contains('oncogene')
#Applies both conditions (role mentions 'oncogene', Tier equals 1) in one selection
cgc_df = cgc.loc[onc_idx & (cgc['Tier'] == 1)]
oncogenes = cgc_df['Gene Symbol'].tolist()

In [None]:
#Genes reported in the 2013 axolotl paper; each one missing from the
#oncogene list is appended and announced
manual_add = ['ATF3','EGR1','ETS2','FOS','FOXO1','JUN','JUND','KLF4','KLF6','MYC','ZFP36']

for gene in manual_add:
    if gene in oncogenes:
        #Already present, nothing to add
        continue
    oncogenes.append(gene)
    print(gene + ' added!')

In [None]:
#Loads the gene expression table and keeps its column headers as the gene list
df = pd.read_csv('aaq0681_TableS5.csv')
tab_genes = df.columns.tolist()

In [None]:
#Collects table columns that match the known oncogenes
axo_onc = [] #Matching column names, duplicates removed below

for onc in oncogenes:
    #Only oncogenes that appear verbatim as a column header are considered
    if onc not in tab_genes:
        continue
    #NOTE(review): this keeps every header that CONTAINS the symbol, so an
    #exact hit on one symbol also pulls in longer names that embed it -
    #confirm that substring matching is intended here
    axo_onc.extend(gene for gene in tab_genes if onc in gene)

#Sorted array of the distinct matches
axo_onc = np.unique(axo_onc)

In [None]:
#Writes the matched oncogenes to a one-column CSV without the row index
axo_onc_df = pd.DataFrame({'Oncogene': axo_onc})
file_name = 'Axolotl_oncogenes.csv'
axo_onc_df.to_csv(file_name, index=False, encoding='utf-8')


In [None]:
#Displays the oncogene table as the cell output
axo_onc_df

In [None]:
#Shows the 'Oncogene' column, the only column of this table
#(an empty subscript is a SyntaxError and stops the cell from running)
axo_onc_df['Oncogene']

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

def plot_cluster_and_labels(df, lab, title, xlab, ylab, save_name=None, fig_size=(8,6)):
    """
    Creates a scatter plot of the data passed in, and colors each
    data point based on its label. The figure is saved if a
    file name is provided.

    Arguments:
        df - dataframe with column headers corresponding to lab,
        xlab and ylab
        lab - string, col header for datapoint labels
        title - string, plot title
        xlab, ylab - string, col headers for datapoint coordinates
        (also used as the axis labels)
        save_name - string, filename to save the image under;
        nothing is saved when it is None
        fig_size - tuple, plot size

    Outputs:
        Scatter plot of data and labels and, if requested, a saved file.
        Returns None.
    """
    #Gets unique labels and maps each one to its own color
    labels = list(pd.unique(df[lab]))
    lut = dict(zip(labels, sns.hls_palette(len(labels), l=0.5, s=0.8)))

    #Plots one scatter series per label so the legend gets one entry each
    fig, ax = plt.subplots(1, figsize=fig_size)
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    ax.set_title(title)
    for label in labels:
        idx = df[lab] == label
        #color= applies the palette entry directly; cmap= only takes effect
        #when numeric values are supplied through c=, so passing the color
        #there left every series on the default color cycle
        ax.scatter(df.loc[idx, xlab], df.loc[idx, ylab],
                   color=lut[label], alpha=0.6, s=3)
    #Legend entries are assigned in the same order the series were drawn
    ax.legend(labels)

    #Save plot
    if save_name is not None:
        #Saves this figure explicitly rather than whichever one is current
        fig.savefig(save_name)

    return

In [None]:
#Reads the TableS9 CSV into df (rebinding the name) and the TableS5 CSV into healthy
#NOTE(review): 'aa10681' differs from the 'aaq0681' prefix of the TableS5 file -
#possibly a typo; confirm the file name on disk
df = pd.read_csv('aa10681_TableS9.csv')
healthy = pd.read_csv('aaq0681_TableS5.csv')

In [None]:
#Previews the first rows of the table currently bound to df
df.head()

In [None]:
#Plots the healthy table in tSNE_1/tSNE_2 space, colored by its 'ident' column
plot_cluster_and_labels(
    healthy,
    lab='ident',
    title='title',
    xlab='tSNE_1',
    ylab='tSNE_2',
    save_name=None,
    fig_size=(8,6),
)


In [None]:
#Loads the dorothea table; df is rebound, so the previously loaded table is no longer reachable through this name
df = pd.read_csv('dorothea.csv')

In [None]:
#Column names of the healthy table that also occur in the dorothea 'target' column
healthy_cols = set(healthy.columns)
dorothea_targets = set(df['target'])
a = list(healthy_cols & dorothea_targets)

In [None]:
#Number of entries in the list a
len(a)

In [None]:
#Total number of columns in the healthy table
len(healthy.columns)