Title: Gsp1 EMAP clustering analysis
Date: 2019 April 02
Author: Chris Mathy
Email: {chris.mathy@ucsf.edu, cjmathy@gmail.com}
Description: Notebook implementing the clustering functionality of
    Cluster 3.0 in Python, using the Biopython module
    Bio.Cluster

http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm#pycluster
http://bonsai.hgc.jp/~mdehoon/software/cluster/cluster.pdf

In [5]:
# Import the submodule explicitly: a bare `import Bio` does not load
# Bio.Cluster, so the attribute lookup below would only work if another
# cell had already imported it.
import Bio.Cluster
Bio.Cluster.__version__

'1.55'

In [287]:
from Bio import Cluster
import numpy as np
import pandas as pd
from collections import defaultdict

In [40]:
# we open the EMAP file and read it into a record object; the `with` block
# closes the file handle as soon as the record has been built
with open('gsp1_pEMAP_avg_merged_gene_names.txt') as handle:
    record = Cluster.read(handle)

# our record contains:
# -- the emap scores in record.data
# -- the mutant names in record.geneid
# -- the library gene names in record.expid
# -- a "mask" matrix showing 0's for missing data in record.mask



In [253]:
# Hierarchical clustering of both axes of the EMAP, using centered
# correlation as the distance (dist='c') and average linkage (method='a').

# cluster rows (mutants); transpose=False clusters the rows of record.data
mutant_clustered = record.treecluster(transpose=False, method='a', dist='c')

# cluster columns (library genes); transpose=True clusters the columns
library_clustered = record.treecluster(transpose=True, method='a', dist='c')

# scale each tree in place, so that cluster distances are between zero and one,
# for ease of viewing in Java TreeView
mutant_clustered.scale()
library_clustered.scale()

In [254]:
# save the record together with the row (mutant) tree and the column
# (library gene) tree, using 'avg_clustering' as the job name
# NOTE(review): per the Bio.Cluster docs this should write
# avg_clustering.cdt/.gtr/.atr for Java TreeView -- confirm
record.save('avg_clustering', mutant_clustered, library_clustered)

In [269]:
def cut_and_get_clusters(tree, nclusters, list_of_ids):
    """Cut a hierarchical-clustering tree and summarise the resulting clusters.

    Parameters
    ----------
    tree : object with a ``cut(nclusters)`` method
        ``tree.cut(nclusters)`` must return one cluster label per clustered
        item.
    nclusters : int
        Number of clusters requested from ``tree.cut``.
    list_of_ids : sequence
        Item names, indexed by position; entry ``i`` names the item whose
        label is the ``i``-th value returned by ``tree.cut``.

    Returns
    -------
    tuple of (dict, dict)
        ``(members_by_label, size_histogram)`` where ``members_by_label`` maps
        each cluster label to the list of member names (in input order) and
        ``size_histogram`` maps a cluster size to the number of clusters that
        have exactly that many members.
    """
    labels = tree.cut(nclusters)

    # size of every cluster, then a histogram over those sizes
    _, sizes = np.unique(labels, return_counts=True)
    distinct_sizes, tally = np.unique(sizes, return_counts=True)
    size_histogram = {size: count for size, count in zip(distinct_sizes, tally)}

    # group member names under the label they were assigned
    members_by_label = {}
    for position, label in enumerate(labels):
        members_by_label.setdefault(label, []).append(list_of_ids[position])

    return members_by_label, size_histogram

In [621]:
# Cut the column (library gene) tree into 1200 clusters and label the members
# with the library gene names from record.expid. The tree produced by
# record.treecluster(transpose=True, ...) is named `library_clustered`;
# `library_clusters` is not defined anywhere in this notebook.
cluster_dict, cluster_count_by_size = cut_and_get_clusters(library_clustered, 1200, record.expid)

In [622]:
# display only the clusters that have more than three members
{cluster_id: members for cluster_id, members in cluster_dict.items() if len(members) > 3}

{428: ['MDM10',
  'YUR1',
  'CBF1',
  'GAL80',
  'RPS1B',
  'UFO1',
  'SGS1',
  'BUL1',
  'NGL2',
  'ALG6'],
 888: ['SWC3', 'SWR1', 'VPS72', 'ARP6', 'VPS71', 'HTZ1'],
 322: ['DEP1', 'RXT2', 'SAP30', 'PHO23'],
 1083: ['LTE1', 'PBY1', 'MUS81', 'CLA4'],
 378: ['GEM1', 'MDM34', 'PFK2', 'MDM12'],
 834: ['BUD14', 'KIN3', 'SUB1', 'MOT3', 'SCS7', 'MSG5'],
 1085: ['IML3',
  'CHL4',
  'MCM21',
  'NKP1',
  'MAD1',
  'MAD3',
  'MAD2',
  'MCM22',
  'IRC15',
  'CTF19',
  'RMI1',
  'MCM16'],
 376: ['MUD1', 'GBP2', 'RAD18', 'YER077C', 'SPT23', 'NOT5'],
 408: ['MRPL36', 'UBC5', 'CBT1', 'AMD1'],
 846: ['PTC4', 'HUL5', 'LAG1', 'YAP5'],
 1119: ['ARL1', 'SYS1', 'SNU66', 'ARL3'],
 379: ['SSH1', 'UBP12', 'YAP1', 'FAR3'],
 38: ['SGF29', 'ELP2', 'IKI1', 'ELP6', 'ELP3', 'ELP4'],
 425: ['PET18', 'DIE2', 'RPB4', 'ASH1', 'AEP2', 'HST1', 'NTO1'],
 380: ['RIM1',
  'IMG2',
  'CRD1',
  'SUM1',
  'RPO41',
  'LOS1',
  'MDM38',
  'GSH2',
  'PET123',
  'DGK1',
  'MIP1'],
 409: ['RBK1', 'FIG2', 'HST4', 'PCT1', 'HUL4', 'GOT

In [618]:
# The two files share the same column layout: one is labelled with gene names,
# the other with systematic ORF names. The first column is skipped in both.
library_gene_names = list(pd.read_csv('gsp1_pEMAP_avg_merged_gene_names.txt', sep='\t').columns)[1:]
library_gene_ORFs = list(pd.read_csv('gsp1_pEMAP_avg_merged.txt', sep='\t').columns)[1:]
gene_names_to_ORF = pd.DataFrame({'name': library_gene_names,'ORF': library_gene_ORFs})

# remove ' - DAmP' and turn 'ORF - ORF' into 'ORF' by keeping only the text
# before the first ' - '. `.str[0]` selects that first piece directly instead
# of tuple-unpacking the `.str` accessor, which depends on iteration that
# newer pandas no longer supports and fails when no label contains ' - '.
for column in ('name', 'ORF'):
    gene_names_to_ORF[column] = gene_names_to_ORF[column].str.split(' - ', n=1).str[0]

In [619]:
# get annotations from SGD, code generated from the following URL:
# "https://yeastmine.yeastgenome.org/yeastmine/results.do?trail=%257Cquery"
# NOTE: this cell needs network access to the YeastMine web service.

from intermine.webservice import Service
service = Service("https://yeastmine.yeastgenome.org:443/yeastmine/service")

# query Gene records, requesting the identifier, symbol, name, length,
# alias and description fields
query = service.new_query("Gene")
query.add_view("secondaryIdentifier", "symbol", "name", "length", "sgdAlias", "description")

# constraints are tagged with single-letter codes that set_logic combines below
query.add_constraint("status", "IS NULL", code = "D")
query.add_constraint("status", "=", "Active", code = "C")
query.add_constraint("dataSets.name", "=", "SGD data set", code = "F")
query.add_constraint("organism.name", "=", "Saccharomyces cerevisiae", code = "E")
# keep genes whose status is Active or unset, restricted to the
# S. cerevisiae SGD data set
query.set_logic("(C or D) and E and F")

# load the result rows into a DataFrame
SGD_annotations = pd.DataFrame(query.results('dict'))
cols = list(SGD_annotations.columns)
# keep only the part of each column label after the first '.'
# (presumably labels look like 'Gene.symbol' -- verify against the service output)
SGD_annotations.columns = [col.split('.')[1] for col in cols]
# discard fields that are not used downstream
SGD_annotations = SGD_annotations.drop(['cytoLocation','featAttribute','geneSummary',
                       'id','length','primaryIdentifier','qualifier',
                       'score','scoreType','status'], axis=1)
# select the four fields needed for merging and rename them so that the
# 'name' and 'ORF' columns match those of gene_names_to_ORF
SGD_to_merge = SGD_annotations[['symbol','secondaryIdentifier','description','name']]
SGD_to_merge.columns = ['name', 'ORF', 'description', 'name_meaning']

In [620]:
# attach the SGD annotations to every library gene (genes without a match
# keep empty annotation fields)
annotated = gene_names_to_ORF.merge(SGD_to_merge, how='left', on=['name', 'ORF'])

# add in hand-curated descriptions from SGD that weren't in the yeastmine download
# note, many unknown gene descriptions were left out
curated = pd.read_csv('missing_descriptions.txt', sep='\t')
df = annotated.merge(curated, how='outer')

# keep only the rows that carry a description (the others add no information)
SGD_descriptions = df.loc[df['description'].notna()]