-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_scquery_mca.py
179 lines (153 loc) · 6.57 KB
/
run_scquery_mca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# TODO: use scquery_selenium to get cell type names
# 1. get top genes
import pickle
import numpy as np
import pandas as pd
top_genes = pd.read_csv('mca_coarse_top_genes_ratio.csv', index_col=0)
n_genes = 50
# for all cell types...
"""
import scquery_selenium
cell_types = top_genes.columns
print('number of cell types:', len(cell_types))
cell_type_selenium_results = {}
for cell_type in cell_types:
gene_set = list(top_genes[cell_type][:n_genes])
cell_type_selenium_results[cell_type] = scquery_selenium.scquery_gene_set(gene_set)
print(cell_type, cell_type_selenium_results[cell_type])
for cell_type in cell_types:
gene_set = list(top_genes[cell_type][:n_genes])
if cell_type not in cell_type_selenium_results:
cell_type_selenium_results[cell_type] = scquery_selenium.scquery_gene_set(gene_set)
print(cell_type, cell_type_selenium_results[cell_type])
with open('mca_coarse_scquery_results.pkl', 'wb') as f:
pickle.dump(cell_type_selenium_results, f)
"""
###########################################################################################
# TODO: calculate accuracies
with open('mca_coarse_scquery_results.pkl', 'rb') as f:
cell_type_selenium_results = pickle.load(f)
label_cell_types = {key: [v[0] for v in values] for key, values in cell_type_selenium_results.items()}
cell_types_map = {}
cell_types_alternate_map = {}
cell_types_map = {}
cell_types_alternate_map = {}
with open('cell_ontology_to_cellmesh_tabula_muris.tsv') as f:
l0 = f.readline()
for line in f.readlines():
line_data = [x.strip() for x in line.split('\t')]
name = line_data[1]
primary_cellmesh_name = line_data[2]
alternate_cellmesh_names = line_data[3:]
use_cellmesh = line_data[0]
if use_cellmesh != 'n':
cell_types_map[name] = primary_cellmesh_name
cell_types_alternate_map[name] = alternate_cellmesh_names
with open('tm_cell_onto_alternate_names.tsv') as f:
l0 = f.readline()
for line in f.readlines():
line_data = [x.strip() for x in line.split('\t')]
name = line_data[1]
alternate_cell_type_names = line_data[2:]
if name in cell_types_alternate_map:
cell_types_alternate_map[name].extend(alternate_cell_type_names)
mca_cell_names_to_cellmarker = pd.read_table('mca_cell_names_to_cellmarker.tsv')
mca_cell_names_map = {}
import cellmesh
for i, row in mca_cell_names_to_cellmarker.iterrows():
# TODO: match with cell_types_map and cell_types_alternate_map
cell_name = row['mca_cell_name']
if isinstance(row['tabula_muris'], str):
alt_name = row['tabula_muris'].strip()
mca_cell_names_map[cell_name] = cell_types_alternate_map[alt_name]
mca_cell_names_map[cell_name].append(cell_types_map[alt_name])
if isinstance(row['cellmarker_cellonto'], str):
alt_name = row['cellmarker_cellonto'].strip()
try:
mca_cell_names_map[cell_name].append(alt_name)
except:
mca_cell_names_map[cell_name] = [alt_name]
mca_cell_names_map[cell_name].extend([x[1] for x in cellmesh.cellonto_to_cellmesh(alt_name)])
if isinstance(row['cellmesh'], str):
alt_name = row['cellmesh'].strip()
try:
mca_cell_names_map[cell_name].append(alt_name)
except:
mca_cell_names_map[cell_name] = [alt_name]
mca_cell_names_map[cell_name].extend([x[1] for x in cellmesh.cellmesh_to_cellonto(alt_name)])
if isinstance(row['scquery'], str):
alt_name = row['scquery'].strip()
try:
mca_cell_names_map[cell_name].append(alt_name)
except:
mca_cell_names_map[cell_name] = [alt_name]
# append names to alternate_map
with open('tm_to_scquery.tsv') as f:
l0 = f.readline()
for line in f.readlines():
line_data = line.split('\t')
name = line_data[0].strip()
if len(line_data) > 1 and name in mca_cell_names_to_cellmarker['tabula_muris']:
alternate_name = line_data[1].strip()
cell_name = mca_cell_names_to_cellmarker[mca_cell_names_to_cellmarker['tabula_muris']==name].mca_cell_name
mca_cell_names_map[cell_name].append(alternate_name)
# calculate accuracy
label_accuracies = {}
label_extended_accuracies = {}
for key, value in label_cell_types.items():
name = key
accuracies = []
extended_accuracies = []
if name not in mca_cell_names_map:
continue
for v in value:
if v in mca_cell_names_map[name] or v.lower() in name.lower() or name.lower() in v.lower():
accuracies.append(1)
extended_accuracies.append(1)
else:
accuracies.append(0)
has_index = False
for x in mca_cell_names_map[name]:
if not isinstance(x, str):
continue
if v in x:
extended_accuracies.append(0.5)
has_index = True
if not has_index:
extended_accuracies.append(0)
label_accuracies[key] = accuracies
label_extended_accuracies[key] = extended_accuracies
def calculate_precision_recall_curve(accuracies, n_relevant=1, use_extended=False):
"""
https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-unranked-retrieval-sets-1.html
https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-ranked-retrieval-results-1.html
"""
precision = []
recall = []
mean_avg_precision = 0
num_correct = 0
for i, a in enumerate(accuracies):
if a == 1:
num_correct += 1
if use_extended and a > 0:
num_correct += 1
precision.append(float(min(num_correct, n_relevant))/(i+1))
recall.append(float(min(num_correct, n_relevant))/n_relevant)
prev_recall = 0
for p, r in zip(precision, recall):
mean_avg_precision += (r - prev_recall)*p
prev_recall = r
return precision, recall, mean_avg_precision
# calculate precision-recall curves for all keys
label_map = {}
for key, val in label_extended_accuracies.items():
label_map[key] = calculate_precision_recall_curve(val, use_extended=True)[2]
# take mean over cell types
# average MAP for all methods, over all datasets
map_mean = np.mean([v for v in label_map.values()])
# convert map_method_means to a pandas dict
import pandas as pd
map_cell_types_list = [(key, 50, 'ratio', 'scquery', v) for key, v in label_map.items()]
map_cell_types_list.sort()
map_cell_types = pd.DataFrame(map_cell_types_list, columns=['cell_type', 'n_genes', 'gene_method', 'query_method', 'mean_average_precision'])
map_cell_types.to_csv('MAP_mca_scquery_cell_types.csv', index=None)