In [79]:
# Setup - import library and load data from local file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw survey responses from the local CSV export.
fm = pd.read_csv("data/file_mapping_responses.csv")
# Keep only the columns at positions 1-10. Per the original author's note these
# are the string answer columns, with the notes columns left out.
fm = fm.iloc[:, 1:11]
# A preview is only rendered when it is the last expression of the cell, so it
# goes here (after the subset) instead of mid-cell where its result was discarded.
fm.head(4)

## Spread of cards matched to term across different terms

GO Term was matched reasonably consistently; everything else was much more varied.

In [80]:
def count_unique_answers(a_series):
    """Tally how often each option appears in a pipe-delimited answer column.

    Every entry of ``a_series`` is split on "|" and each resulting token is
    counted; missing values are skipped.

    Parameters
    ----------
    a_series : pandas.Series
        Answers, each one a "|"-separated string of options.

    Returns
    -------
    dict
        Maps each distinct token to the number of times it occurs.
    """
    # One column per position within an answer; answers with fewer options
    # are padded with missing values.
    parts = a_series.str.split("|", expand=True)

    tally = {}
    # Walk position by position (all first options, then all second options,
    # ...) so that keys are inserted in the same order as before.
    for position in parts:
        for token in parts[position]:
            if pd.isnull(token):
                continue
            tally[token] = tally.get(token, 0) + 1
    return tally

# Tally the answers of every text column in `fm`, keyed by column name.
result_counts = {}

for column_name in fm:
    column = fm[column_name]
    # `np.object` was removed from NumPy (deprecated in 1.20, gone in 1.24), so
    # test the dtype with pandas' own predicates. Checking both keeps the cell
    # working whether text columns load as `object` or as a dedicated string dtype.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        result_counts[column.name] = count_unique_answers(column)

# One column per survey column, one row per distinct answer; an answer never
# given for a column shows up as NaN.
results = pd.DataFrame.from_dict(result_counts)
results


Unnamed: 0,homo_sapiens.gff|100287102,homo_sapiens.gff|HGNC:37102,homo_sapiens.gff|DDX11L1,flybase_d_melanogaster.gaf|FBgn0043467,flybase_d_melanogaster.gaf|GO:0048149,ncbi_homo_sapiens.gene_info|9606,ncbi_homo_sapiens.gene_info|1,ncbi_homo_sapiens.gene_info|A1BG,ncbi_homo_sapiens.gene_info|MIM:138670,ncbi_homo_sapiens.gene_info|HGNC:HGNC:5
identifier,15.0,9.0,5.0,13.0,2.0,3.0,9.0,2.0,6.0,4.0
gene,14.0,6.0,7.0,5.0,,1.0,6.0,6.0,2.0,3.0
accession,2.0,3.0,1.0,4.0,1.0,,,1.0,2.0,1.0
symbol,1.0,4.0,8.0,3.0,,2.0,1.0,15.0,1.0,1.0
dataset/database,1.0,3.0,1.0,4.0,1.0,,,,5.0,1.0
h. sapiens,1.0,2.0,,,,3.0,,1.0,,
organism,1.0,1.0,1.0,1.0,,8.0,1.0,1.0,1.0,1.0
name,1.0,2.0,15.0,1.0,,1.0,1.0,7.0,1.0,2.0
go: protein binding,,1.0,,,2.0,,,,,
location,,1.0,,,,,,,,


In [87]:
# Build the transposed table from `result_counts` instead of transposing
# `results` in place: an in-place transpose flips the orientation back and forth
# every time the cell is re-run, whereas this always gives one row per survey
# column and one column per distinct answer.
results = pd.DataFrame.from_dict(result_counts).transpose()
results.head(10)

Unnamed: 0,homo_sapiens.gff|100287102,homo_sapiens.gff|HGNC:37102,homo_sapiens.gff|DDX11L1,flybase_d_melanogaster.gaf|FBgn0043467,flybase_d_melanogaster.gaf|GO:0048149,ncbi_homo_sapiens.gene_info|9606,ncbi_homo_sapiens.gene_info|1,ncbi_homo_sapiens.gene_info|A1BG,ncbi_homo_sapiens.gene_info|MIM:138670,ncbi_homo_sapiens.gene_info|HGNC:HGNC:5
identifier,15.0,9.0,5.0,13.0,2.0,3.0,9.0,2.0,6.0,4.0
gene,14.0,6.0,7.0,5.0,,1.0,6.0,6.0,2.0,3.0
accession,2.0,3.0,1.0,4.0,1.0,,,1.0,2.0,1.0
symbol,1.0,4.0,8.0,3.0,,2.0,1.0,15.0,1.0,1.0
dataset/database,1.0,3.0,1.0,4.0,1.0,,,,5.0,1.0
h. sapiens,1.0,2.0,,,,3.0,,1.0,,
organism,1.0,1.0,1.0,1.0,,8.0,1.0,1.0,1.0,1.0
name,1.0,2.0,15.0,1.0,,1.0,1.0,7.0,1.0,2.0
go: protein binding,,1.0,,,2.0,,,,,
location,,1.0,,,,,,,,


In [85]:
# "GO Term" is matched very consistently: nearly all of its uses land on a single column.
results.loc[:, "GO Term"]

homo_sapiens.gff|100287102                  NaN
homo_sapiens.gff|HGNC:37102                 NaN
homo_sapiens.gff|DDX11L1                    NaN
flybase_d_melanogaster.gaf|FBgn0043467      1.0
flybase_d_melanogaster.gaf|GO:0048149      18.0
ncbi_homo_sapiens.gene_info|9606            NaN
ncbi_homo_sapiens.gene_info|1               NaN
ncbi_homo_sapiens.gene_info|A1BG            NaN
ncbi_homo_sapiens.gene_info|MIM:138670      NaN
ncbi_homo_sapiens.gene_info|HGNC:HGNC:5     NaN
Name: GO Term, dtype: float64

In [86]:
# "symbol" is the next most consistent answer, yet it is already messy:
# it gets used for many different columns.
results.loc[:, "symbol"]

homo_sapiens.gff|100287102                  1.0
homo_sapiens.gff|HGNC:37102                 4.0
homo_sapiens.gff|DDX11L1                    8.0
flybase_d_melanogaster.gaf|FBgn0043467      3.0
flybase_d_melanogaster.gaf|GO:0048149       NaN
ncbi_homo_sapiens.gene_info|9606            2.0
ncbi_homo_sapiens.gene_info|1               1.0
ncbi_homo_sapiens.gene_info|A1BG           15.0
ncbi_homo_sapiens.gene_info|MIM:138670      1.0
ncbi_homo_sapiens.gene_info|HGNC:HGNC:5     1.0
Name: symbol, dtype: float64

## Variation within a given term
When a term is matched to cards, how many different options are given in total, and per person?

In [142]:
def count_variation(a_series):
    """Summarise how varied the answers in one multi-choice column are.

    Each answer is a "|"-separated string of terms. The answers are split,
    sorted, and reduced to the distinct term lists and the distinct terms.

    Parameters
    ----------
    a_series : pandas.Series
        Pipe-delimited answer strings. Entries that do not split into a list
        (missing answers) are ignored.

    Returns
    -------
    pandas.DataFrame
        Two independent columns that merely share a positional index:

        * ``term_sets_used`` - every distinct list of terms, in sorted order.
          Term order inside an answer is significant, so the same terms given
          in a different order count as a separate entry.
        * ``unique_terms`` - every distinct term, in the order it is first
          met while walking the sorted answers.

        The two columns usually differ in length; the shorter one is padded
        with NaN.
    """
    split_answers = a_series.str.split("|").sort_values()

    term_combinations = []
    unique_terms = []
    for terms_list in split_answers:
        # Missing answers come through as NaN rather than a list; skip them.
        if not isinstance(terms_list, list):
            continue
        # Record unique combinations of terms.
        if terms_list not in term_combinations:
            term_combinations.append(terms_list)
        # Record unique terms regardless of the combination they appeared in.
        for individual_term in terms_list:
            if individual_term not in unique_terms:
                unique_terms.append(individual_term)

    # Wrapping each list in a Series lets pandas align on the index and pad the
    # shorter column with NaN. Passing the bare lists raises ValueError whenever
    # their lengths differ, which is the normal case.
    return pd.DataFrame({
        "term_sets_used": pd.Series(term_combinations, dtype=object),
        "unique_terms": pd.Series(unique_terms, dtype=object),
    })

# Show the distinct answer combinations and individual terms given for one column.
print(count_variation(fm["homo_sapiens.gff|100287102"]))

                                    term_sets_used      unique_terms
0                          [accession, identifier]         accession
1                                           [gene]        identifier
2  [gene, dataset/database, identifier, accession]              gene
3                               [gene, h. sapiens]  dataset/database
4                               [gene, identifier]        h. sapiens
5                                     [identifier]              name
6                         [identifier, gene, name]            symbol
7                   [identifier, symbol, organism]          organism
