In [5]:
import os
import sys
sys.path.insert(0, 'util/')

import readwrite as rw
import pandas as pd

In [17]:
# Input table lives at <root>/phenotypes_id_aligner.csv; the path is relative
# to the notebook's working directory.
root = "data/mp"
in_data_path = os.path.join(root, "phenotypes_id_aligner.csv")
# Read the whole CSV into a DataFrame using pandas' default parsing options.
raw_mp = pd.read_csv(in_data_path)

In [40]:
# Keep only the four columns used downstream and expose "PhenoID.1" under the
# shorter name "PhenoID"; the other three column names are left untouched.
useful_mp_df = raw_mp[
    ["PhenoID.1", "Phenotype", "Shown_pheno", "Category"]
].rename(columns={"PhenoID.1": "PhenoID"})

# One plain Python list per row, ordered [PhenoID, Phenotype, Shown_pheno, Category].
lines = useful_mp_df.values.tolist()

In [41]:
from collections import namedtuple

class PhenotypeInfo(object):
    """Attribute-style view of one phenotype row.

    The constructor copies the first four positions of ``line`` onto the
    instance as ``id``, ``phenotype``, ``shown_pheno`` and ``category``
    (in that order). Anything beyond position 3 is ignored.
    """

    def __init__(self, line):
        """Populate the instance from an indexable ``line``.

        Raises whatever ``line[i]`` raises (``IndexError`` for a list or
        tuple) when ``line`` has fewer than four positions.
        """
        field_names = ("id", "phenotype", "shown_pheno", "category")
        for position, field_name in enumerate(field_names):
            setattr(self, field_name, line[position])

def build_classes(lines):
    """Wrap every row in ``lines`` in a ``PhenotypeInfo``.

    Returns a new list with one ``PhenotypeInfo`` per input row, in input order.
    """
    return list(map(PhenotypeInfo, lines))

# Turn the raw row lists into PhenotypeInfo objects for attribute-style access.
lines_class = build_classes(lines)

In [62]:
import re

def clean_blank(pheno):
    """Collapse runs of spaces in ``pheno`` and trim spaces at both ends.

    Only the space character is treated as a separator; tabs and newlines
    are left exactly where they are. Returns the re-joined string.
    """
    words = (chunk for chunk in pheno.split(" ") if chunk)
    return " ".join(words)


# Matches one "[...]" or "(...)" span, lazily, i.e. up to the first closing
# bracket of the same kind. Written as a raw string so the backslashes reach
# the regex engine directly instead of relying on Python passing unknown
# string escapes such as "\[" through unchanged (deprecated behaviour).
_BRACKETED_SPAN = re.compile(r"(?:\[.*?\]|\(.*?\))")


def clean_pheno(line):
    """Remove bracketed spans from ``line.phenotype`` and normalise its spacing.

    Every ``[...]`` and ``(...)`` span is deleted, then ``clean_blank`` is
    applied to the remainder.

    Args:
        line: object with a string ``phenotype`` attribute.

    Returns:
        The same ``line`` object. It is modified in place (its ``phenotype``
        attribute is overwritten), not copied.
    """
    without_brackets = _BRACKETED_SPAN.sub("", line.phenotype)
    line.phenotype = clean_blank(without_brackets)
    return line

def clean_phenos(lines_class):
    """Run ``clean_pheno`` over every record and return the results as a list.

    The returned list preserves the input order.
    """
    cleaned = []
    for record in lines_class:
        cleaned.append(clean_pheno(record))
    return cleaned

def get_info(lines_class, v):
    """Return the distinct values of attribute ``v`` across ``lines_class``.

    Args:
        lines_class: iterable of objects that all expose attribute ``v``.
        v: attribute name, as a string.

    Returns:
        A list holding each distinct value once. The values pass through a
        ``set``, so the order of the list is not guaranteed.
    """
    distinct_values = set()
    for record in lines_class:
        distinct_values.add(getattr(record, v))
    return list(distinct_values)


def summary_info(lines_class):
    """Return ``(categories, shown_phenos)`` for the given records.

    Each element is the result of ``get_info`` for the ``category`` and
    ``shown_pheno`` attributes respectively, looked up in that order.
    """
    return tuple(
        get_info(lines_class, field_name)
        for field_name in ("category", "shown_pheno")
    )

# Collect the distinct category and shown-phenotype values, then display the
# categories as this cell's output.
categories, shown_phenos = summary_info(lines_class)
categories

['Drug response',
 'Anxiety',
 'Musculoskeletal system',
 'Blood chemistry',
 'Microbiome',
 'Immune',
 'Other',
 'Activity',
 'Blood metabolites',
 'Morphology',
 'Nervous system',
 'Metabolism']

In [66]:
import groupby as gp

def group_info(lines_class):
    """Group record ids by category and by shown phenotype.

    Builds ``(category, id)`` and ``(shown_pheno, id)`` pairs for every
    record, then hands each pair list to ``gp.group_by`` with ``index=0``.

    NOTE(review): the shape of what ``gp.group_by`` returns is not visible
    in this notebook; confirm against the ``groupby`` helper module.

    Returns:
        ``(categories_grouped_ids, shownpheno_grouped_ids)``, i.e. the two
        ``gp.group_by`` results, category grouping first.
    """
    # Both pair lists are built before any grouping call is made.
    pair_lists = []
    for key_field in ("category", "shown_pheno"):
        pair_lists.append(
            [(getattr(record, key_field), record.id) for record in lines_class]
        )

    by_category, by_shown_pheno = [
        gp.group_by(pairs, index=0) for pairs in pair_lists
    ]
    return by_category, by_shown_pheno

# Group the phenotype ids twice: once keyed by category, once by shown phenotype.
categories_grouped_ids, shownpheno_grouped_ids = group_info(lines_class)

In [67]:
# Print every entry of the category grouping: the first item of each pair,
# then the second. The parenthesised single-argument form of print produces
# the same output under Python 2 and Python 3, whereas the bare print
# statement is a syntax error on Python 3.
for k, v in categories_grouped_ids:
    print(k)
    print(v)

<itertools.groupby at 0x7f2873baba48>