In [None]:
#imports 
import multiprocessing
import re 
from collections import Counter

import feather
import matplotlib as mpt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.plotly as py
import scipy as sc
import seaborn as sns
import pickle
from jupyterthemes import jtplot
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

jtplot.style(
    theme='grade3',
    context='paper',
    fscale=1.4,
    spines=True,
    gridlines='--',
    ticks=True,
    grid=False,
    figsize=(6, 4.5))

#mpt.rcParams

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

ROOT = "/Users/payalchandak/Not On Dropbox/Data/Sex Demographics/" 

In [None]:
# Load the AEOLUS feather file from under ROOT into the working frame `df`.
aeolus_feather_path = ROOT + "AEOLUS/aeolus_Nick.tsv.feather"
df = pd.read_feather(aeolus_feather_path)

In [None]:
# Cleaning
#
# Takes the frame loaded above and, in order:
#   1. drops the columns that are not used further on,
#   2. title-cases the drug and outcome concept names,
#   3. keeps only rows whose gender_code is "M" or "F",
#   4. writes the cleaned frame to a feather file under ROOT.
# After this cell `df` has a fresh 0..n-1 index and matches the file on disk.

#Drop irrelevant columns
df = df.drop(columns=[
    'drug_outcome_name',
    'aed',
    'age_cat',
    'tot_drug_outcomes',
    'drug_marginal',
    'outcome_marginal',
    'a',
    'b',
    'c',
    'd',
    'N',
    'RRR',
    'ROR',
    'PRR',
    'IC',
    'IC_posterior_expectation',
])

#Capitalize to title case
# The vectorised .str accessor leaves missing names missing. The previous
# str(x).title() turned NaN/None into the literal strings "Nan"/"None", which
# then looked like real drug / outcome names.
for name_col in ('drug_concept_name', 'outcome_concept_name'):
    df[name_col] = df[name_col].str.title()

#clean aeolus gender ~ only keep M and F
# Filtering leaves gaps in the index. Renumber the rows here, on `df` itself
# (not only on the copy written to disk), so that the in-memory frame is
# identical to the saved one and rows can be addressed by labels 0..n-1.
df = df[df['gender_code'].isin(['M', 'F'])].reset_index(drop=True)

#write to file
# NOTE(review): the file name says "withRank", but no outcome_rank column
# exists yet at this point in the notebook - confirm whether a later step
# is expected to rewrite this file.
df.to_feather(ROOT+"AEOLUS/clean_withRank.tsv.feather")

In [None]:
# Snomed-Meddra-Outcome Severity Mapping and adding to 'df'
#
# Reads the SNOMED->MedDRA mapping and the ranked-ADR sheet, then attaches an
# 'outcome_rank' column to `df` (and builds `outcomes`, one row per distinct
# SNOMED outcome id with its rank).
#
# The lookup is vectorised and never addresses rows by index label. The
# earlier row-by-row version used df.at[i, ...] for i in range(n), which is
# label-based: once rows have been filtered out or the frame has been sorted,
# labels 0..n-1 no longer correspond to "every row, in order", so it could
# raise KeyError or leave rows unranked.

outMap = pd.read_csv(ROOT + "Outcome Severity Mapping/MEDDRA SNOMED.txt", delimiter="\t")
outRank = pd.read_excel(ROOT + 'Outcome Severity Mapping/Ranked ADRs.xlsx', sheet_name='Ranked ADRs')


def rank_outcomes(snomed_ids, out_map, out_rank):
    """Return the severity rank for each SNOMED outcome id in `snomed_ids`.

    Parameters
    ----------
    snomed_ids : pandas.Series
        SNOMED outcome concept ids to rank.
    out_map : pandas.DataFrame
        Mapping table with columns 'snomed_concept_id' and
        'meddra_concept_name'.
    out_rank : pandas.DataFrame
        Ranking table with columns 'Name' and 'Rank score'. 'Name' is
        compared against the upper-cased MedDRA name, exactly.

    Returns
    -------
    pandas.Series
        Aligned with `snomed_ids` (same index). For each id:
        -1 if the id has no row in `out_map` (including a missing id),
         0 if it is mapped but the MedDRA name has no row in `out_rank`,
        otherwise the 'Rank score' of the first matching `out_rank` row.
        When an id or a name occurs more than once, the first row wins.
    """
    # SNOMED id -> upper-cased MedDRA name, taking the first mapping row per id.
    # Rows with a missing id are dropped so that a missing id never "matches".
    meddra_by_snomed = (
        out_map.dropna(subset=['snomed_concept_id'])
        .drop_duplicates(subset='snomed_concept_id')
        .set_index('snomed_concept_id')['meddra_concept_name']
        .str.upper()
    )
    # MedDRA name -> rank score, taking the first ranked row per name.
    rank_by_meddra = (
        out_rank.dropna(subset=['Name'])
        .drop_duplicates(subset='Name')
        .set_index('Name')['Rank score']
    )

    meddra_names = snomed_ids.map(meddra_by_snomed)
    is_mapped = snomed_ids.isin(meddra_by_snomed.index)
    is_ranked = meddra_names.isin(rank_by_meddra.index)

    # Look up the score, then overwrite the two "no match" cases. The
    # unmapped case is applied last so it takes precedence over "unranked".
    ranks = meddra_names.map(rank_by_meddra)
    return ranks.where(is_ranked, 0).where(is_mapped, -1)


#creating outcomes table with unique outcomes and ranks
#assigns -1 for unmapped
#assigns 0 for unranked

outcomes = df[['snomed_outcome_concept_id']].drop_duplicates().reset_index(drop=True)
outcomes['outcome_rank'] = rank_outcomes(
    outcomes['snomed_outcome_concept_id'], outMap, outRank)
outcomes = outcomes.sort_values(by='snomed_outcome_concept_id')
# Retained in case later cells reference it; counts non-null outcome ids.
numOutcomes = outcomes['snomed_outcome_concept_id'].count()

#Putting outcome ranks in main dataframe
df = df.sort_values(by='snomed_outcome_concept_id')
df['outcome_rank'] = rank_outcomes(
    df['snomed_outcome_concept_id'], outMap, outRank)
# Retained in case later cells reference it; counts non-null ids, which is
# only the number of rows when 'id' has no missing values.
numPatients = df['id'].count()

#display counts of outcome ranks
#df.loc[:,['id','outcome_rank']].groupby('outcome_rank').count().sort_values(by='id',ascending=False)