In [136]:
#Data munging
#using Dasgupta script to create risk factor ratios in ORIGINAL dataset
import numpy as np
import pandas as pd
import os
%matplotlib inline
# Switch to the folder holding the source datasets.
# A raw string is used because the Windows path contains backslashes (\F, \O)
# that are not valid escape sequences; newer Python versions warn about them in
# an ordinary string literal. The resulting path text is unchanged.
# NOTE(review): hard-coded absolute path ties the notebook to one machine.
os.chdir(r'P:\Framingham hip\Original datasets of interest')
os.getcwd()

'P:\\Framingham hip\\Original datasets of interest'

In [137]:
# Load the source table; the relative file name is resolved against the
# current working directory.
original = pd.read_csv(filepath_or_buffer='original_full_columns_renamed.csv')

In [138]:
# beta_1980 (beta blockers in 1980) uses an unusual coding: 3 = NO and 4 = YES.
# The column is read in as strings, so the mapping keys are strings as well.
# NOTE(review): '5' is also mapped to 0 -- the meaning of code 5 is not
# documented here; confirm against the codebook.
recode = {'3': 0, '4': 1, '5': 0}

# Assign the result back instead of calling replace(inplace=True) on the
# selected column: the in-place form operates on an intermediate Series
# (chained assignment) and does not update `original` under pandas
# Copy-on-Write.
original['beta_1980'] = original['beta_1980'].replace(recode)

In [139]:
# Sanity check: tally of each distinct beta_1980 value after recoding.
original.loc[:, 'beta_1980'].value_counts()

0    2019
1     215
        2
Name: beta_1980, dtype: int64

In [140]:
# Give Beer1980 the same <name>_<year> shape as the other yearly columns so it
# does not cause problems later.
original = original.rename(columns={'Beer1980': 'beer_1980'})

In [141]:
# estrogen = 8 indicates the patient is a man; the analysis makes most sense
# with that recoded as missing (NaN).
# estrogen = 2 means they took it in the past but not now; recode as 0.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
recode_estrogen = {8.0: np.nan, 2.0: 0}

In [142]:

# Apply the estrogen recode to the 2000 column. The result is assigned back
# rather than using replace(inplace=True) on the selected column, which is
# chained assignment and does not update `original` under pandas Copy-on-Write.
original['estrogen_2000'] = original['estrogen_2000'].replace(recode_estrogen)
# Tally of the recoded values (value_counts leaves NaN out by default).
original['estrogen_2000'].value_counts()

0.0    355
1.0     23
Name: estrogen_2000, dtype: int64

In [143]:
# estrogen_1990 and Estrogen_1980 are read in as strings, so this version of
# the recode map uses string keys ("8" -> missing, "2" -> 0).
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
recode_estrogen = {"8": np.nan, "2": 0}

# Assign back instead of replace(inplace=True) on a selected column (chained
# assignment, which does not update `original` under pandas Copy-on-Write).
for col in ('estrogen_1990', 'Estrogen_1980'):
    original[col] = original[col].replace(recode_estrogen)

In [144]:
# Sanity check: tally of each distinct Estrogen_1980 value after recoding.
original.loc[:, 'Estrogen_1980'].value_counts()

0    1995
1      30
        3
0       1
Name: Estrogen_1980, dtype: int64

In [145]:
# Sanity check: tally of each distinct estrogen_1990 value after recoding.
original.loc[:, 'estrogen_1990'].value_counts()

0    1175
1      33
        1
0       1
Name: estrogen_1990, dtype: int64

In [146]:
# Sanity check: tally of each distinct estrogen_2000 value.
original.loc[:, 'estrogen_2000'].value_counts()

0.0    355
1.0     23
Name: estrogen_2000, dtype: int64

In [147]:
#==============================================================================
# Identify menopause status by year
#==============================================================================

## Fix type of age columns
# Every column whose name contains 'age' is coerced to numeric; entries that
# cannot be parsed become NaN.
age_cols = [u for u in original.columns if 'age' in u]
for col in age_cols:
    original[col] = pd.to_numeric(original[col], errors='coerce')

# Age column compared against age_periods_stop for each year's flag.
menopause_age_cols = {'1980': 'age16', '1990': 'age21', '2000': 'age26'}

periods_stop = original['age_periods_stop']
for year, age_col in menopause_age_cols.items():
    age = original[age_col]
    # 1 when the age exceeds age_periods_stop AND age_periods_stop is below 45,
    # otherwise 0. Built as float so that missing values can be stored:
    # writing NaN into an integer Series relies on an implicit upcast that
    # newer pandas versions deprecate.
    flag = ((age > periods_stop) & (periods_stop < 45)).astype(float)
    # The flag is unknown (NaN) when either age is missing.
    unknown = age.isnull() | periods_stop.isnull()
    original['menopause_' + year] = flag.mask(unknown)

In [148]:
#==============================================================================
# Create tidy data set
#==============================================================================

# Long format: one row per (PID, original column name, value).
original_long = original.melt(id_vars=['PID'])

# Keep only the variables whose name ends in a four-digit year. The explicit
# .copy() makes this an independent frame, so the column assignments below no
# longer trigger SettingWithCopyWarning.
is_yearly = original_long['variable'].str.contains(r'[0-9]{4}$')
risk_factors = original_long.loc[is_yearly].copy()

risk_factors['variable'] = risk_factors['variable'].str.replace('_wine', 'wine')  # rationalize wine
# Split "<name>_<year>" into its first two pieces: variable name and year.
name_parts = risk_factors['variable'].str.split('_', expand=True).iloc[:, :2]
name_parts = name_parts.rename(columns={0: 'vars', 1: 'year'})
risk_factors = risk_factors.join(name_parts)
risk_factors['value'] = pd.to_numeric(risk_factors['value'], errors='coerce')  # Make numeric
risk_factors['vars'] = risk_factors['vars'].str.lower()  # Make lower case
# axis is passed by keyword: the positional form drop('variable', 1) is
# rejected by pandas 2.x.
risk_factors = risk_factors.drop('variable', axis=1)

# Wide table: one row per PID, columns keyed by (year, vars).
bl = risk_factors.pivot_table(index='PID', values='value',
                              columns=['year', 'vars'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [149]:
#==============================================================================
# Aggregate wines for 2000
#==============================================================================

wine_types = ['redwine', 'whitewine', 'otherwine']
wine_2000 = bl['2000'][wine_types]

# Total of the three 2000 wine variables. sum() skips NaN and would return 0
# for a row where all three are missing, so such rows are put back to NaN;
# this keeps "no data" distinct from "no wine", as in the single 1980/1990
# wine columns.
has_wine_data = wine_2000.notnull().any(axis=1)
bl['2000', 'wine'] = wine_2000.sum(axis=1).where(has_wine_data)

bl = bl.sort_index(axis=1)
# axis is passed by keyword: the positional form drop(labels, 1) is rejected
# by pandas 2.x.
bl = bl.drop([('2000', wine_type) for wine_type in wine_types], axis=1)

In [150]:
#==============================================================================
# Fix smoking in 1980
#==============================================================================

# Any positive 1980 smoke value is collapsed to 1; all other entries
# (including missing ones) are left as they are.
smoke_1980 = bl['1980', 'smoke']
bl['1980', 'smoke'] = smoke_1980.mask(smoke_1980 > 0, 1)

In [151]:
# Show every (year, variable) column key currently in the wide table.
bl.columns.tolist()


[('1980', 'beer'),
 ('1980', 'beta'),
 ('1980', 'bmi'),
 ('1980', 'cocktail'),
 ('1980', 'diab'),
 ('1980', 'estrogen'),
 ('1980', 'menopause'),
 ('1980', 'smoke'),
 ('1980', 'steroid'),
 ('1980', 'wine'),
 ('1990', 'beer'),
 ('1990', 'beta'),
 ('1990', 'bmi'),
 ('1990', 'cocktail'),
 ('1990', 'diab'),
 ('1990', 'estrogen'),
 ('1990', 'menopause'),
 ('1990', 'smoke'),
 ('1990', 'steroid'),
 ('1990', 'wine'),
 ('2000', 'beer'),
 ('2000', 'beta'),
 ('2000', 'bisphosphonate'),
 ('2000', 'bmi'),
 ('2000', 'cocktail'),
 ('2000', 'diab'),
 ('2000', 'estrogen'),
 ('2000', 'menopause'),
 ('2000', 'smoke'),
 ('2000', 'steroid'),
 ('2000', 'wine')]

In [152]:
#==============================================================================
# Aggregate drinks to get total drinks, then create 
# RF_ETOH = 1 if drinks > 3
#==============================================================================
# NOTE(review): the column is written as 'rf_etof' although the header says
# RF_ETOH -- probably a typo, but the name is kept because later code or
# files may depend on it.
drink_cols = ['beer', 'wine', 'cocktail']
for u in ['1980', '1990', '2000']:
    consumed = bl[u][drink_cols]
    drinks = consumed.sum(axis=1)
    # 1 when the total exceeds 3, otherwise 0 (float so NaN can be stored).
    heavy = (drinks > 3).astype(float)
    # sum() skips NaN, so a subject with no drink data at all for the year
    # would total 0 and be classified as 0; keep those rows missing instead.
    has_drink_data = consumed.notnull().any(axis=1)
    bl[u, 'rf_etof'] = heavy.where(has_drink_data)

In [154]:
#==============================================================================
# Summary
#==============================================================================

# Copy the PID index into a column so melt can carry it as the identifier.
bl['PID'] = bl.index
risk_factors = bl.melt(id_vars='PID')

# Mean of each variable per year (NaN ignored), written out as a spreadsheet.
yearly_means = pd.pivot_table(risk_factors, index='year', columns='vars',
                              values='value', aggfunc=np.nanmean)
yearly_means.to_excel('RiskFactors_original.xlsx')

In [156]:
# Write the wide risk-factor table to disk in CSV format.
# NOTE(review): the file name carries no .csv extension.
munged_path = 'original.munging_done_10-9-17'
bl.to_csv(munged_path)

NameError: name 'offspring' is not defined