In [40]:
#Data munging
#using Dasgupta script to create risk factor ratios in OFFSPRING dataset
import numpy as np
import pandas as pd
import os
%matplotlib inline
os.chdir('P:\Framingham hip\Framingham Offsping Cohort datasets of Interest')
os.getcwd()

'P:\\Framingham hip\\Framingham Offsping Cohort datasets of Interest'

In [41]:
offspring = pd.read_csv('offspring_full_2000_redone_columns_bmi.csv')

In [42]:
#rename columns to homogenize with 'original' dataset
offspring.rename(columns={'Period_STOP_AGE':'age_periods_stop'},inplace=True)

#Beta_1980 beta blockers in 1980 has a funny coding system.  3 = NO and 4 = YES
recode ={3:0,4:1,5:0}


In [43]:
offspring['Beta_1980'].value_counts()

3.0    3595
4.0     152
5.0      28
Name: Beta_1980, dtype: int64

In [44]:
offspring['Beta_1980'].replace(recode, inplace=True)

In [45]:
offspring['Beta_1980'].value_counts()

0.0    3623
1.0     152
Name: Beta_1980, dtype: int64

In [55]:
offspring['Steroid_2000'].value_counts()   #Steroid is sparsely populated and has no zeros.  I think we will ignore this.
# if necessary, can recode steroid as 0 if person attended that visit.

1.0    55
Name: Steroid_2000, dtype: int64

In [46]:
## Fix type of age columns
age_cols = [u for u in offspring.columns if u.find('age')>-1]
for col in age_cols:
    offspring[col] = pd.to_numeric(offspring[col], errors='coerce')

blah = ((offspring.age2 > offspring.age_periods_stop) & (offspring.age_periods_stop < 45)).astype(int)
blah[pd.isnull(offspring.age2) | pd.isnull(offspring.age_periods_stop)] = np.nan
offspring['menopause_1980'] = blah.copy()

blah = ((offspring.age4 > offspring.age_periods_stop) & (offspring.age_periods_stop < 45)).astype(int)
blah[pd.isnull(offspring.age4) | pd.isnull(offspring.age_periods_stop)] = np.nan
offspring['menopause_1990'] = blah.copy()

blah = ((offspring.age7 > offspring.age_periods_stop) & (offspring.age_periods_stop < 45)).astype(int)
blah[pd.isnull(offspring.age7) | pd.isnull(offspring.age_periods_stop)] = np.nan
offspring['menopause_2000'] = blah.copy()

In [47]:
# Create tidy data set
offspring_long = offspring.melt(id_vars = ['PID'])

risk_factors = offspring_long.loc[offspring_long['variable'].str.contains('[0-9]{4}$')]
risk_factors['variable'] = risk_factors['variable'].str.replace('_wine','wine') # rationalize wine
bl = risk_factors['variable'].str.split('_', expand=True).iloc[:,:2] # Split variables and year
bl = bl.rename(columns = {0:'vars', 1:'year'})
risk_factors = risk_factors.join(bl)
risk_factors['value'] = pd.to_numeric(risk_factors['value'], errors='coerdce') # Make numeric
risk_factors['vars'] = risk_factors['vars'].str.lower() # Make lower case
risk_factors = risk_factors.drop('variable',1)


bl = risk_factors.pivot_table(index = 'PID', values = 'value',
                              columns = ['year','vars'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [48]:
# Aggregate wines for 2000
d = bl['2000'][['redwine','whitewine']]  #no otherwine in offspring dataset
d['wine'] = d.sum(axis = 1)

bl['2000','wine'] = d['wine']
bl = bl.sort_index(axis=1)
bl = bl.drop([('2000','redwine'),('2000','whitewine')],1)

In [49]:
bl

year,1980,1980,1980,1980,1980,1980,1980,1980,1990,1990,...,2000,2000,2000,2000,2000,2000,2000,2000,2000,conj
vars,beer,beta,bmi,cocktail,estrogen,menopause,smoke,wine,beer,beta,...,beta,bisphosphonates,bmi,cocktail,estrogen,menopause,smoke,steroid,wine,estrogen
PID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2924,0.0,0.0,22.648361,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,25.32,0.0,0.0,0.0,0.0,,0.0,0.0
3297,36.0,0.0,26.476584,0.0,8.0,,1.0,0.0,,,...,,,,,,,,,0.0,
4040,,,,,,,,,,,...,,,,,,,,,0.0,
4061,18.0,1.0,31.644676,0.0,8.0,,1.0,0.0,16.0,1.0,...,1.0,0.0,33.33,0.0,8.0,,0.0,,0.0,8.0
4301,,,,,,,,,,,...,,,,,,,,,0.0,
5350,,,,,,,,,0.0,0.0,...,1.0,0.0,,1.0,2.0,0.0,0.0,,1.0,0.0
6719,,,,,,,,,,,...,,,,,,,,,0.0,
7248,,,,,,,,,0.0,0.0,...,0.0,0.0,32.03,0.0,0.0,0.0,0.0,,0.0,0.0
8059,0.0,0.0,23.840380,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,,0.0,0.0,0.0,1.0,,1.0,0.0
8459,,,,,,,,,,,...,,,,,,,,,0.0,


In [50]:
#==============================================================================
# Fix smoking in 1980
#==============================================================================

bl['1980','smoke'] = np.where(bl['1980','smoke']>0, 1, bl['1980','smoke'])


In [51]:
# Aggregate drinks to get total drinks, then create 
# RF_ETOH = 1 if drinks > 3
#==============================================================================
for u in ['1980','1990','2000']:
    d = bl[u]
    drinks = d[['beer','wine','cocktail']].sum(axis=1)
    drinks[drinks <= 3] = 0
    drinks[drinks > 3] = 1
    bl[u,'rf_etof']= drinks

In [52]:
bl['1980','rf_etof']
#should be rf_etoh, but why quibble.

PID
2924       0.0
3297       1.0
4040       0.0
4061       1.0
4301       0.0
5350       0.0
6719       0.0
7248       0.0
8059       0.0
8459       0.0
9458       0.0
9912       0.0
10791      0.0
11016      0.0
11651      0.0
14463      1.0
14764      0.0
15059      0.0
15398      0.0
18098      0.0
18103      0.0
18844      0.0
19353      0.0
19892      0.0
20206      1.0
22267      0.0
22567      0.0
22996      0.0
24738      0.0
25240      0.0
          ... 
9939621    1.0
9939888    1.0
9940525    0.0
9940870    1.0
9941870    1.0
9943257    0.0
9945338    0.0
9947765    0.0
9947948    0.0
9948725    1.0
9951579    0.0
9957447    0.0
9958150    0.0
9965528    1.0
9967264    0.0
9973742    0.0
9975906    0.0
9977699    1.0
9980291    1.0
9982799    0.0
9982997    0.0
9985882    0.0
9986747    1.0
9987740    1.0
9988354    1.0
9988615    0.0
9989304    0.0
9990078    0.0
9995187    0.0
9995880    1.0
Name: (1980, rf_etof), Length: 5507, dtype: float64

In [54]:
# Summary
#==============================================================================

bl['PID'] = bl.index
risk_factors = bl.melt(id_vars = 'PID')
pd.pivot_table(risk_factors, values ='value', index = 'year', 
               columns = 'vars', aggfunc=np.nanmean).to_excel('RiskFactors_offspring.xlsx')

In [56]:
bl.to_csv('offspring_munging_done_10-6-17.csv')