### Social Network Analysis
##### Combine new variables and create diversity indices
##### Oct 06, 2021

In [79]:
import os
import pandas as pd
import numpy as np
from math import log as ln
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt

In [80]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth', None)

### Loading datasets

In [81]:
df = pd.read_sas('vr_sntwk_2008_m_0641s_v2_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
demo_df = pd.read_sas('vr_wkthru_ex09_1_1001s_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
apoe_df = pd.read_sas('coh_off_apoe_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
ex_df = pd.read_sas('ex1_7s_v2_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape",index='idr')
ed_df = pd.read_sas('vr_np_2018_a_1185s_19.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")


  rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
  rslt[name] = self._string_chunk[js, :]


### Cleaning education df

In [82]:
ed_df['idr'] = ed_df['idr'].astype(int).astype(str)
ed_df = ed_df.groupby('idr',as_index=False).first()
ed_df['EDU_COLLEGE'] = ed_df['EDUCG'].apply(lambda x: 'Some college' if x>1 else 'No college')
ed_df['EDU_HS'] = ed_df['EDUCG'].apply(lambda x: 'HS grad' if x>=1 else 'No HS')

ed_df = ed_df[['idr','EDUCG','EDU_COLLEGE','EDU_HS']]

### Cleaning exam df

In [83]:
ex_df = ex_df.iloc[:,446:509].reset_index()
ex_df['idr'] = ex_df['idr'].astype(int).astype(str)

### Cleaning apoe df

In [84]:
apoe_df['idr'] = apoe_df['idr'].astype(int).astype(str)

### Cleaning demographic df

In [85]:
age_cols = [x for x in demo_df.columns if "AGE" in x]
demo_df = demo_df[['idr','SEX']+age_cols]
demo_df['idr'] = demo_df['idr'].astype(int).astype(str)

### Filtering for Exam 7


In [86]:
w7 = df[(df['idtype']==1) & 
        ((df['alteridtype']==0) & (df['SPELLBEGIN']<359) & (df['SPELLEND']> 359)) | 
        ((df['alteridtype']==1) & (df['SPELLBEGIN']<358) & (df['SPELLEND']> 358))]

w7[['idr','sharealterid']] = w7[['idr','sharealterid']].astype(int).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Creating new categorical variables

In [87]:
w7['RELTYPE'] = w7['ALTERTYPE'].apply(lambda x: 'NONRELATIVE' if 'NR' in x else 'RELATIVE' )
# w7_df = w7.groupby(['idr','sharealterid','RELTYPE'],as_index=False).first()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w7['RELTYPE'] = w7['ALTERTYPE'].apply(lambda x: 'NONRELATIVE' if 'NR' in x else 'RELATIVE' )


In [88]:
apoe_df['APOE4'] = apoe_df['APOE'].apply(lambda x: '+' if x in [34,24, 44] else ('-' if x not in [34,24,44] else x))



### Removing duplicate ties, keeping only closest tie categories

In [89]:
tie_set = set(list(zip(w7['idr'],w7['sharealterid'])))
tie_list = list(zip(w7['idr'],w7['sharealterid']))
#3000 or so cases of multiple ego/alter ties
len(tie_list) - len(tie_set)

2412

In [90]:
dupe_set = set([x for x in tie_list if tie_list.count(x) > 1])
dupe_df = w7[w7.apply(lambda x: tuple(x[['idr','sharealterid']].values) in dupe_set,axis=1)]
dupe_df.shape

(4090, 34)

In [91]:
#for ranking
tie_rank = dict(zip(['SPOUSE',
           'SAMEADNREL',
           'INGHBRNREL',
           'N25MNREL',
           'N100MNREL',
           'RELATIVENR',
           'FRIENDNR',
           'COWORKERNR'],range(1,9)))

In [92]:
dupe_df['rank']= dupe_df['ALTERTYPE'].replace(tie_rank)
dupe_df = dupe_df.sort_values(by=['idr','sharealterid','rank','ALTERTYPE'])
dupe_group_df = dupe_df.groupby(['idr','sharealterid'],as_index=False).first()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupe_df['rank']= dupe_df['ALTERTYPE'].replace(tie_rank)


In [93]:
w7 = w7[w7.apply(lambda x: tuple(x[['idr','sharealterid']].values) not in dupe_set,axis=1)]

w7_df = pd.concat([w7,dupe_group_df.drop('rank',axis=1)])
w7_df['RELTYPE'] = w7_df['ALTERTYPE'].apply(lambda x: 'NONRELATIVE' if 'NR' in x else 'RELATIVE' )
w7_df['FAMILIAL']=w7_df['RELTYPE']
# w7_df = w7_df.drop(w7_df.columns[w7_df.isna().sum()>0],axis=1)

In [94]:
w7_df

Unnamed: 0,ALTERTYPE,CAUSEINIT,CAUSESEVERED,DISTMI1,DISTMI2,DISTMI3,DISTMI4,DISTMI5,DISTMI6,DISTMI7,DISTMI8,EGO_TREIMAN1,EGO_TREIMAN2,EGO_TREIMAN3,EGO_TREIMAN4,EGO_TREIMAN5,EGO_TREIMAN6,EGO_TREIMAN7,EGO_TREIMAN8,ALTER_TREIMAN1,ALTER_TREIMAN2,ALTER_TREIMAN3,ALTER_TREIMAN4,ALTER_TREIMAN5,ALTER_TREIMAN6,ALTER_TREIMAN7,ALTER_TREIMAN8,SPELLBEGIN,SPELLEND,idtype,alteridtype,idr,sharealterid,RELTYPE,FAMILIAL
1,SPOUSE,NAMED,CENSORED,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,49.0,49.0,49.0,49.0,53.0,49.0,49.0,,,,33.0,,,,,1.0,457.0,1.0,1.0,2450817372,2450616932,RELATIVE,RELATIVE
2,RELATIVENR,NAMED,CENSORED,2470.044117,2469.136853,2476.733168,2476.733168,2476.733168,537.573444,538.721584,538.721584,,49.0,49.0,49.0,49.0,53.0,49.0,49.0,,,,,,,,,238.0,457.0,1.0,1.0,2450817372,2450896375,NONRELATIVE,NONRELATIVE
42,SISTER,NAMED,CENSORED,11.955119,8.806368,14.242033,44.314864,15.900695,210.678908,210.678908,234.479221,,,42.0,42.0,42.0,,,,,,,,,,,,1.0,457.0,1.0,1.0,2450821577,2450526418,RELATIVE,RELATIVE
66,SPOUSE,NAMED,EGO DEATH,0.000000,0.000000,0.000000,1210.561323,0.000000,0.000000,0.000000,,32.0,,,,,,,,53.0,53.0,38.0,38.0,,,34.0,,1.0,385.0,1.0,1.0,2450222551,2450746521,RELATIVE,RELATIVE
82,SPOUSE,NAMED,CENSORED,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,,,,,,56.0,56.0,,,,,,57.0,1.0,457.0,1.0,1.0,2450600263,2450345955,RELATIVE,RELATIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673,N25MNREL,NAMED,ALTER DEATH,0.012018,0.012018,2.715875,2.715875,2.715875,2.715875,0.012018,,,37.0,37.0,,,,,57.0,53.0,,,,,,,,28.0,361.0,1.0,0.0,2450946221,2450808174,NONRELATIVE,NONRELATIVE
1674,N25MNREL,NAMED,NOT NAMED,4.045757,0.008806,0.008806,0.008806,0.008806,0.008806,0.008806,18.582936,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,57.0,57.0,57.0,57.0,,57.0,,,125.0,425.0,1.0,1.0,2450946437,2450476340,NONRELATIVE,NONRELATIVE
1675,N25MNREL,NAMED,NOT NAMED,4.045757,0.008806,0.008806,0.008806,0.008806,0.008806,0.008806,18.582936,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,42.0,42.0,,50.0,50.0,,,,118.0,425.0,1.0,1.0,2450946437,2450843719,NONRELATIVE,NONRELATIVE
1676,N25MNREL,NAMED,EGO DEATH,0.011518,0.011518,0.011518,0.011518,0.011518,0.011518,0.011518,,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,,,,,,,,,37.0,406.0,0.0,1.0,2450946581,2450798167,NONRELATIVE,NONRELATIVE


### Merge dfs

In [95]:
comb_df = w7_df.merge(
    demo_df,on='idr',how='left').merge(
    ex_df,on='idr',how='left').merge(
    ed_df,on='idr',how='left').merge(
    apoe_df.drop('idtype',axis=1),on='idr',how='left')


In [96]:
comb_df

Unnamed: 0,ALTERTYPE,CAUSEINIT,CAUSESEVERED,DISTMI1,DISTMI2,DISTMI3,DISTMI4,DISTMI5,DISTMI6,DISTMI7,DISTMI8,EGO_TREIMAN1,EGO_TREIMAN2,EGO_TREIMAN3,EGO_TREIMAN4,EGO_TREIMAN5,EGO_TREIMAN6,EGO_TREIMAN7,EGO_TREIMAN8,ALTER_TREIMAN1,ALTER_TREIMAN2,ALTER_TREIMAN3,ALTER_TREIMAN4,ALTER_TREIMAN5,ALTER_TREIMAN6,ALTER_TREIMAN7,ALTER_TREIMAN8,SPELLBEGIN,SPELLEND,idtype,alteridtype,idr,sharealterid,RELTYPE,FAMILIAL,SEX,AGE1,AGE2,AGE3,AGE4,AGE5,AGE6,AGE7,AGE8,AGE9,G587,G588,G589,G590,G591,G592,G593,G594,G595,G596,G597,G598,G599,G600,G601,G602,G603,G604,G605,G606,G608,G609,G610,G611,G612,G613,G614,G615,G616,G617,G618,G619,G620,G621,G622,G623,G624,G626,G627,G629,G630,G632,G633,G635,G636,G638,G639,G641,G642,G644,G645,G646,G647,G648,G649,G650,G651,G652,G653,G654,G655,G656,G657,EDUCG,EDU_COLLEGE,EDU_HS,APOE,APOE4
0,SPOUSE,NAMED,CENSORED,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,49.0,49.0,49.0,49.0,53.0,49.0,49.0,,,,33.0,,,,,1.0,457.0,1.0,1.0,2450817372,2450616932,RELATIVE,RELATIVE,2.0,35.0,,,50.0,,,60.0,,,2.0,0.0,0.0,3.0,1.0,1.0,0.0,3.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,8.0,1.0,1.0,1.0,1.0,1.0,5.0,8.0,8.0,1.0,5.0,1.0,5.0,1.0,5.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,1.0,3.0,4.0,4.0,4.0,3.0,2.0,Some college,HS grad,33.0,-
1,RELATIVENR,NAMED,CENSORED,2470.044117,2469.136853,2476.733168,2476.733168,2476.733168,537.573444,538.721584,538.721584,,49.0,49.0,49.0,49.0,53.0,49.0,49.0,,,,,,,,,238.0,457.0,1.0,1.0,2450817372,2450896375,NONRELATIVE,NONRELATIVE,2.0,35.0,,,50.0,,,60.0,,,2.0,0.0,0.0,3.0,1.0,1.0,0.0,3.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,8.0,1.0,1.0,1.0,1.0,1.0,5.0,8.0,8.0,1.0,5.0,1.0,5.0,1.0,5.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,1.0,3.0,4.0,4.0,4.0,3.0,2.0,Some college,HS grad,33.0,-
2,SISTER,NAMED,CENSORED,11.955119,8.806368,14.242033,44.314864,15.900695,210.678908,210.678908,234.479221,,,42.0,42.0,42.0,,,,,,,,,,,,1.0,457.0,1.0,1.0,2450821577,2450526418,RELATIVE,RELATIVE,2.0,35.0,,47.0,51.0,54.0,58.0,59.0,66.0,71.0,1.0,0.0,1.0,3.0,0.0,1.0,0.0,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,8.0,1.0,5.0,1.0,5.0,1.0,5.0,8.0,8.0,1.0,5.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,3.0,1.0,4.0,3.0,4.0,1.0,No college,HS grad,34.0,+
3,SPOUSE,NAMED,EGO DEATH,0.000000,0.000000,0.000000,1210.561323,0.000000,0.000000,0.000000,,32.0,,,,,,,,53.0,53.0,38.0,38.0,,,34.0,,1.0,385.0,1.0,1.0,2450222551,2450746521,RELATIVE,RELATIVE,1.0,54.0,,66.0,70.0,,,79.0,,,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.0,8.0,8.0,8.0,8.0,8.0,,,,,,,,,0.0,0.0,0.0,0.0,0.0,4.0,,1.0,3.0,0.0,3.0,0.0,3.0,,,,,
4,SPOUSE,NAMED,CENSORED,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,,,,,,56.0,56.0,,,,,,57.0,1.0,457.0,1.0,1.0,2450600263,2450345955,RELATIVE,RELATIVE,2.0,36.0,,,,,,,67.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17221,N25MNREL,NAMED,ALTER DEATH,0.012018,0.012018,2.715875,2.715875,2.715875,2.715875,0.012018,,,37.0,37.0,,,,,57.0,53.0,,,,,,,,28.0,361.0,1.0,0.0,2450946221,2450808174,NONRELATIVE,NONRELATIVE,2.0,31.0,38.0,43.0,46.0,50.0,54.0,56.0,62.0,68.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,8.0,1.0,5.0,1.0,5.0,1.0,5.0,8.0,8.0,1.0,5.0,1.0,5.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,5.0,0.0,1.0,3.0,3.0,4.0,4.0,4.0,2.0,Some college,HS grad,33.0,-
17222,N25MNREL,NAMED,NOT NAMED,4.045757,0.008806,0.008806,0.008806,0.008806,0.008806,0.008806,18.582936,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,57.0,57.0,57.0,57.0,,57.0,,,125.0,425.0,1.0,1.0,2450946437,2450476340,NONRELATIVE,NONRELATIVE,1.0,29.0,38.0,42.0,45.0,49.0,53.0,56.0,63.0,69.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,3.0,2.0,3.0,2.0,2.0,3.0,Some college,HS grad,33.0,-
17223,N25MNREL,NAMED,NOT NAMED,4.045757,0.008806,0.008806,0.008806,0.008806,0.008806,0.008806,18.582936,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,42.0,42.0,,50.0,50.0,,,,118.0,425.0,1.0,1.0,2450946437,2450843719,NONRELATIVE,NONRELATIVE,1.0,29.0,38.0,42.0,45.0,49.0,53.0,56.0,63.0,69.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,3.0,2.0,3.0,2.0,2.0,3.0,Some college,HS grad,33.0,-
17224,N25MNREL,NAMED,EGO DEATH,0.011518,0.011518,0.011518,0.011518,0.011518,0.011518,0.011518,,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,,,,,,,,,37.0,406.0,0.0,1.0,2450946581,2450798167,NONRELATIVE,NONRELATIVE,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Creating new categorical variables for age, cesd, etc.

In [97]:
comb_df['AGE35']= comb_df['AGE7'].apply(lambda x: '<35' if x<35 else '>=35')
comb_df['AGE65']= comb_df['AGE7'].apply(lambda x: '<65' if x<65 else '>=65')
comb_df['AGE_Q']= pd.cut(comb_df['AGE7'],4)
comb_df['AGE_10']= pd.cut(comb_df['AGE7'],range(0,110,10),labels=['<=10','<=20','<=30','<=40','<=50',
                                                                 '<=60','<=70','<=80','<=90','<=100'])

In [98]:
comb_df['CESD_TOT'] = comb_df.loc[:,'G587':'G606'].apply(lambda x: sum(x),axis=1)
comb_df['CESD_16'] = comb_df['CESD_TOT'].apply(lambda x: '>=16' if x>=16 else ('<16' if x<16 else x))
comb_df['CESD_20'] = comb_df['CESD_TOT'].apply(lambda x: '>=20' if x>=20 else ('<20' if x<20 else x))

In [99]:
comb_df['CESD_LONELINESS'] = comb_df['G600']
comb_df['LONELY_3'] = comb_df['CESD_LONELINESS'].apply(lambda x: '>=3' if x>=2 else ('<3' if x<3 else x))

In [100]:
comb_df['SNI_TOT'] = comb_df.loc[:,'G645':'G657'].apply(lambda x: sum(x),axis=1)
comb_df['SNI_2'] = comb_df['SNI_TOT'].apply(lambda x: '>2' if x>2 else ('<=2' if x<=2 else x))

In [101]:
comb_df['DIST1_5'] = comb_df['DISTMI1'].apply(lambda x: 'NEAR' if x<= 5 else 'FAR')
comb_df['DIST7_5'] = comb_df['DISTMI7'].apply(lambda x: 'NEAR' if x<= 5 else 'FAR')

CES-D: G587-G606

Loneliness : G600

SNI: G645-G657

### Create Diversity Index

In [102]:
def create_ratio(df,attr,val):
    ratio_list = []
    ego_list = df['idr'].unique()

    for e in ego_list:
        ego_df = df[df['idr']==e]
        alter_list = ego_df['sharealterid'].unique()  
        net_df = df[df['idr'].isin(alter_list)].groupby(['idr'],as_index=False).first()
        div_dict = dict(net_df[attr].value_counts())
        
        if val in div_dict:
            ratio = float(div_dict[val]/sum(div_dict.values()))
            ratio_list.append(ratio)
        else:
            ratio_list.append(0)
            
    res_df = pd.DataFrame({'idr':ego_list,(attr+'_'+str(val)+'_RATIO'):ratio_list})

    return res_df
        

In [103]:
### NEED TO ADD DISTANCE info
ratio_df = pd.DataFrame({'idr':comb_df['idr'].unique()})
for attr,val in [('FAMILIAL','RELATIVE'),('APOE4','+'),('DIST1_5','NEAR'),('DIST7_5','NEAR')]:
    temp = create_ratio(comb_df,attr,val)
    ratio_df = ratio_df.merge(temp,on='idr',how='left')

In [104]:
def create_diversity_index(df,attr):
    shannon_list = []
#     simpson_list = []
    ego_list = df['idr'].unique()
    
    for e in ego_list:
        #select network
        ego_df = df[df['idr']==e]
        alter_list = ego_df['sharealterid'].unique()
        #get network info using alters 'ego' attributes
        net_df = df[df['idr'].isin(alter_list)].groupby(['idr'],as_index=False).first()
        div_dict = dict(net_df[attr].value_counts())

        #NA if no tie info
        if div_dict:
        
        #shannon index
            shannon_h = 0
            for cat in div_dict:
                p = float(div_dict[cat]/sum(div_dict.values()))
                temp = float(p*(np.log(p)))
                if np.isfinite(temp):
                    shannon_h += temp      
            if shannon_h < 0:
                shannon_h = -(shannon_h)
            shannon_list.append(shannon_h)
        
        
        #simpson index
#             simpson_d = 0
#             for cat in div_dict:
#                 p = float(div_dict[cat]/sum(div_dict.values()))
#                 temp = float(p**2)
#                 simpson_d+= temp
#             simpson_d = 1 - simpson_d
#             simpson_list.append(simpson_d)

        
        else:
            shannon_list.append(np.nan)
#             simpson_list.append(np.nan)

#     new_df = pd.DataFrame({'idr':ego_list,(attr+'_SHANNON'):shannon_list,(attr+'_SIMPSON'):simpson_list})
    new_df = pd.DataFrame({'idr':ego_list,(attr+'_SHANNON'):shannon_list})

    return new_df
    

In [105]:
div_df= pd.DataFrame({'idr':comb_df['idr'].unique()})
for col in ['SEX','RELTYPE','ALTERTYPE','AGE35','AGE65','AGE_Q','AGE_10','SNI_TOT','SNI_2','EDU_COLLEGE','EDU_HS']:
    temp = create_diversity_index(comb_df,col)
    div_df = div_df.merge(temp,on='idr',how='left')

  temp = float(p*(np.log(p)))
  temp = float(p*(np.log(p)))
  p = float(div_dict[cat]/sum(div_dict.values()))
  temp = float(p*(np.log(p)))


In [106]:
## TO DO: check if these egos really dont have alters
div_df.isna().sum()

idr                      0
SEX_SHANNON            101
RELTYPE_SHANNON          8
ALTERTYPE_SHANNON        8
AGE35_SHANNON            8
AGE65_SHANNON            8
AGE_Q_SHANNON            0
AGE_10_SHANNON           0
SNI_TOT_SHANNON        454
SNI_2_SHANNON          454
EDU_COLLEGE_SHANNON    441
EDU_HS_SHANNON         441
dtype: int64

In [107]:
div_df.shape

(4578, 12)

In [108]:
div_df.describe()

Unnamed: 0,SEX_SHANNON,RELTYPE_SHANNON,ALTERTYPE_SHANNON,AGE35_SHANNON,AGE65_SHANNON,AGE_Q_SHANNON,AGE_10_SHANNON,SNI_TOT_SHANNON,SNI_2_SHANNON,EDU_COLLEGE_SHANNON,EDU_HS_SHANNON
count,4477.0,4570.0,4570.0,4570.0,4570.0,4578.0,4578.0,4124.0,4124.0,4137.0,4137.0
mean,0.376263,0.29488,0.575432,0.000444,0.312609,0.297783,0.376263,0.719185,0.000289,0.263395,0.125605
std,0.32257,0.311491,0.462008,0.015699,0.313123,0.376459,0.420499,0.624682,0.013311,0.312683,0.24292
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.562335,0.0,0.636514,0.0,0.348832,0.0,0.0,0.693147,0.0,0.0,0.0
75%,0.682908,0.636514,0.9557,0.0,0.636514,0.636514,0.693147,1.098612,0.0,0.636514,0.0
max,0.693147,0.693147,1.970181,0.636514,0.693147,1.386294,1.609438,2.479956,0.693147,0.693147,0.693147


In [113]:
export_df = comb_df[['idr',
                     'AGE35',
                     'AGE65',
                     'AGE_Q',
                     'AGE_10',
                     'sharealterid',
                     'RELTYPE',
                     'FAMILIAL',
                     'ALTERTYPE',
                     'SEX',
                     'AGE7',
                     'idtype',
                     'alteridtype',
                     'SPELLBEGIN',
                     'SPELLEND',
                     'APOE',
                     'EDUCG',
                     'CESD_TOT',
                     'CESD_16',
                     'CESD_20',
                     'CESD_LONELINESS',
                     'LONELY_3',
                     'SNI_TOT',
                     'SNI_2',
                     'DIST1_5',
                     'DIST7_5'
                    ]].merge(div_df,on='idr').merge(ratio_df,on='idr')

In [114]:
export_df.groupby('idr',as_index=False).first()

Unnamed: 0,idr,AGE35,AGE65,AGE_Q,AGE_10,sharealterid,RELTYPE,FAMILIAL,ALTERTYPE,SEX,AGE7,idtype,alteridtype,SPELLBEGIN,SPELLEND,APOE,EDUCG,CESD_TOT,CESD_16,CESD_20,CESD_LONELINESS,LONELY_3,SNI_TOT,SNI_2,DIST1_5,DIST7_5,SEX_SHANNON,RELTYPE_SHANNON,ALTERTYPE_SHANNON,AGE35_SHANNON,AGE65_SHANNON,AGE_Q_SHANNON,AGE_10_SHANNON,SNI_TOT_SHANNON,SNI_2_SHANNON,EDU_COLLEGE_SHANNON,EDU_HS_SHANNON,FAMILIAL_RELATIVE_RATIO,APOE4_+_RATIO,DIST1_5_NEAR_RATIO,DIST7_5_NEAR_RATIO
0,2450200061,>=35,<65,"(47.25, 61.5]",<=60,2450649612,NONRELATIVE,NONRELATIVE,N100MNREL,1.0,56.0,1.0,1.0,40.0,457.0,34.0,,16.0,>=16,<20,0.0,<3,29.0,>2,NEAR,NEAR,0.693147,0.636514,1.149060,0.0,0.636514,0.955700,1.277034,1.747868,0.0,0.562335,0.000000,0.333333,0.50,1.000000,0.666667
1,2450200238,>=35,>=65,,,2450459583,RELATIVE,RELATIVE,BROTHER,2.0,,1.0,1.0,1.0,457.0,,,,,,,,,,NEAR,NEAR,0.000000,0.000000,0.693147,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.00,1.000000,0.500000
2,2450200493,>=35,>=65,,,2450641679,RELATIVE,RELATIVE,SISTER,2.0,,1.0,1.0,1.0,457.0,,,,,,,,,,FAR,FAR,0.693147,0.000000,0.500402,0.0,0.673012,0.636514,0.636514,1.098612,0.0,0.000000,0.000000,1.000000,0.75,0.600000,0.000000
3,2450200742,>=35,<65,"(47.25, 61.5]",<=60,2450709923,RELATIVE,RELATIVE,SPOUSE,1.0,54.0,1.0,1.0,1.0,457.0,34.0,3.0,12.0,<16,<20,0.0,<3,27.0,>2,NEAR,NEAR,0.661563,0.661563,1.255482,0.0,0.376770,0.562335,1.039721,1.906155,0.0,0.000000,0.000000,0.625000,0.25,0.750000,0.875000
4,2450201178,>=35,<65,"(32.943, 47.25]",<=40,2450701398,RELATIVE,RELATIVE,SISTER,2.0,37.0,1.0,1.0,1.0,457.0,33.0,1.0,29.0,>=16,>=20,1.0,<3,25.0,>2,NEAR,NEAR,0.636514,0.450561,1.011404,0.0,0.450561,0.500402,0.673012,1.609438,0.0,0.636514,0.000000,0.833333,0.00,0.666667,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4573,2450949019,>=35,<65,"(61.5, 75.75]",<=70,2450433007,NONRELATIVE,NONRELATIVE,FRIENDNR,1.0,63.0,1.0,1.0,271.0,430.0,44.0,1.0,12.0,<16,<20,0.0,<3,,,FAR,FAR,0.000000,0.000000,0.693147,0.0,0.693147,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.50,0.500000,0.500000
4574,2450949062,>=35,<65,"(47.25, 61.5]",<=60,2450608245,NONRELATIVE,NONRELATIVE,N100MNREL,2.0,56.0,1.0,1.0,314.0,448.0,34.0,,8.0,<16,<20,0.0,<3,32.0,>2,FAR,NEAR,0.636514,0.693147,0.693147,0.0,0.562335,0.636514,0.636514,1.098612,0.0,0.693147,0.000000,0.500000,0.00,0.750000,1.000000
4575,2450949332,>=35,<65,"(47.25, 61.5]",<=60,2450392366,RELATIVE,RELATIVE,SPOUSE,2.0,57.0,1.0,1.0,1.0,457.0,33.0,3.0,12.0,<16,<20,0.0,<3,27.0,>2,NEAR,NEAR,0.693147,0.562335,1.039721,0.0,0.693147,0.000000,0.000000,0.693147,0.0,0.693147,0.693147,0.750000,0.00,1.000000,1.000000
4576,2450949473,>=35,>=65,"(75.75, 90.0]",<=80,2450748270,RELATIVE,RELATIVE,SPOUSE,2.0,76.0,1.0,1.0,1.0,380.0,33.0,1.0,14.0,<16,<20,0.0,<3,20.0,>2,NEAR,NEAR,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.00,1.000000,1.000000


In [115]:
export_df['AGE7']

0        60.0
1        60.0
2        59.0
3        79.0
4         NaN
         ... 
17221     NaN
17222     NaN
17223     NaN
17224     NaN
17225     NaN
Name: AGE7, Length: 17226, dtype: float64

In [116]:
#to share
export_df.to_csv('fhs_sn_diversity_wave7.csv')