### Social Network Analysis
##### Combine new variables and create diversity indices
##### Oct 06, 2021

In [62]:
import os
import pandas as pd
import numpy as np
from math import log as ln
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt

In [63]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth', None)

### Loading datasets

In [64]:
df = pd.read_sas('vr_sntwk_2008_m_0641s_v2_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
demo_df = pd.read_sas('vr_wkthru_ex09_1_1001s_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
apoe_df = pd.read_sas('coh_off_apoe_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
ex_df = pd.read_sas('ex1_7s_v2_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape",index='idr')
ed_df = pd.read_sas('vr_np_2018_a_1185s_19.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")


  rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
  rslt[name] = self._string_chunk[js, :]


### Cleaning education df

In [65]:
ed_df['idr'] = ed_df['idr'].astype(int).astype(str)
ed_df = ed_df.groupby('idr',as_index=False).first()
ed_df['EDU_COLLEGE'] = ed_df['EDUCG'].apply(lambda x: 'Some college' if x>1 else ('No college' if x<=1 else x))
ed_df = ed_df[['idr','EDUCG','EDU_COLLEGE']]

### Cleaning exam df

In [66]:
ex_df = ex_df.iloc[:,446:509].reset_index()
ex_df['idr'] = ex_df['idr'].astype(int).astype(str)

### Cleaning apoe df

In [67]:
apoe_df['idr'] = apoe_df['idr'].astype(int).astype(str)

### Cleaning demographic df

In [68]:
age_cols = [x for x in demo_df.columns if "AGE" in x]
demo_df = demo_df[['idr','SEX']+age_cols]
demo_df['idr'] = demo_df['idr'].astype(int).astype(str)

### Filtering for Exam 7


In [69]:
w7 = df[(df['idtype']==1) & 
        ((df['alteridtype']==0) & (df['SPELLBEGIN']<359) & (df['SPELLEND']> 359)) | 
        ((df['alteridtype']==1) & (df['SPELLBEGIN']<358) & (df['SPELLEND']> 358))]

w7[['idr','sharealterid']] = w7[['idr','sharealterid']].astype(int).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Creating new categorical variables

In [70]:
w7['RELTYPE'] = w7['ALTERTYPE'].apply(lambda x: 'NONRELATIVE' if 'NR' in x else 'RELATIVE' )
w7_df = w7.groupby(['idr','sharealterid','RELTYPE'],as_index=False).first()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w7['RELTYPE'] = w7['ALTERTYPE'].apply(lambda x: 'NONRELATIVE' if 'NR' in x else 'RELATIVE' )


In [71]:
apoe_df['APOE4'] = apoe_df['APOE'].apply(lambda x: '+' if x in [34,24, 44] else ('-' if x not in [34,24,44] else x))



### Removing duplicate ties, keeping only closest tie categories

In [72]:
tie_set = set(list(zip(w7['idr'],w7['sharealterid'])))
tie_list = list(zip(w7['idr'],w7['sharealterid']))
#3000 or so cases of multiple ego/alter ties
len(tie_list) - len(tie_set)



2412

In [73]:
dupe_set = set([x for x in tie_list if tie_list.count(x) > 1])
dupe_df = w7[w7.apply(lambda x: tuple(x[['idr','sharealterid']].values) in dupe_set,axis=1)]
dupe_df.shape

(4090, 34)

In [74]:
#for ranking
tie_rank = dict(zip(['SPOUSE',
           'SAMEADNREL',
           'INGHBRNREL',
           'N25MNREL',
           'N100MNREL',
           'RELATIVENR',
           'FRIENDNR',
           'COWORKERNR'],range(1,9)))
tie_rank

{'SPOUSE': 1,
 'SAMEADNREL': 2,
 'INGHBRNREL': 3,
 'N25MNREL': 4,
 'N100MNREL': 5,
 'RELATIVENR': 6,
 'FRIENDNR': 7,
 'COWORKERNR': 8}

In [75]:
dupe_df['rank']= dupe_df['ALTERTYPE'].replace(tie_rank)
dupe_df = dupe_df.sort_values(by=['idr','sharealterid','rank','ALTERTYPE'])
dupe_group_df = dupe_df.groupby(['idr','sharealterid'],as_index=False).first()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupe_df['rank']= dupe_df['ALTERTYPE'].replace(tie_rank)


In [76]:
w7 = w7[w7.apply(lambda x: tuple(x[['idr','sharealterid']].values) not in dupe_set,axis=1)]



In [77]:
w7_df = pd.concat([w7,dupe_group_df.drop('rank',axis=1)])
w7_df['RELTYPE'] = w7_df['ALTERTYPE'].apply(lambda x: 'NONRELATIVE' if 'NR' in x else 'RELATIVE' )
w7_df = w7_df.drop(w7_df.columns[w7_df.isna().sum()>0],axis=1)



### Merge dfs

In [78]:
comb_df = w7_df.merge(
    demo_df,on='idr',how='left').merge(
    ex_df,on='idr',how='left').merge(
    ed_df,on='idr',how='left').merge(
    apoe_df.drop('idtype',axis=1),on='idr',how='left')


### Creating new categorical variables for age, cesd, etc.

In [79]:
comb_df['AGE35']= comb_df['AGE7'].apply(lambda x: '<35' if x<35 else '>=35')
comb_df['AGE65']= comb_df['AGE7'].apply(lambda x: '<65' if x<65 else '>=65')
comb_df['AGE_Q']= pd.cut(comb_df['AGE7'],4)
comb_df['AGE_10']= pd.cut(comb_df['AGE7'],range(0,110,10),labels=['<=10','<=20','<=30','<=40','<=50',
                                                                 '<=60','<=70','<=80','<=90','<=100'])

In [80]:
comb_df['CESD_TOT'] = comb_df.loc[:,'G587':'G606'].apply(lambda x: sum(x),axis=1)
comb_df['CESD_16'] = comb_df['CESD_TOT'].apply(lambda x: '>=16' if x>=16 else ('<16' if x<16 else x))
comb_df['CESD_20'] = comb_df['CESD_TOT'].apply(lambda x: '>=20' if x>=20 else ('<20' if x<20 else x))

In [81]:
comb_df['CESD_LONELINESS'] = comb_df['G600']
comb_df['LONELY_3'] = comb_df['CESD_LONELINESS'].apply(lambda x: '>=3' if x>=2 else ('<3' if x<3 else x))

In [82]:
comb_df['SNI_TOT'] = comb_df.loc[:,'G645':'G657'].apply(lambda x: sum(x),axis=1)
comb_df['SNI_2'] = comb_df['SNI_TOT'].apply(lambda x: '>2' if x>2 else ('<=2' if x<=2 else x))

CES-D: G587-G606

Loneliness : G600

SNI: G645-G657

### Create Diversity Index

In [83]:
#individual network indices
#TO DO : get rid of for loop

def shannon_index(div_dict):

    if div_dict:
        shannon_h = 0
        for cat in div_dict:
            p = float(div_dict[cat]/sum(div_dict.values()))
            shannon_h += float(p*(np.log(p)))      
        if shannon_h < 0:
            shannon_h = -(shannon_h)
    else:
        shannon_h = np.nan
    return shannon_h

def create_diversity_index(df,attr):
#     shannon_list = []
#     simpson_list = []
    ego_alter_list = [df[df['idr']==ego]['sharealterid'].unique() for ego in df['idr'].unique()]
    network_df_list = [df[df['idr'].isin(alter_list)].groupby(['idr'],as_index=False).first() for alter_list in ego_alter_list]
    div_dict_list = [dict(net_df[attr].value_counts()) for net_df in network_df_list]
    print(len(ego_alter_list))
    print(len(network_df_list))
    print(len(div_dict_list))

    h = [shannon_index(div_dict) for div_dict in div_dict_list]
    print(len(h))
    return h

In [84]:
def create_diversity_index(df,attr):
    shannon_list = []
    simpson_list = []
    ego_list = df['idr'].unique()
    
    for e in ego_list:
        #select network
        ego_df = df[df['idr']==e]
        alter_list = ego_df['sharealterid'].unique()
        #get network info using alters 'ego' attributes
        net_df = df[df['idr'].isin(alter_list)].groupby(['idr'],as_index=False).first()
        div_dict = dict(net_df[attr].value_counts())

        #NA if no tie info
        if div_dict:
        
        #shannon index
            shannon_h = 0
            for cat in div_dict:
                p = float(div_dict[cat]/sum(div_dict.values()))
                temp = float(p*(np.log(p)))
                if np.isfinite(temp):
                    shannon_h += temp      
            if shannon_h < 0:
                shannon_h = -(shannon_h)
            shannon_list.append(shannon_h)
        
        
        #simpson index
            simpson_d = 0
            for cat in div_dict:
                p = float(div_dict[cat]/sum(div_dict.values()))
                temp = float(p**2)
                simpson_d+= temp
            simpson_d = 1 - simpson_d
            simpson_list.append(simpson_d)

        
        else:
            shannon_list.append(np.nan)
            simpson_list.append(np.nan)

    new_df = pd.DataFrame({'idr':ego_list,(attr+'_SHANNON'):shannon_list,(attr+'_SIMPSON'):simpson_list})
    return new_df
    

In [85]:
div_df= pd.DataFrame({'idr':comb_df['idr'].unique()})
for col in ['SEX','RELTYPE','ALTERTYPE','AGE35','AGE65','AGE_Q','AGE_10','CESD_16','CESD_20','LONELY_3','SNI_2','EDU_COLLEGE','APOE4']:
    temp = create_diversity_index(comb_df,col)
    div_df = div_df.merge(temp,on='idr',how='left')

  temp = float(p*(np.log(p)))
  temp = float(p*(np.log(p)))
  p = float(div_dict[cat]/sum(div_dict.values()))
  temp = float(p*(np.log(p)))
  p = float(div_dict[cat]/sum(div_dict.values()))


In [86]:
## TO DO: check if these egos really dont have alters
div_df.isna().sum()

idr                      0
SEX_SHANNON            101
SEX_SIMPSON            101
RELTYPE_SHANNON          8
RELTYPE_SIMPSON          8
ALTERTYPE_SHANNON        8
ALTERTYPE_SIMPSON        8
AGE35_SHANNON            8
AGE35_SIMPSON            8
AGE65_SHANNON            8
AGE65_SIMPSON            8
AGE_Q_SHANNON            0
AGE_Q_SIMPSON          375
AGE_10_SHANNON           0
AGE_10_SIMPSON         375
CESD_16_SHANNON        484
CESD_16_SIMPSON        484
CESD_20_SHANNON        484
CESD_20_SIMPSON        484
LONELY_3_SHANNON       410
LONELY_3_SIMPSON       410
SNI_2_SHANNON          454
SNI_2_SIMPSON          454
EDU_COLLEGE_SHANNON    474
EDU_COLLEGE_SIMPSON    474
APOE4_SHANNON          163
APOE4_SIMPSON          163
dtype: int64

In [87]:
div_df.shape

(4578, 27)

In [88]:
div_df.describe()

Unnamed: 0,SEX_SHANNON,SEX_SIMPSON,RELTYPE_SHANNON,RELTYPE_SIMPSON,ALTERTYPE_SHANNON,ALTERTYPE_SIMPSON,AGE35_SHANNON,AGE35_SIMPSON,AGE65_SHANNON,AGE65_SIMPSON,AGE_Q_SHANNON,AGE_Q_SIMPSON,AGE_10_SHANNON,AGE_10_SIMPSON,CESD_16_SHANNON,CESD_16_SIMPSON,CESD_20_SHANNON,CESD_20_SIMPSON,LONELY_3_SHANNON,LONELY_3_SIMPSON,SNI_2_SHANNON,SNI_2_SIMPSON,EDU_COLLEGE_SHANNON,EDU_COLLEGE_SIMPSON,APOE4_SHANNON,APOE4_SIMPSON
count,4477.0,4477.0,4570.0,4570.0,4570.0,4570.0,4570.0,4570.0,4570.0,4570.0,4578.0,4203.0,4578.0,4203.0,4094.0,4094.0,4094.0,4094.0,4168.0,4168.0,4124.0,4124.0,4104.0,4104.0,4415.0,4415.0
mean,0.376263,0.26511,0.29488,0.204059,0.575432,0.357378,0.000444,0.000294,0.312609,0.216665,0.297783,0.214951,0.376263,0.266341,0.271519,0.189425,0.156158,0.107113,0.06408,0.043079,0.000289,0.000199,0.25519,0.177981,0.229843,0.157626
std,0.32257,0.228933,0.311491,0.218866,0.462008,0.269587,0.015699,0.010613,0.313123,0.220182,0.376459,0.247685,0.420499,0.265125,0.314831,0.221616,0.26624,0.185221,0.181767,0.124564,0.013311,0.009243,0.311769,0.219297,0.295514,0.206135
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.562335,0.375,0.0,0.0,0.636514,0.444444,0.0,0.0,0.348832,0.197531,0.0,0.0,0.0,0.345679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.682908,0.489796,0.636514,0.444444,0.9557,0.567901,0.0,0.0,0.636514,0.444444,0.636514,0.48,0.693147,0.5,0.636514,0.444444,0.37677,0.21875,0.0,0.0,0.0,0.0,0.636514,0.444444,0.562335,0.375
max,0.693147,0.5,0.693147,0.5,1.970181,0.846939,0.636514,0.444444,0.693147,0.5,1.386294,0.75,1.609438,0.8,0.693147,0.5,0.693147,0.5,0.693147,0.5,0.693147,0.5,0.693147,0.5,0.693147,0.5


In [89]:
export_df = comb_df[['idr',
                     'sharealterid',
                     'RELTYPE',
                     'ALTERTYPE',
                     'SEX',
                     'AGE7',
                     'idtype',
                     'alteridtype',
                     'SPELLBEGIN',
                     'SPELLEND',
                     'APOE',
                     'EDUCG',
                     'CESD_TOT',
                     'CESD_LONELINESS',
                     'SNI_TOT'
                    ]].merge(div_df,on='idr')

In [90]:
export_df.groupby('idr',as_index=False).first()

Unnamed: 0,idr,sharealterid,RELTYPE,ALTERTYPE,SEX,AGE1,idtype,alteridtype,SPELLBEGIN,SPELLEND,APOE,EDUCG,CESD_TOT,CESD_LONELINESS,SNI_TOT,SEX_SHANNON,SEX_SIMPSON,RELTYPE_SHANNON,RELTYPE_SIMPSON,ALTERTYPE_SHANNON,ALTERTYPE_SIMPSON,AGE35_SHANNON,AGE35_SIMPSON,AGE65_SHANNON,AGE65_SIMPSON,AGE_Q_SHANNON,AGE_Q_SIMPSON,AGE_10_SHANNON,AGE_10_SIMPSON,CESD_16_SHANNON,CESD_16_SIMPSON,CESD_20_SHANNON,CESD_20_SIMPSON,LONELY_3_SHANNON,LONELY_3_SIMPSON,SNI_2_SHANNON,SNI_2_SIMPSON,EDU_COLLEGE_SHANNON,EDU_COLLEGE_SIMPSON,APOE4_SHANNON,APOE4_SIMPSON
0,2450200061,2450649612,NONRELATIVE,N100MNREL,1.0,30.0,1.0,1.0,40.0,457.0,34.0,,16.0,0.0,29.0,0.693147,0.500000,0.636514,0.444444,1.149060,0.617284,0.0,0.0,0.636514,0.444444,0.955700,0.571429,1.277034,0.693878,0.682908,0.489796,0.598270,0.408163,0.410116,0.244898,0.0,0.0,0.562335,0.375000,0.693147,0.500
1,2450200238,2450459583,RELATIVE,BROTHER,2.0,16.0,1.0,1.0,1.0,457.0,,,,,,0.000000,0.000000,0.000000,0.000000,0.693147,0.500000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.500000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000
2,2450200493,2450641679,RELATIVE,SISTER,2.0,26.0,1.0,1.0,1.0,457.0,,,,,,0.693147,0.500000,0.000000,0.000000,0.500402,0.320000,0.0,0.0,0.673012,0.480000,0.636514,0.444444,0.636514,0.444444,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.562335,0.375
3,2450200742,2450709923,RELATIVE,SPOUSE,1.0,27.0,1.0,1.0,1.0,457.0,34.0,3.0,12.0,0.0,27.0,0.661563,0.468750,0.661563,0.468750,1.255482,0.687500,0.0,0.0,0.376770,0.218750,0.562335,0.375000,1.039721,0.625000,0.636514,0.444444,0.636514,0.444444,0.376770,0.218750,0.0,0.0,0.000000,0.000000,0.562335,0.375
4,2450201178,2450701398,RELATIVE,SISTER,2.0,11.0,1.0,1.0,1.0,457.0,33.0,1.0,29.0,1.0,25.0,0.636514,0.444444,0.450561,0.277778,1.011404,0.611111,0.0,0.0,0.450561,0.277778,0.500402,0.320000,0.673012,0.480000,0.500402,0.320000,0.673012,0.480000,0.000000,0.000000,0.0,0.0,0.636514,0.444444,0.000000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4573,2450949019,2450433007,NONRELATIVE,FRIENDNR,1.0,40.0,1.0,1.0,271.0,430.0,44.0,1.0,12.0,0.0,,0.000000,0.000000,0.000000,0.000000,0.693147,0.500000,0.0,0.0,0.693147,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.693147,0.500
4574,2450949062,2450608245,NONRELATIVE,N100MNREL,2.0,27.0,1.0,1.0,314.0,448.0,34.0,,8.0,0.0,32.0,0.636514,0.444444,0.693147,0.500000,0.693147,0.500000,0.0,0.0,0.562335,0.375000,0.636514,0.444444,0.636514,0.444444,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.693147,0.500000,0.000000,0.000
4575,2450949332,2450392366,RELATIVE,SPOUSE,2.0,30.0,1.0,1.0,1.0,457.0,33.0,3.0,12.0,0.0,27.0,0.693147,0.500000,0.562335,0.375000,1.039721,0.625000,0.0,0.0,0.693147,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000
4576,2450949473,2450748270,RELATIVE,SPOUSE,2.0,51.0,1.0,1.0,1.0,380.0,33.0,1.0,14.0,0.0,20.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000


In [61]:
#to share
export_df.to_csv('fhs_sn_diversity_wave7.csv')