### Social Network Analysis
##### Combine new variables and create diversity indices
##### Aug 30, 2021

### Notes as of 11/10/2021:
- just keep shannon

In [1]:
import os
import pandas as pd
import numpy as np
from math import log as ln
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth', None)

In [3]:
df = pd.read_sas('vr_sntwk_2008_m_0641s_v2_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
demo_df = pd.read_sas('vr_wkthru_ex09_1_1001s_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
apoe_df = pd.read_sas('coh_off_apoe_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")
ex_df = pd.read_sas('ex1_7s_v2_16.sas7bdat',format = 'sas7bdat', encoding="unicode_escape",index='idr')
ed_df = pd.read_sas('vr_np_2018_a_1185s_19.sas7bdat',format = 'sas7bdat', encoding="unicode_escape")


  rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
  rslt[name] = self._string_chunk[js, :]


### Prepare dfs

In [5]:
ed_df['idr'] = ed_df['idr'].astype(int).astype(str)
ed_df = ed_df.groupby('idr',as_index=False).first()
ed_df['EDU_COLLEGE'] = ed_df['EDUCG'].apply(lambda x: 'Some college' if x>1 else 'No college')
ed_df['EDU_HS'] = ed_df['EDUCG'].apply(lambda x: 'HS grad' if x>=1 else 'No HS')

ed_df = ed_df[['idr','EDUCG','EDU_COLLEGE','EDU_HS']]

In [6]:
ed_df['EDU_COLLEGE'].value_counts()

Some college    4633
No college      2699
Name: EDU_COLLEGE, dtype: int64

In [7]:
ex_df = ex_df.iloc[:,446:509].reset_index()
ex_df['idr'] = ex_df['idr'].astype(int).astype(str)

In [8]:
apoe_df['idr'] = apoe_df['idr'].astype(int).astype(str)
apoe_df['APOE4'] = apoe_df['APOE'].apply(lambda x: '+' if x in [34,24, 44] else ('-' if x not in [34,24,44] else x))

In [9]:
age_cols = [x for x in demo_df.columns if "AGE" in x]
demo_df = demo_df[['idr','SEX']+age_cols]
demo_df['idr'] = demo_df['idr'].astype(int).astype(str)

### Select Wave 1

In [10]:
w1 = df[(df['idtype']==1) & 
    ((df['alteridtype']==0)&(df['SPELLBEGIN']<29) | (df['alteridtype']==1) & (df['SPELLBEGIN']<44)) &
    (df['SPELLEND']>44)].reset_index(drop=True)
w1[['idr','sharealterid']] = w1[['idr','sharealterid']].astype(int).astype(str)

In [11]:
w1['RELTYPE'] = w1['ALTERTYPE'].apply(lambda x: 'NONRELATIVE' if 'NR' in x else 'RELATIVE' )
w1_df = w1.groupby(['idr','sharealterid','RELTYPE'],as_index=False).first()

In [59]:
w1_df['FAMILIAL']=w1_df['RELTYPE']

#### Lots of duplicates, probably from ego/alter ties fulfilling the criteria for multiple categories. Drop duplicates, only keeping first entry for now
#### TO DO: should keep closest tie category 

In [60]:
comb_df = w1_df.merge(demo_df,on='idr',how='left').merge(ex_df,on='idr',how='left').merge(ed_df,on='idr',how='left').merge(apoe_df.drop('idtype',axis=1),on='idr',how='left')

In [61]:
comb_df['AGE35']= comb_df['AGE1'].apply(lambda x: 0 if x<35 else 1)
comb_df['AGE65']= comb_df['AGE1'].apply(lambda x: 0 if x<65 else 1)
comb_df['AGE_Q']= pd.cut(comb_df['AGE1'],4)
comb_df['AGE_10']= pd.cut(comb_df['AGE1'],range(0,110,10),labels=['<=10','<=20','<=30','<=40','<=50',
                                                              '<=60','<=70','<=80','<=90','<=100'])

CES-D: G587-G606

Loneliness : G600

SNI: G645-G657

In [62]:
comb_df['CESD_TOT'] = comb_df.loc[:,'G587':'G606'].apply(lambda x: sum(x),axis=1)
comb_df['CESD_16'] = comb_df['CESD_TOT'].apply(lambda x: '>=16' if x>=16 else ('<16' if x<16 else x))
comb_df['CESD_20'] = comb_df['CESD_TOT'].apply(lambda x: '>=20' if x>=20 else ('<20' if x<20 else x))

In [63]:
comb_df['CESD_LONELINESS'] = comb_df['G600']
comb_df['LONELY_3'] = comb_df['CESD_LONELINESS'].apply(lambda x: '>=3' if x>=2 else ('<3' if x<3 else x))

In [64]:
comb_df['SNI_TOT'] = comb_df.loc[:,'G645':'G657'].apply(lambda x: sum(x),axis=1)
comb_df['SNI_2'] = comb_df['SNI_TOT'].apply(lambda x: '>2' if x>2 else ('<=2' if x<=2 else x))

In [65]:
# [(c,comb_df[c].isna().sum()) for c in comb_df.columns if comb_df[c].isna().sum()>0]

### Create Diversity Index

In [67]:
def create_ratio(df,attr,val):
    ratio_list = []
    ego_list = df['idr'].unique()

    for e in ego_list:
        ego_df = df[df['idr']==e]
        alter_list = ego_df['sharealterid'].unique()  
        net_df = df[df['idr'].isin(alter_list)].groupby(['idr'],as_index=False).first()
        div_dict = dict(net_df[attr].value_counts())
        
        if val in div_dict:
            ratio = float(div_dict[val]/sum(div_dict.values()))
            ratio_list.append(ratio)
        else:
            ratio_list.append(0)
            
    res_df = pd.DataFrame({'idr':ego_list,(attr+'_'+val+'_RATIO'):ratio_list})

    return res_df
        

In [68]:
### NEED TO ADD DISTANCE info
ratio_df = pd.DataFrame({'idr':comb_df['idr'].unique()})
for attr,val in [('FAMILIAL','RELATIVE'),('APOE4','+')]:
    temp = create_ratio(comb_df,attr,val)
    ratio_df = ratio_df.merge(temp,on='idr',how='left')

In [71]:
#individual network indices
#TO DO : get rid of for loop

def create_diversity_index(df,attr):
    shannon_list = []
#     simpson_list = []
    ego_list = df['idr'].unique()
    
    for e in ego_list:
        #select network
        ego_df = df[df['idr']==e]
        alter_list = ego_df['sharealterid'].unique()
        #get network info using alters 'ego' attributes
        net_df = df[df['idr'].isin(alter_list)].groupby(['idr'],as_index=False).first()
        div_dict = dict(net_df[attr].value_counts())

        #NA if no tie info
        if div_dict:
        
        #shannon index
            shannon_h = 0
            for cat in div_dict:
                p = float(div_dict[cat]/sum(div_dict.values()))
                temp = float(p*(np.log(p)))
                if np.isfinite(temp):
                    shannon_h += temp      
            if shannon_h < 0:
                shannon_h = -(shannon_h)
            shannon_list.append(shannon_h)
        
        
        #simpson index
#             simpson_d = 0
#             for cat in div_dict:
#                 p = float(div_dict[cat]/sum(div_dict.values()))
#                 temp = float(p**2)
#                 simpson_d+= temp
#             simpson_d = 1 - simpson_d
#             simpson_list.append(simpson_d)

        
        else:
            shannon_list.append(np.nan)
#             simpson_list.append(np.nan)

#     new_df = pd.DataFrame({'idr':ego_list,(attr+'_SHANNON'):shannon_list,(attr+'_SIMPSON'):simpson_list})
    new_df = pd.DataFrame({'idr':ego_list,(attr+'_SHANNON'):shannon_list})

    return new_df
    

In [72]:
div_df= pd.DataFrame({'idr':comb_df['idr'].unique()})
for col in ['SEX','RELTYPE','ALTERTYPE','AGE35','AGE65','AGE_Q','AGE_10','SNI_TOT','SNI_2','EDU_COLLEGE','EDU_HS']:
    temp = create_diversity_index(comb_df,col)
    div_df = div_df.merge(temp,on='idr',how='left')

  temp = float(p*(np.log(p)))
  temp = float(p*(np.log(p)))
  p = float(div_dict[cat]/sum(div_dict.values()))
  temp = float(p*(np.log(p)))


In [79]:
## TO DO: check if these egos really dont have alters
div_df.isna().sum()

idr                      0
SEX_SHANNON            140
RELTYPE_SHANNON        140
ALTERTYPE_SHANNON      140
AGE35_SHANNON          140
AGE65_SHANNON          140
AGE_Q_SHANNON            0
AGE_10_SHANNON           0
SNI_TOT_SHANNON        600
SNI_2_SHANNON          600
EDU_COLLEGE_SHANNON    710
EDU_HS_SHANNON         710
dtype: int64

In [74]:
div_df.shape

(4941, 12)

In [75]:
div_df.describe()

Unnamed: 0,SEX_SHANNON,RELTYPE_SHANNON,ALTERTYPE_SHANNON,AGE35_SHANNON,AGE65_SHANNON,AGE_Q_SHANNON,AGE_10_SHANNON,SNI_TOT_SHANNON,SNI_2_SHANNON,EDU_COLLEGE_SHANNON,EDU_HS_SHANNON
count,4801.0,4801.0,4801.0,4801.0,4801.0,4941.0,4941.0,4341.0,4341.0,4231.0,4231.0
mean,0.422487,0.328289,0.703814,0.319497,0.002045,0.446544,0.621918,0.955972,0.00016,0.304714,0.083546
std,0.30839,0.301708,0.53283,0.313065,0.030141,0.410878,0.515921,0.787468,0.01052,0.318784,0.190189
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.636514,0.410116,0.693147,0.410116,0.0,0.562335,0.673012,1.039721,0.0,0.0,0.0
75%,0.682908,0.636514,1.098612,0.636514,0.0,0.693147,1.05492,1.609438,0.0,0.636514,0.0
max,0.693147,0.693147,2.106577,0.693147,0.693147,1.386294,1.735126,2.948553,0.693147,0.693147,0.693147


In [76]:
export_df = comb_df[['idr',
                     'AGE35',
                     'AGE65',
                     'AGE_Q',
                     'AGE_10',
                     'sharealterid',
                     'RELTYPE',
                     'FAM_TYPE',
                     'ALTERTYPE',
                     'SEX',
                     'AGE1',
                     'idtype',
                     'alteridtype',
                     'SPELLBEGIN',
                     'SPELLEND',
                     'APOE',
                     'EDUCG',
                     'CESD_TOT',
                     'CESD_16',
                     'CESD_20',
                     'CESD_LONELINESS',
                     'LONELY_3',
                     'SNI_TOT',
                     'SNI_2'
                    ]].merge(div_df,on='idr').merge(ratio_df,on='idr')

In [77]:
export_df.groupby('idr',as_index=False).first()

Unnamed: 0,idr,AGE35,AGE65,AGE_Q,AGE_10,sharealterid,RELTYPE,FAM_TYPE,ALTERTYPE,SEX,AGE1,idtype,alteridtype,SPELLBEGIN,SPELLEND,APOE,EDUCG,CESD_TOT,CESD_16,CESD_20,CESD_LONELINESS,LONELY_3,SNI_TOT,SNI_2,SEX_SHANNON,RELTYPE_SHANNON,ALTERTYPE_SHANNON,AGE35_SHANNON,AGE65_SHANNON,AGE_Q_SHANNON,AGE_10_SHANNON,SNI_TOT_SHANNON,SNI_2_SHANNON,EDU_COLLEGE_SHANNON,EDU_HS_SHANNON,FAMILIAL_RELATIVE_RATIO,APOE4_+_RATIO
0,2450200061,0,0,"(21.25, 37.5]",<=30,2450229337,NONRELATIVE,COWORKERNR,COWORKERNR,1.0,30.0,1.0,1.0,40.0,191.0,34.0,,16.0,>=16,<20,0.0,<3,29.0,>2,0.650818,0.348832,0.794939,0.601154,0.0,0.780264,1.213494,2.948553,0.0,0.684232,0.325083,0.111111,0.315789
1,2450200238,0,0,"(4.935, 21.25]",<=20,2450320467,RELATIVE,FATHER,FATHER,2.0,16.0,1.0,0.0,1.0,192.0,,,,,,,,,,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000
2,2450200299,0,0,"(21.25, 37.5]",<=30,2450340036,NONRELATIVE,SAMEADNREL,SAMEADNREL,1.0,26.0,1.0,1.0,35.0,137.0,23.0,,,,,,,,,0.000000,0.693147,0.693147,0.000000,0.0,0.693147,0.693147,0.693147,0.0,0.000000,0.000000,0.500000,0.500000
3,2450200493,0,0,"(21.25, 37.5]",<=30,2450203175,RELATIVE,BROTHER,BROTHER,2.0,26.0,1.0,1.0,1.0,457.0,,,,,,,,,,0.673012,0.000000,0.500402,0.000000,0.0,0.500402,1.054920,1.098612,0.0,0.000000,0.000000,1.000000,0.500000
4,2450200742,0,0,"(21.25, 37.5]",<=30,2450370277,RELATIVE,SISTER,SISTER,1.0,27.0,1.0,1.0,1.0,457.0,34.0,3.0,12.0,<16,<20,0.0,<3,27.0,>2,0.500402,0.673012,1.054920,0.673012,0.0,0.500402,0.950271,1.332179,0.0,0.000000,0.000000,0.600000,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4936,2450949062,0,0,"(21.25, 37.5]",<=30,2450226957,RELATIVE,MOTHER,MOTHER,2.0,27.0,1.0,0.0,1.0,80.0,34.0,,8.0,<16,<20,0.0,<3,32.0,>2,0.673012,0.000000,1.054920,0.000000,0.0,0.500402,0.950271,1.386294,0.0,0.562335,0.562335,0.000000,0.200000
4937,2450949332,0,0,"(21.25, 37.5]",<=30,2450252749,RELATIVE,MOTHER,MOTHER,2.0,30.0,1.0,0.0,1.0,416.0,33.0,3.0,12.0,<16,<20,0.0,<3,27.0,>2,0.693147,0.562335,1.039721,0.562335,0.0,0.562335,0.000000,1.039721,0.0,0.562335,0.000000,0.250000,0.500000
4938,2450949473,1,0,"(37.5, 53.75]",<=60,2450233240,NONRELATIVE,N100MNREL,N100MNREL,2.0,51.0,1.0,1.0,28.0,129.0,33.0,1.0,14.0,<16,<20,0.0,<3,20.0,>2,0.661563,0.000000,0.562335,0.562335,0.0,1.073543,1.039721,1.791759,0.0,0.693147,0.000000,0.000000,0.125000
4939,2450949485,1,0,"(37.5, 53.75]",<=60,2450470004,RELATIVE,SPOUSE,SPOUSE,2.0,52.0,1.0,1.0,1.0,304.0,33.0,,,,,,,,,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


In [78]:
#to share
export_df.to_csv('fhs_sn_diversity_wave1.csv')