### Social Network Analysis
##### Combine new variables and create diversity indices
##### Aug 30, 2021

### Notes as of 11/10/2021:
- Keep only the Shannon index (the Simpson index is no longer computed)

In [80]:
import os
import pandas as pd
import numpy as np
from math import log as ln
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt

In [81]:
# Show every column, untruncated, whenever a DataFrame is displayed.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', None,
)

In [82]:
# Load the SAS extracts used in this notebook. Every file shares the same
# format/encoding options; only ex_df is indexed by 'idr' at load time.
_sas_opts = {'format': 'sas7bdat', 'encoding': "unicode_escape"}

# Network ties (idtype, alteridtype, SPELLBEGIN/SPELLEND, ALTERTYPE are read from it later).
df = pd.read_sas('vr_sntwk_2008_m_0641s_v2_16.sas7bdat', **_sas_opts)
# Demographics (SEX and the AGE* columns are kept later).
demo_df = pd.read_sas('vr_wkthru_ex09_1_1001s_16.sas7bdat', **_sas_opts)
# APOE values.
apoe_df = pd.read_sas('coh_off_apoe_16.sas7bdat', **_sas_opts)
# Exam items (the G-numbered columns are used later).
ex_df = pd.read_sas('ex1_7s_v2_16.sas7bdat', index='idr', **_sas_opts)
# Education (EDUCG).
ed_df = pd.read_sas('vr_np_2018_a_1185s_19.sas7bdat', **_sas_opts)


  rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
  rslt[name] = self._string_chunk[js, :]


### Prepare dfs

In [83]:
# Use a string participant id as the join key and keep one row per participant
# (groupby().first() keeps the first non-null value of every column).
ed_df['idr'] = ed_df['idr'].astype(int).astype(str)
ed_df = ed_df.groupby('idr',as_index=False).first()

# Dichotomise EDUCG. na_action='ignore' leaves a missing EDUCG as NaN; a plain
# comparison would silently label it 'No college' / 'No HS' because NaN > 1 and
# NaN >= 1 are both False.
ed_df['EDU_COLLEGE'] = ed_df['EDUCG'].map(
    lambda x: 'Some college' if x > 1 else 'No college', na_action='ignore')
ed_df['EDU_HS'] = ed_df['EDUCG'].map(
    lambda x: 'HS grad' if x >= 1 else 'No HS', na_action='ignore')

ed_df = ed_df[['idr','EDUCG','EDU_COLLEGE','EDU_HS']]

In [84]:
# Quick check of how participants split on the college indicator.
ed_df['EDU_COLLEGE'].value_counts()

Some college    4633
No college      2699
Name: EDU_COLLEGE, dtype: int64

In [85]:
# Keep only the exam columns at positions 446-508, move 'idr' out of the index,
# and convert it to the string key used for merging.
# NOTE(review): the positional slice presumably covers the G-items used later
# (G587-G657) - confirm against the file layout. Re-running this cell on the
# already-sliced frame will not give the same result.
ex_df = (
    ex_df
    .iloc[:, 446:509]
    .reset_index()
    .assign(idr=lambda frame: frame['idr'].astype(int).astype(str))
)

In [86]:
apoe_df['idr'] = apoe_df['idr'].astype(int).astype(str)

# APOE4 flag: '+' when APOE is 24, 34 or 44, '-' for any other observed value.
# na_action='ignore' keeps a missing APOE as NaN; previously the "else x" branch
# was unreachable (NaN is "not in" the list), so missing values were coded '-'.
apoe_df['APOE4'] = apoe_df['APOE'].map(
    lambda x: '+' if x in (24, 34, 44) else '-', na_action='ignore')

In [87]:
# Keep the id, sex and every column whose name contains "AGE".
age_cols = [col for col in demo_df.columns if "AGE" in col]
# .copy() makes the subset an independent frame, so the assignment below does
# not raise SettingWithCopyWarning.
demo_df = demo_df[['idr','SEX']+age_cols].copy()
demo_df['idr'] = demo_df['idr'].astype(int).astype(str)

### Select Wave 1

In [88]:
# Wave 1 ties: rows with idtype 1 whose spell began before the cut-off for the
# alter's id type and ended after 44.
# NOTE(review): the 29 / 44 cut-offs and their units are not explained here - confirm.
is_idtype_1 = df['idtype'] == 1
began_before_cutoff = (
    ((df['alteridtype'] == 0) & (df['SPELLBEGIN'] < 29))
    | ((df['alteridtype'] == 1) & (df['SPELLBEGIN'] < 44))
)
ended_after_44 = df['SPELLEND'] > 44

w1 = df.loc[is_idtype_1 & began_before_cutoff & ended_after_44].reset_index(drop=True)

# Both id columns become strings so they match the keys of the other frames.
id_cols = ['idr','sharealterid']
w1[id_cols] = w1[id_cols].astype(int).astype(str)

In [89]:
# A tie is NONRELATIVE when its ALTERTYPE contains 'NR', otherwise RELATIVE.
w1['RELTYPE'] = [
    'NONRELATIVE' if 'NR' in alter_type else 'RELATIVE'
    for alter_type in w1['ALTERTYPE']
]
# One row per (ego, alter, relation type); first() keeps the first non-null
# value of every other column.
w1_df = w1.groupby(by=['idr','sharealterid','RELTYPE'], as_index=False).first()

In [90]:
# FAMILIAL is a copy of RELTYPE; it is the column name used for the RELATIVE ratio later on.
w1_df['FAMILIAL']=w1_df['RELTYPE']

In [None]:
# Inspect DISTMI1, the column later thresholded into DIST1_5.
w1_df['DISTMI1']

#### Many ego–alter pairs appear more than once, probably because a tie fulfils the criteria for several categories. Duplicates are dropped, keeping only the first entry for now.
#### TO DO: keep the closest tie category instead of the first entry

In [91]:
# Attach ego-level demographics, exam items, education and APOE to every tie.
# All joins are left joins on 'idr', so every tie row is kept.
comb_df = (
    w1_df
    .merge(demo_df, on='idr', how='left')
    .merge(ex_df, on='idr', how='left')
    .merge(ed_df, on='idr', how='left')
    # apoe_df's own 'idtype' column is dropped before the join.
    .merge(apoe_df.drop('idtype', axis=1), on='idr', how='left')
)

In [92]:
# Binary age indicators (0 = below the cut-off, 1 = at or above it).
# na_action='ignore' keeps a missing AGE1 as NaN; a plain comparison would code
# it as 1 because NaN < 35 is False.
comb_df['AGE35'] = comb_df['AGE1'].map(lambda x: 0 if x < 35 else 1, na_action='ignore')
comb_df['AGE65'] = comb_df['AGE1'].map(lambda x: 0 if x < 65 else 1, na_action='ignore')

# NOTE(review): pd.cut with an integer makes 4 equal-width bins, not quartiles;
# use pd.qcut if "AGE_Q" is meant to be quartiles.
comb_df['AGE_Q'] = pd.cut(comb_df['AGE1'], 4)

# Ten-year bands (0, 10], (10, 20], ..., (90, 100], labelled by their upper edge.
comb_df['AGE_10'] = pd.cut(comb_df['AGE1'], range(0,110,10),
                           labels=['<=10','<=20','<=30','<=40','<=50',
                                   '<=60','<=70','<=80','<=90','<=100'])

CES-D: G587-G606

Loneliness : G600

SNI: G645-G657

In [93]:
# CES-D total: sum of items G587..G606. skipna=False keeps the total missing
# when any item is missing, as the previous row-wise Python sum did.
# NOTE(review): confirm that any reverse-scored items are already reversed in the data.
comb_df['CESD_TOT'] = comb_df.loc[:,'G587':'G606'].sum(axis=1, skipna=False)

# Threshold labels; a missing total stays NaN.
comb_df['CESD_16'] = comb_df['CESD_TOT'].map(lambda x: '>=16' if x >= 16 else '<16', na_action='ignore')
comb_df['CESD_20'] = comb_df['CESD_TOT'].map(lambda x: '>=20' if x >= 20 else '<20', na_action='ignore')

In [94]:
# Loneliness item (G600).
comb_df['CESD_LONELINESS'] = comb_df['G600']
# Label by the >=3 threshold; missing stays NaN. The comparison used to be
# x>=2, which labelled a score of 2 as '>=3'.
comb_df['LONELY_3'] = comb_df['CESD_LONELINESS'].map(lambda x: '>=3' if x >= 3 else '<3', na_action='ignore')

In [95]:
# SNI total: sum of items G645..G657. skipna=False keeps the total missing when
# any item is missing, as the previous row-wise Python sum did.
comb_df['SNI_TOT'] = comb_df.loc[:,'G645':'G657'].sum(axis=1, skipna=False)
# '>2' vs '<=2'; a missing total stays NaN.
comb_df['SNI_2'] = comb_df['SNI_TOT'].map(lambda x: '>2' if x > 2 else '<=2', na_action='ignore')

In [None]:
### CHOOSE DISTANCE CATEGORY

In [125]:
# 'NEAR' when the distance is at most 5, otherwise 'Far'.
# na_action='ignore' keeps a missing distance as NaN; a plain comparison would
# code it as 'Far' because NaN <= 5 is False.
comb_df['DIST1_5'] = comb_df['DISTMI1'].map(lambda x: 'NEAR' if x <= 5 else 'Far', na_action='ignore')
comb_df['DIST7_5'] = comb_df['DISTMI7'].map(lambda x: 'NEAR' if x <= 5 else 'Far', na_action='ignore')

In [126]:
# [(c,comb_df[c].isna().sum()) for c in comb_df.columns if comb_df[c].isna().sum()>0]

### Create Diversity Index

In [127]:
def create_ratio(df,attr,val):
    """Share of each ego's alters whose ``attr`` equals ``val``.

    For every distinct ``idr`` (ego) the alters are the distinct
    ``sharealterid`` values on that ego's rows. An alter's attribute is the
    first non-null ``attr`` value among the rows where the alter itself appears
    as ``idr``; alters that never appear as ``idr``, or whose ``attr`` is
    entirely missing, are left out of the denominator.

    NOTE(review): ``attr`` is therefore read from the alter's own rows, not
    from the ego-to-alter row. For tie-level columns (e.g. FAMILIAL, DIST1_5)
    confirm that this is the intended value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'idr', 'sharealterid' and ``attr``.
    attr : str
        Column whose values are compared with ``val``.
    val : object
        Value counted in the numerator.

    Returns
    -------
    pandas.DataFrame
        One row per ego with the columns 'idr' and '<attr>_<val>_RATIO'. The
        ratio is NaN when none of the ego's alters has a known ``attr``
        (previously reported as 0, indistinguishable from "no alter has val").
    """
    ego_ids = df['idr'].unique()
    # Loop-invariant: first non-null attribute value for every node.
    node_attr = df.groupby('idr')[attr].first()

    ratios = []
    for ego in ego_ids:
        alters = df.loc[df['idr'] == ego, 'sharealterid'].unique()
        known = node_attr[node_attr.index.isin(alters)].dropna()
        if known.empty:
            # No information about any alter: the ratio is undefined.
            ratios.append(float('nan'))
        else:
            ratios.append(float(known.eq(val).sum() / len(known)))

    return pd.DataFrame({'idr': ego_ids, (attr+'_'+str(val)+'_RATIO'): ratios})
        

In [None]:
# One ratio column per (attribute, value) pair, left-joined onto the list of egos.
ratio_targets = [
    ('FAMILIAL', 'RELATIVE'),
    ('APOE4', '+'),
    ('DIST1_5', 'NEAR'),
    ('DIST7_5', 'NEAR'),
]
ratio_df = pd.DataFrame({'idr': comb_df['idr'].unique()})
for attr, val in ratio_targets:
    ratio_df = ratio_df.merge(create_ratio(comb_df, attr, val), on='idr', how='left')

In [None]:
# Display the per-ego ratio table.
ratio_df

In [None]:
#individual network indices
#TO DO : get rid of for loop

def create_diversity_index(df,attr):
    """Shannon diversity of ``attr`` across each ego's alters.

    For every distinct ``idr`` (ego) the alters are the distinct
    ``sharealterid`` values on that ego's rows. An alter's attribute is the
    first non-null ``attr`` value among the rows where the alter itself appears
    as ``idr``; alters that never appear as ``idr``, or whose ``attr`` is
    entirely missing, are ignored. The index is H = -sum(p * ln(p)) over the
    observed categories, where p is a category's share of the known alters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'idr', 'sharealterid' and ``attr``.
    attr : str
        Column whose category distribution is measured.

    Returns
    -------
    pandas.DataFrame
        One row per ego with the columns 'idr' and '<attr>_SHANNON'. The index
        is NaN when none of the ego's alters has a known ``attr``.
    """
    ego_ids = df['idr'].unique()
    # Loop-invariant: first non-null attribute value for every node.
    node_attr = df.groupby('idr')[attr].first()

    shannon_list = []
    for ego in ego_ids:
        alters = df.loc[df['idr'] == ego, 'sharealterid'].unique()
        known = node_attr[node_attr.index.isin(alters)].dropna()

        counts = known.value_counts()
        # Categorical columns report unobserved categories with a count of 0.
        # Dropping them avoids log(0) and lets an empty network be detected;
        # it used to come out as 0.0 instead of NaN for such columns.
        counts = counts[counts > 0]

        if counts.empty:
            # NA if no tie info
            shannon_list.append(np.nan)
            continue

        p = counts.to_numpy(dtype=float) / counts.sum()
        # "+ 0.0" turns the -0.0 of a single-category network into 0.0.
        shannon_list.append(float(-(p * np.log(p)).sum()) + 0.0)

    return pd.DataFrame({'idr': ego_ids, (attr+'_SHANNON'): shannon_list})
    

In [None]:
# One Shannon-index column per attribute, left-joined onto the list of egos.
diversity_attrs = [
    'SEX', 'RELTYPE', 'ALTERTYPE',
    'AGE35', 'AGE65', 'AGE_Q', 'AGE_10',
    'SNI_TOT', 'SNI_2',
    'EDU_COLLEGE', 'EDU_HS',
]
div_df = pd.DataFrame({'idr': comb_df['idr'].unique()})
for col in diversity_attrs:
    div_df = div_df.merge(create_diversity_index(comb_df, col), on='idr', how='left')

In [None]:
## Count missing index values per column.
## TO DO: check whether the egos with a missing index really have no alters
div_df.isna().sum()

In [None]:
# (rows, columns) of the diversity table; div_df was seeded with one row per unique idr.
div_df.shape

In [None]:
# Summary statistics of the index columns.
div_df.describe()

In [None]:
# Columns of comb_df to share, grouped by theme.
export_cols = [
    'idr',
    # age
    'AGE35', 'AGE65', 'AGE_Q', 'AGE_10',
    # tie
    'sharealterid', 'RELTYPE', 'FAMILIAL', 'ALTERTYPE',
    # ego demographics and selection fields
    'SEX', 'AGE1', 'idtype', 'alteridtype', 'SPELLBEGIN', 'SPELLEND',
    'APOE', 'EDUCG',
    # CES-D / loneliness / SNI
    'CESD_TOT', 'CESD_16', 'CESD_20', 'CESD_LONELINESS', 'LONELY_3',
    'SNI_TOT', 'SNI_2',
    # distance
    'DIST1_5', 'DIST7_5',
]

# Attach the per-ego diversity indices and ratios to the selected columns
# (default inner joins on 'idr').
export_df = (
    comb_df[export_cols]
    .merge(div_df, on='idr')
    .merge(ratio_df, on='idr')
)

In [None]:
# Preview with one row per ego (first non-null value of each column).
# NOTE(review): the result is only displayed, not assigned, so export_df itself
# is unchanged by this cell - confirm that is intended.
export_df.groupby('idr',as_index=False).first()

In [None]:
# Write the table to share.
# NOTE(review): the DataFrame index is written as an extra first column because
# index=False is not passed - confirm this is wanted.
export_df.to_csv('fhs_sn_diversity_wave1.csv')