In [13]:
# created by Yu-Shin 20221111
# last adjust: 20221111
import os
import csv
import json
from google.colab import drive
import pandas as pd
import numpy as np
from glob import glob
import re
import time
import numexpr
import datetime as dtt
from datetime import datetime
import argparse
from pathlib import Path
# from utils import traid  # 自定義package

# args

In [14]:
def parse_args():
  parser = argparse.ArgumentParser(
      description='combine all first day and generate survival dataform'
  )
  parser.add_argument(
      '--ch_dir',
      type=str,
      default='/content/drive/MyDrive/fb_college/traid'
  )
  parser.add_argument(
      '--raw_dir',
      type=str,
      default='../raw/csv/'
  )
  parser.add_argument(
      '--dtype_file',
      type=str,
      default='./tmp/dtype.json'
  )
  parser.add_argument(
      '--parse_dates_file',
      type=str,
      default='./tmp/parse_dates.json'
  )
  parser.add_argument(
      '--time_range',
      type=list,
      help='research time range',
      default=['2012-09-30', '2015-06-30'],
  )
  parser.add_argument(
      '--res_filt',
      type=float,
      help='filt response rate',
      default=0.7,
  )
  parser.add_argument(
      '--tie_mode',
      type=str,
      help='tie type, if departclass then only analysis in class tie',
      default='departclass',
  )
  # --------------------------> raw data file
  parser.add_argument(
      '--survey_file',
      type=str,
      help='raw survey file',
      default='../raw/csv/survey.csv'
  )
  # --------------------------> preprocess file
  parser.add_argument(
      '--all_tie_file',
      type=str,
      default='./tmp/all_tie_departclass.csv',
  )
  parser.add_argument(
      '--egolist_file',
      type=str,
      help='research ego list, default is from the class reponse rate > 0.7',
      default='./tmp/egores0.7.csv'
  )
  parser.add_argument(
      '--egofirstime_file',
      type=str,
      help='firstime ego use face book',
      default='./tmp/ego_firstime.csv'
  )
  parser.add_argument(
      '--d1firstday_file',
      type=str,
      help='d1 first time',
      default='./tmp/d1firstday_departclass.csv'
  )
  parser.add_argument(
      '--d2d1ego_file',
      type=str,
      help='d2-d1 level contain their ego',
      default='./tmp/d2d1ego_departclass.csv'
  )
  parser.add_argument(
      '--d2firstday_file',
      type=str,
      help='d2-d1 level contain their ego',
      default='./tmp/d2firstday_departclass.csv'
  )
  parser.add_argument(
      '--output_dir',
      type=str,
      help='d2-d1 level contain their ego',
      default='./tmp/',
  )
  parser.add_argument(
      '--write_dta',
      type=bool,
      help='whether to write dta stata file or not',
      default=False,
  )
  args, unknown = parser.parse_known_args()
  return args
args = parse_args()

In [15]:
# set up working dir
os.chdir(args.ch_dir)
from utils import traid
# set up dataframe display format
traid.window(None, 30)
# read dtype and time columns file
dtype = json.loads(Path(args.dtype_file).read_text())
parse_dates = json.loads(Path(args.parse_dates_file).read_text())

# load file

In [16]:
# tie file
usecols = ['from_id_p', 'from_id', 'createdtime_date', 'tie_type']
parse_dates_col = [parse_dates[args.all_tie_file.split('/')[-1]]] # 取出檔名，查詢這個檔名的時間欄位名稱
tie_df = pd.read_csv(args.all_tie_file,
                     dtype=dtype,
                     parse_dates=parse_dates_col,
                     usecols=usecols)

# ego related file
usecols = ['Ownerfbid']
egolist_df = pd.read_csv(args.egolist_file, dtype=dtype, usecols=usecols)
# ego and their class id
usecols = ['OwnerFbid', 'departclass']
ego_classid_df = pd.read_csv(args.survey_file, dtype=dtype, usecols=usecols)
# ego first time
egofirstday_df = pd.read_csv(args.egofirstime_file, dtype=dtype)

# survey file
usecols = ['OwnerFbid', 'response']
survey_df = pd.read_csv(args.survey_file, dtype=dtype, usecols=usecols)

In [17]:
# tie_file preprocess: drop loop, na check, time range cut, filt response rate
def tie_preprocess(df, survey_df, tie_cols, res_filt):
  original_n = df.shape[0]
  before = 0
  after = 0
  # drop dulicated
  print('shape before drop duplicates', df.shape)
  before = df.shape[0]
  df = df.drop_duplicates(ignore_index=True)
  after = df.shape[0]
  print('shape after drop duplicates', df.shape,
        'drop', (before-after) / original_n, '%')

  # drop loop tie
  print('shape before drop loop', df.shape)
  before = df.shape[0]
  df = traid.drop_loop(df, tie_cols[0], tie_cols[1])
  after = df.shape[0]
  print('shape after drop loop', df.shape)

  # file response rate
  before = df.shape[0]
  df = df.merge(survey_df, how='right', left_on=tie_cols[0], right_on='OwnerFbid')
  df = df[df['response'] > res_filt]
  after = df.shape[0]
  print('shape after filt res:', df.shape,
        'drop', (before-after) / original_n, '%')
  
  # drop Na
  print(df.isnull().sum())
  # drop if tere have missing value
  before = df.shape[0]
  df = df.dropna(axis=0)
  after = df.shape[0]
  print('shape after drop na:', df.shape,
        'drop', (before-after) / original_n, '%')
  
  return df

# d1firstday: tie_df -> new_tie_df -> d1_df -> d1firstday_df

In [18]:
# preprocess tie file
new_tie_df = tie_preprocess(tie_df, survey_df, ['from_id', 'from_id_p'],
                             args.res_filt)

# check whether from_id is ego
d1_df = traid.is_ego(egolist_df, 'Ownerfbid',
                     new_tie_df, 'from_id', 'from_id_p')
# check whether from_id_p is ego, append tow dataframe
d1_df = d1_df.append(traid.is_ego(egolist_df, 'Ownerfbid',
                                  new_tie_df, 'from_id_p', 'from_id'))

shape before drop duplicates (683065, 4)
shape after drop duplicates (382170, 4) drop 0.44050712596897806 %
shape before drop loop (382170, 4)
shape after drop loop (382170, 4)
shape after filt res: (261607, 6) drop 0.17650296823874742 %
from_id_p           89
from_id             89
createdtime_date    89
tie_type            89
OwnerFbid            0
response             0
dtype: int64
shape after drop na: (261518, 6) drop 0.00013029506708731967 %


In [19]:
# 14sd1_df
d1_df['d1firstday'], d1_df['egod1_set'] = traid.setTie_min(d1_df, ['ego_id', 'd1_alterid'], 'createdtime_date', 'd1firstday')

In [20]:
# find d1 firstday
# count every ego's d1 num
d1_df['ego_d1num'] = d1_df.groupby(by=['ego_id']
                               )['d1_alterid'].transform('nunique')
# count every d1_alter have how many ego
d1_df['d1_egonum'] = d1_df.groupby(by=['d1_alterid']
                               )['ego_id'].transform('nunique')

# created d1 firstday
# creat d1firstday df
d1firstday_df = traid.keepmin(d1_df, 'createdtime_date', ['d1_alterid', 'ego_id'])

# d1firstday save file
d1firstday_df.to_csv(f'{args.output_dir}d1firstday_{args.tie_mode}.csv', index=False)

# d2d1ego

In [21]:
# compute every d1alter's ego and
# when they become those ego's d1alter(d1firstday)
d1_egolist_df = traid.id_list(d1firstday_df, 'ego_id', 'd1_alterid', 'd1firstday', 'd1firstdays', 'd1_egolist')
# filt dt, only keep those tie which at least one person is d1 alter
# if only analysis inclass tie, shouldn't filt anyone
filt_d1_tie_df = traid.filt_d1_tie(d1_egolist_df, new_tie_df)
filt_d1_tie_df = traid.fillna_list(filt_d1_tie_df,
                             ['d1_egolist_x', 'd1_egolist_y',
                             'd1firstdays_x', 'd1firstdays_y'])
print(new_tie_df.shape)
print(filt_d1_tie_df.shape)
filt_d1_tie_df = traid.d1_egolist_now(filt_d1_tie_df, 'd1_egolist_x', 'd1firstdays_x') # get from_id's egolist when tie happend
filt_d1_tie_df = traid.d1_egolist_now(filt_d1_tie_df, 'd1_egolist_y', 'd1firstdays_y') # get from_id_p's egolist when tie happend
# get from_id's ego and from_id_p's ego
# no over loop part
filt_d1_tie_df = traid.get_dif(filt_d1_tie_df,
                               ['d1_egolist_x_now', 'd1_egolist_y_now'],
                               ['from_ego', 'from_p_ego'])
d2d1ego_df = traid.to_egod1d2_df(filt_d1_tie_df,
                                 ['from_ego', 'from_p_ego'],
                                 ['from_id', 'from_id_p'],
                                 ['from_id_p', 'from_id'])
d2d1ego_df['d2_contactppl'] = traid.unique_ct(d2d1ego_df,
                       ['d2_alterid', 'ego_id'],
                       'd1_alterid',
                       'd2_contactppl')
d2d1ego_df['ego_d2num'] = traid.unique_ct(d2d1ego_df,
                                          ['ego_id'], 
                                          'd2_alterid',
                                          'ego_d2num')
d2d1ego_df['egod1_d2num'] = traid.unique_ct(d2d1ego_df,
                                            ['ego_id', 'd1_alterid'],
                                            'd2_alterid',
                                            'egod1_d2num')
# # add d1firstday col and write egod1d2csv
# add d1 firstday
d2d1ego_df = d2d1ego_df.merge(
    d1firstday_df[['d1_alterid', 'ego_id', 'd1firstday']],
    on=['d1_alterid', 'ego_id'], how='left')
print('d2d1ego_df.shape', d2d1ego_df.shape)

(261518, 6)
(261518, 10)
150580
273902
d2d1ego_df.shape (181669, 9)


In [22]:
# # add egofirstday
# d2d1ego_df = d2d1ego_df.merge(
#     egofirstday_df[['OwnerFbid', 'egofirstime']],
#     left_on=['ego_id'], right_on=['OwnerFbid'], how='right', indicator=True)
# print("before drop na:", d2d1ego_df.shape)
# d2d1ego_df = d2d1ego_df.dropna(axis=0)
# print("after drop na:", d2d1ego_df.shape)

In [23]:
if '_merge' in d2d1ego_df.columns:
  d2d1ego_df = d2d1ego_df.drop(labels=['_merge'], axis=1)
# add egofirstday
d2d1ego_df = d2d1ego_df.merge(
    egofirstday_df[['OwnerFbid', 'egofirstime']],
    left_on=['ego_id'], right_on=['OwnerFbid'], how='right', indicator=True)
print("before drop na:", d2d1ego_df.shape)
d2d1ego_df = d2d1ego_df.dropna(axis=0)
print("after drop na:", d2d1ego_df.shape)

before drop na: (170326, 12)
after drop na: (169215, 12)


# d2 firstday

In [24]:
d2firstday_df = traid.to_firstday(d2d1ego_df,
                                  'createdtime_date',
                                  'd2',
                                  ['ego_id', 'd2_alterid'])

# generate set(ego, d2), no order
d2firstday_df['egod2_set'] = d2firstday_df.apply( # set need to convert to str
    lambda x: frozenset([x['ego_id'], x['d2_alterid']]), axis=1).astype('str') 
    # otherwise can not successfully sort value
    # after generate set, it will automatically sort
    # ex (1, 2), (2, 1) will both become (1, 2)
# generate set(ego, d2) duplicates number
d2firstday_df['egod2_duplicate_num'] = d2firstday_df.groupby(['egod2_set'])['ego_id'].transform('size')

# ego -> d2 ; d2 -> ego ; 同一個pair的人在前面在前面keepmin的時候
# 可能會因為順序不同 firstday 有不同的值
# 所以只留下最小的那個最小的那個 firstday 當作真正的firstday
d2firstday_df['d2_firstday_min'] = d2firstday_df.groupby(['egod2_set'])['d2firstday'].transform('min')
d2firstday_df['d1_firstday_min'] = d2firstday_df.groupby(['egod2_set'])['d1firstday'].transform('min')
d2firstday_df = d2firstday_df.drop(columns=['d2firstday', 'd1firstday'], axis=1)
d2firstday_df.rename(columns={'d1_firstday_min':'d1firstday',
                              'd2_firstday_min':'d2firstday'}, inplace=True)
# keep columns and rename
d1_m_df = traid.merge_pre(d1firstday_df,
                          ['d1_alterid', 'ego_id', 'tie_type', 'd1is', 'd1firstday'],
                          {'d1is': 'trans_d2is',
                          'd1_alterid': 'd2_alterid',
                          'd1firstday': 'transferday',
                          'tie_type': 'transfer_tie_type'})
# add transfer or not
d2firstday_df = traid.transfer(d2firstday_df, d1_m_df)
# time cut, use args.time_range
d2firstday_df = traid.filt_date(d2firstday_df, 'd2firstday',
                                args.time_range[0], args.time_range[1])
d2firstday_df['eventover'] = traid.add_eventover(d2firstday_df,
                                                 args.time_range[1],
                                                 'transfer', 'transferday')
d2firstday_df['duration'] = traid.add_duration(d2firstday_df,
                                               'd2firstday', 'eventover',
                                               'duration', 'D')
# add ego firstime

In [27]:
d2firstday_df = d2firstday_df[d2firstday_df['eventover'] <= d2firstday_df['timecut']]

In [30]:
# write csv
d2firstday_df.sort_values(by=['egod2_set'], inplace=True)
if args.write_dta:
  dta_file = args.d2firstday_file.split('c')[0] + 'dta'
  d2firstday_df.to_stata(dta_file, write_index=False)
  print(f'write {dta_file}')
else:
  d2firstday_df.to_csv(args.d2firstday_file, index=False)
  print(f'write {args.d2firstday_file}')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


write ./tmp/d2firstday_departclass.csv


# d2d1ego_df

In [31]:
d2d1ego_df = d2d1ego_df.merge(d2firstday_df[['ego_id', 'd2_alterid', 'd2firstday', 'eventover']], on=['ego_id', 'd2_alterid'], how='left')
print('before drop na d2d1ego:', d2d1ego_df.shape)
d2d1ego_df = d2d1ego_df.dropna(axis=0)
print('after drop na d2d1ego:', d2d1ego_df.shape)
# 回頭檢查，排除掉那些在閉合事件以後才進入網絡的網絡的d1
d2d1ego_df = d2d1ego_df[d2d1ego_df['d1firstday'] < d2d1ego_df['eventover']]

before drop na d2d1ego: (169215, 14)
after drop na d2d1ego: (122118, 14)


In [32]:
# write csv file
# d2d1ego_df.duplicated(subset=['ego_id', 'd1_alterid', 'd2_alterid']).sum() # 0
# 此時的createdtime_date，是該ego-d1組合第一天接觸該d2的日期
if '_merge' in d2d1ego_df.columns:
  d2d1ego_df.drop(labels=['_merge'], axis=1)
if args.write_dta:
  dta_file = args.d2d1ego_file.split('c')[0] + 'dta'
  d2d1ego_df.to_stata(dta_file, write_index=False)
  print(f'write {dta_file}')
else:
  d2d1ego_df.to_csv(args.d2d1ego_file, index=False)
  print(f'write {args.d2d1ego_file}')

write ./tmp/d2d1ego_departclass.csv
