In [1]:
# created by Yu-Shin 20221113
# last adjust: 20221113
import os
import csv
import json
from google.colab import drive
import pandas as pd
import numpy as np
from glob import glob
import re
import time
import numexpr
import datetime as dtt
from datetime import datetime
import argparse
from pathlib import Path
import os
import sys

# args

In [2]:
def parse_args():
  parser = argparse.ArgumentParser(
      description='combine all the variable and count mean, dummy, duration'
  )
  parser.add_argument(
      '--ch_dir',
      type=str,
      default='/content/drive/MyDrive/fb_college/traid'
  )
  # ----------------------------------> type json file
  parser.add_argument(
      '--dtype_file',
      type=str,
      default='./tmp/dtype.json'
  )
  parser.add_argument(
      '--parse_dates_file',
      type=str,
      default='./tmp/parse_dates.json'
  )
  parser.add_argument(
      '--varName2file',
      type=str,
      default='./tmp/varName2file.json'
  )
  # -----------------------> set model parameter
  parser.add_argument(
      '--time_range',
      type=list,
      help='research time range',
      default=['2012-09-30', '2015-06-30'],
  )
  parser.add_argument(
      '--res_filt',
      type=float,
      help='filt response rate',
      default=0.7,
  )
  parser.add_argument(
      '--tie_mode',
      type=str,
      help='tie type, if departclass then only analysis in class tie',
      default='departclass',
  )
  parser.add_argument(
      '--var_level',
      type=str,
      help='compute tie level, can chose ego, d1 or d2',
      default='d1',
  )
  # --------------------------> raw data file
  parser.add_argument(
      '--survey_file',
      type=str,
      help='raw survey file',
      default='../raw/csv/survey.csv'
  )
  # --------------------------> preprocess file
  parser.add_argument(
      '--all_tie_file',
      type=str,
      default='./tmp/all_tie_departclass.csv',
  )
  parser.add_argument(
      '--egolist_file',
      type=str,
      help='research ego list, default is from the class reponse rate > 0.7',
      default='./tmp/egores0.7.csv'
  )
  parser.add_argument(
      '--egofirstime_file',
      type=str,
      help='firstime ego use face book',
      default='./tmp/ego_firstime.csv'
  )
  parser.add_argument(
      '--d1firstday_file',
      type=str,
      help='d1 first time',
      default='./tmp/d1firstday_departclass.csv'
  )
  parser.add_argument(
      '--d2firstday_file',
      type=str,
      help='d1 first time',
      default='./tmp/d2firstday_departclass.csv'
  )
  parser.add_argument(
      '--d2d1ego_file',
      type=str,
      help='d2-d1 level contain their ego',
      default='./tmp/d2d1ego_departclass.csv'
  )
  # ------------------------> dir and coding mode
  parser.add_argument(
      '--output_dir',
      type=str,
      help='file output dir',
      default='./outcome/',
  )
  parser.add_argument(
      '--var_file_dir',
      type=str,
      help='var file dir',
      default='./tmp/',
  )
  parser.add_argument(
      '--isDebug',
      type=bool,
      help='if true, only read 100 line',
      default=False,
  )
  parser.add_argument(
      '--debug_n',
      type=int,
      help='if in debug mode read n rows',
      default=100,
  )
  parser.add_argument(
      '--write_dta',
      type=bool,
      help='whether to write dta stata file or not',
      default=False,
  )
  args, unknown = parser.parse_known_args()
  return args
args = parse_args()

In [3]:
# set up working dir
os.chdir(args.ch_dir)
# from utils import onlineVar # 自定義自定義package
# set up dataframe display format
# read dtype and time columns file
dtype = json.loads(Path(args.dtype_file).read_text())
parse_dates = json.loads(Path(args.parse_dates_file).read_text())
varName2file = json.loads(Path(args.varName2file).read_text())

# read file

In [4]:
def Path2fileName(filePath):
  return filePath.split('/')[-1]

In [5]:
# read d2
d2firstday_df = pd.read_csv(args.d2firstday_file,
                            parse_dates=parse_dates[Path2fileName(args.d2firstday_file)],
                            dtype=dtype,
                            #usecols=usecols,
                            )#, nrows=1000)

# survey file
usecols = ['OwnerFbid', 'response','gender', 'birthyear', 'extra']
survey_df = pd.read_csv(args.survey_file,
                        dtype=dtype,
                        usecols=usecols,
                        )              

# category 1 variable: ego survey data

In [6]:
def add_offline(dt, female_col, age_col, extra_col):
  # reverse the original order, high score of egoextra means extra
  dt['egoextra'] = dt[extra_col].astype('str').replace(
      {'1': '4',
       '1.5': '3.5',
       '2': '3',
       '3': '2',
       '3.5': '1.5',
       '4': '1'}).astype('float')
  # recode gender, 2 => 1, 1 => 0, female is 1, male is 0
  dt['egofemale'] = np.where(dt[female_col] == 2, 1, 0)
  # age = 2016 - birth year
  dt['egoage'] = 2016 - dt[age_col]
  # dt = dt[['OwnerFbid', 'egofemale', 'egoage', 'egoextra', 'res70up']]
  dt['egoeduyr'] = 16
  dt['ego_id'] = dt['OwnerFbid']
  return dt

ego_offline_df = add_offline(survey_df, 'gender', 'birthyear', 'extra')

In [7]:
def keepboth_merge(dt, m_dt, on_col):
  if '_merge' in dt.columns:
    dt = dt.drop(labels=['_merge'], axis=1)
  dt = dt.merge(m_dt, on=on_col, how='left', indicator=True)
  dt = dt[dt['_merge'] == 'both']
  dt = dt.drop(labels=['_merge'], axis=1)
  return dt
d2firstday_df = keepboth_merge(d2firstday_df, ego_offline_df, 'ego_id')

# category 4 variable: d2 online

## read file and add d1 ego var

In [8]:
alters = ['ego', 'd2']
alter_ids = ['ego_id', 'd2_alterid']
vars = ['com', 'tag', 'lcc']

# append var to d2 firstday df
for alter in alters:
  for var in vars:
    varName = f'{alter}_{var}'
    # in var df, we only use the alter id + var we need
    usecols = [alter for alter in alter_ids]
    usecols.append(varName)
    filePath = args.var_file_dir + varName2file[alter][var]['file_name']
    var_df = pd.read_csv(filePath,
                         dtype=dtype,
                         usecols=usecols,
                         )
    if '_merge' in d2firstday_df.columns:
      d2firstday_df = d2firstday_df.drop(labels='_merge')
    d2firstday_df = d2firstday_df.merge(var_df,
                              on=alter_ids,
                              )

In [9]:
# d2_comment1 d2_posttag1 d2_comment_mon d2_posttag_mon
def add_comtag_model(dt, com_col, tag_col, start, end, alter):
  # add dummy variable
  dummy_com_col = str(alter) + '_comment1'
  dummy_tag_col = str(alter) + '_posttag1'
  dt[dummy_com_col] = np.where(dt[com_col] > 0, 1, 0)
  dt[dummy_tag_col] = np.where(dt[tag_col] > 0, 1, 0)
  # add monthly average
  # count how many month away from start time to end time
  duration_col = str(alter) + 'varduration'
  dt[duration_col] = (dt[end] - dt[start]) / np.timedelta64(1, 'M')
  mon_com_col = str(alter) + '_comment_mon'
  mon_tag_col = str(alter) + '_posttag_mon'
  dt[mon_com_col] = dt[com_col] / dt[duration_col]
  dt[mon_tag_col] = dt[tag_col] / dt[duration_col]
  return dt

def add_fbday(df, start, end, fbday_col):
  df[fbday_col] = (df[end] - df[start]) / np.timedelta64(1, 'D')
  fbday_col100 = str(fbday_col) + str(100)
  df[fbday_col100] = df[fbday_col] / 100
  df['egoage_fb100'] = df['egoage'] * df[fbday_col100]
  return df

# d2 ego add comment and tag
d2firstday_df = add_comtag_model(d2firstday_df, 'ego_com', 'ego_tag', 'egofirstime', 'd2firstday', 'ego')
d2firstday_df = add_comtag_model(d2firstday_df, 'd2_com', 'd2_tag', 'd2firstday', 'eventover', 'd2')

d2firstday_df = add_fbday(d2firstday_df,
                          'egofirstime', 'd2firstday', 'ego_fbday')

# category 3 variable: d1 alter online

d1_tieego_mean100
d1_neighbor_mean_1 d1_neighbor_mean_2
d1_density_mean
d1_comment_mean1 d1_posttag_mean1 d1_comment_mon_mean d1_posttag_mon_mean



In [12]:
# read alter_dt
# usecols = ['ego_id' ,'d2_alterid', 'd1_alterid', 'd2firstday', 'createdtime_date']
d2d1ego_df = pd.read_csv(args.d2d1ego_file,
                         dtype=dtype,
                         parse_dates=parse_dates[Path2fileName(args.d2d1ego_file)])

In [13]:
def keepboth_merge(dt, m_dt, on_col):
  if '_merge' in dt.columns:
    dt = dt.drop(labels='_merge', axis=1)
    
  dt = dt.merge(m_dt, on=on_col, how='left', indicator=True)
  dt = dt[dt['_merge'] == 'both']
  dt = dt.drop(labels=['_merge'], axis=1)
  return dt

def add_d1var(file, dt, var_col, on_cols):
  parse_dates = ['d1firstday', 'createdtime_date']
  usecols = on_cols + [var_col]
  var_dt = pd.read_csv(file,
                       dtype=dtype, usecols=usecols)
  var_dt = var_dt.drop_duplicates(subset=on_cols)
  dt = keepboth_merge(dt, var_dt, on_cols)
  return dt

def add_d1tieego(df, start, end, d1tieego_col):
  df[d1tieego_col] = (df[end] - df[start]) / np.timedelta64(1, 'D')
  d1tieego_col100 = str(d1tieego_col) + str(100)
  df[d1tieego_col100] = df[d1tieego_col] / 100
  return df

d2d1ego_df = add_d1var('./tmp/d1_com.csv', d2d1ego_df,
                       'd1_com', ['ego_id', 'd1_alterid', 'd2_alterid'])
d2d1ego_df = add_d1var('./tmp/d1_tag.csv', d2d1ego_df,
                       'd1_tag', ['ego_id', 'd1_alterid', 'd2_alterid'])
d2d1ego_df = add_d1var('./tmp/d1_lcc.csv', d2d1ego_df,
                       'd1_lcc', ['ego_id', 'd1_alterid', 'd2_alterid'])

d2d1ego_df = add_comtag_model(d2d1ego_df, 'd1_com', 'd1_tag', 'd1firstday', 'eventover', 'd1')

d2d1ego_df = add_d1tieego(d2d1ego_df,
                          'd1firstday', 'createdtime_date', 'd1_tieego')

In [14]:
def add_d1neighbor(dt):
  dt = dt.rename(columns={'egod1_d2num': 'd1_neighbor_1'})
  dt['d1_neighbor_2'] = dt['d1_neighbor_1'] * dt['d1_neighbor_1']
  return dt
d2d1ego_df = add_d1neighbor(d2d1ego_df)

In [15]:
def drop_outliner(dt, target_cols, bound):
  for target_col in target_cols:
    before = dt.shape[0]
    print("shape before drop:", dt.shape)
    dt = dt[dt[target_col] < bound]
    print("shape after drop:", dt.shape)
    after = dt.shape[0]
    print('total drop rate', ((before-after)/before), '%')
  return dt
d2d1ego_df = drop_outliner(d2d1ego_df,
                    ['d1_comment_mon', 'd1_posttag_mon'],
                    1500)

shape before drop: (120290, 24)
shape after drop: (119287, 24)
total drop rate 0.008338182725081054 %
shape before drop: (119287, 24)
shape after drop: (119248, 24)
total drop rate 0.0003269425838523896 %


In [16]:
def add_d1var_mean(dt, target_cols, groupby_cols):
  mean_cols = []
  for col in target_cols:
    mean_col = col + '_mean' # mean column's name = former name_mean
    mean_cols.append(mean_col) # record mean columns
    # compute d1 variable's mean by target_col
    dt[mean_col] = dt.groupby(by=groupby_cols)[col].transform(np.mean)
  dt = dt.sort_values(by=groupby_cols) # sort by ego_id d2_alterid
  dt = dt[dt['d2_alterid'] != '0'] # drop if d2 id is missing
  dt = dt[groupby_cols + mean_cols] # only keep target mean columns
  dt = dt.drop_duplicates(subset=groupby_cols) # d2-d1-ego to d2-ego format
  return dt
add_d1var_df = add_d1var_mean(d2d1ego_df, 
                              ['d1_tieego100',
                               'd1_neighbor_1', 'd1_neighbor_2',
                               'd1_comment1', 'd1_comment_mon',
                               'd1_posttag1', 'd1_posttag_mon',
                               'd1_lcc'],
                        ['ego_id', 'd2_alterid'])

In [17]:
d2survival_df = d2firstday_df.merge(add_d1var_df,
                                    on=['ego_id', 'd2_alterid'],
                                    how='left',
                                    )

In [18]:
d2survival_df = d2survival_df.rename(columns={'ego_lcc': 'ego_density',
                                      'd1_lcc_mean': 'd1_density_mean',
                                      'd1_tieego100_mean': 'd1_tieego_mean100',
                                      'd1_neighbor_1_mean': 'd1_neighbor_mean_1',
                                      'd1_neighbor_2_mean': 'd1_neighbor_mean_2',
                                      'd1_comment1_mean': 'd1_comment_mean1',
                                      'd1_posttag1_mean': 'd1_posttag_mean1',
                                      'd2_lcc': 'd2_density'})

# outliner/異常處理（評論/tag每個月大於3000的剔除掉）

In [19]:
d2survival_df = drop_outliner(d2survival_df,
                    ['ego_comment_mon', 'ego_posttag_mon',
                     'd2_comment_mon', 'd2_posttag_mon'], 1500)

shape before drop: (22421, 55)
shape after drop: (22397, 55)
total drop rate 0.001070425047946122 %
shape before drop: (22397, 55)
shape after drop: (22392, 55)
total drop rate 0.00022324418448899406 %
shape before drop: (22392, 55)
shape after drop: (22103, 55)
total drop rate 0.012906395141121829 %
shape before drop: (22103, 55)
shape after drop: (22047, 55)
total drop rate 0.002533592724969461 %


In [20]:
def drop_negative(dt, target_cols):
  for target_col in target_cols:
    dt = dt[dt[target_col] >= 0]
  return dt

In [21]:
# 負的數字drop掉(通常是時間段出了問題，因為duration極小而產生的誤差)
d2survival_df = drop_negative(d2survival_df,
                    ['ego_comment_mon', 'ego_posttag_mon',
                     'd2_comment_mon', 'd2_posttag_mon',
                     'd1_posttag_mon_mean', 'd1_comment_mon_mean'])

# add other variable
trans_type

In [22]:
def recode_trans_d2is(trans_d2is):
  if trans_d2is == 'from_id':
    return 1
  elif trans_d2is == 'from_id_p':
    return 2
  elif trans_d2is == '0':
    return 0
def add_recode_col(dt, target_col, new_col, recode_fun):
  dt[new_col] = dt[target_col].fillna('0')
  dt[new_col] = dt[target_col].apply(recode_fun)
  dt[new_col] = dt[new_col].fillna(0)
  return dt

In [23]:
# convert string to numeric
d2survival_df['d2_density'] = pd.to_numeric(d2survival_df['d2_density'],errors='coerce')
d2survival_df['ego_density'] = pd.to_numeric(d2survival_df['ego_density'],errors='coerce')

In [24]:
d2survival_df = add_recode_col(d2survival_df, 'trans_d2is', 'trans_type', recode_trans_d2is)

In [25]:
d2survival_df.to_csv('./outcome/d2survival_1115.csv', index=False)
d2survival_df.to_stata('../traid/outcome/d2survival_inclass_1115.dta', write_index=False)