In [1]:
from utils import *
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# User Tagging


## 1. Demographic Info

In [2]:
# Survey-cycle suffixes to load. `years` and `year_char` are notebook-wide names that later
# cells reuse (and sometimes rebind).
years = ['0304', '0506', '0708', '0910', '1112', '1314', '1516', '1718', '1720']
year_char = 'C'
type_demo = 'demographic'
df_demo = concat_data_across_years(type_demo, 'DEMO', years, year_char)

# Select the wanted columns. Make changes here if needed in the future.
df_demo = df_demo[['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'INDHHINC', 'DMDEDUC2', 'WTINT2YR',
                   'WTMEC2YR', 'WTINTPRP', 'WTMECPRP', 'years']]
# From here on a missing value is encoded as 0, so a 0 in any demographic column may simply
# mean "not reported" (e.g. household_income is 0 for every row of the later cycles shown below).
df_demo = df_demo.fillna(0)
int_columns = ['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'INDHHINC', 'DMDEDUC2']
df_demo[int_columns] = df_demo[int_columns].astype(int)

# Unify the weight columns and make column names readable.
# Because of fillna(0) above, a missing 2-year weight is 0; the former `== -1` test could never
# match, so the *PRP fallback was never used. Fall back whenever the 2-year weight is 0.
# NOTE(review): WTINTPRP / WTMECPRP look like the weights of the combined '1720' cycle - confirm.
df_demo['weight_interview'] = np.where(df_demo['WTINT2YR'] == 0, df_demo['WTINTPRP'], df_demo['WTINT2YR'])
df_demo['weight_mec'] = np.where(df_demo['WTMEC2YR'] == 0, df_demo['WTMECPRP'], df_demo['WTMEC2YR'])
df_demo = df_demo.drop(columns=['WTINT2YR', 'WTMEC2YR', 'WTINTPRP', 'WTMECPRP'])
df_demo = df_demo.rename(columns={'RIAGENDR': 'gender', 'RIDAGEYR': 'age', 'RIDRETH1': 'race', 'DMDEDUC2': 'education',
                                  'INDHHINC': 'household_income'})
# SEQN is kept as a string index; every other table in this notebook is keyed the same way.
df_demo['SEQN'] = df_demo['SEQN'].astype(str)
df_demo = df_demo.set_index('SEQN')

# Drop the weight columns for now.
df_demo = df_demo.drop(columns=['weight_interview', 'weight_mec'])

# Transform ages to age groups.
# With right=True the intervals are (-1, 10], (10, 20], ..., (50, 60], (60, 100] -> labels 1..7.
bins = [-1, 10, 20, 30, 40, 50, 60, 100]
labels = [1, 2, 3, 4, 5, 6, 7]

# Create a new column for age groups. The cast to int fails if an age falls outside (-1, 100],
# because such a row would get no category.
df_demo['age_group'] = pd.cut(df_demo['age'], bins=bins, labels=labels, right=True).astype(int)

In [3]:
# Preview the cleaned demographic table, indexed by SEQN (pandas renders a truncated view).
df_demo

Unnamed: 0_level_0,gender,age,race,household_income,education,years,age_group
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21005,1,19,4,6,0,0304,2
21006,2,16,4,7,0,0304,2
21007,2,14,3,0,0,0304,2
21008,1,17,4,11,0,0304,2
21009,1,55,3,8,3,0304,6
...,...,...,...,...,...,...,...
124818,1,40,4,0,5,1720,4
124819,1,2,4,0,0,1720,1
124820,2,7,3,0,0,1720,1
124821,1,63,4,0,2,1720,7


In [4]:
# Every health tag starts at 0 for every user; the merges further down switch tags to 1.
column_tag = [
    'low_calorie',
    'high_calorie',
    'low_carb',
    'low_protein',
    'high_protein',
    'low_saturated_fat',
    'low_sugar',
    'low_cholesterol',
    'high_fiber',
    'low_sodium',
    'high_potassium',
    'high_iron',
    'high_calcium',
    'high_vitamin_d',
    'high_vitamin_c',
    'low_phosphorus',
    'high_folate_acid',
    'high_vitamin_b12',
]
# One zero-filled column per tag, appended in the order listed above.
df_demo = df_demo.assign(**dict.fromkeys(column_tag, 0))

In [5]:
# Sanity check: no NaN should remain. Note this does not mean the raw data was complete -
# missing demographics were replaced by 0 when df_demo was built, and the tag columns were
# just initialised to 0.
df_demo.isna().sum()

gender               0
age                  0
race                 0
household_income     0
education            0
years                0
age_group            0
low_calorie          0
high_calorie         0
low_carb             0
low_protein          0
high_protein         0
low_saturated_fat    0
low_sugar            0
low_cholesterol      0
high_fiber           0
low_sodium           0
high_potassium       0
high_iron            0
high_calcium         0
high_vitamin_d       0
high_vitamin_c       0
low_phosphorus       0
high_folate_acid     0
high_vitamin_b12     0
dtype: int64

## 2. Special Diet

In [6]:
# Table-type argument handed to concat_data_across_years when loading the dietary files.
type_dietary = 'dietary'

In [7]:
# Load the DR1TOT table for every cycle in `years`; it supplies the DRQSDT* special-diet flags
# used below. (Presumably the day-1 total nutrient intake file - confirm.)
df_TOT1 = concat_data_across_years(type_dietary, 'DR1TOT', years, year_char)

In [8]:
# Special-diet flag columns (DRQSDT*) and the readable diet name each one is renamed to.
mappings_diet = {
    "DRQSDT1": "Weight loss/Low calorie diet",
    "DRQSDT2": "Low fat/Low cholesterol diet",
    "DRQSDT3": "Low salt/Low sodium diet",
    "DRQSDT4": "Sugar free/Low sugar diet",
    "DRQSDT7": "Diabetic diet",
    "DRQSDT8": "Weight gain/Muscle building diet",
    "DRQSDT9": "Low carbohydrate diet",
    "DRQSDT10": "High protein diet",
    "DRQSDT12": "Renal/Kidney diet",
}
# Raw column names, in the same order as the mapping (dicts preserve insertion order).
columns_diet = list(mappings_diet)

In [9]:
# Keep the respondent id plus the special-diet flags; a missing answer counts as
# "not on this diet" (0). Columns are renamed to readable diet names.
df_diet = (
    df_TOT1[["SEQN"] + columns_diet]
    .fillna(0)
    .rename(columns=mappings_diet)
    .astype(int)
)
# Index by SEQN as a string so the table lines up with df_demo.
df_diet = df_diet.assign(SEQN=df_diet['SEQN'].astype(str)).set_index('SEQN')
# Any non-zero answer means the user follows that diet: collapse the codes to 0/1.
df_diet = df_diet.ne(0).astype(int)

In [10]:
# Check general statistics: number of records that report each special diet.
df_diet.sum()

Weight loss/Low calorie diet        4659
Low fat/Low cholesterol diet        1170
Low salt/Low sodium diet            1011
Sugar free/Low sugar diet            413
Diabetic diet                       1361
Weight gain/Muscle building diet     270
Low carbohydrate diet                487
High protein diet                    146
Renal/Kidney diet                     59
dtype: int64

In [11]:
# Keep only the records that report at least one special diet.
df_diet = df_diet.loc[df_diet.ne(0).any(axis=1)]

In [12]:
# Some users follow several diets at once, so their tags have to be combined with care.
# Distribution of the number of diets reported per record:
df_diet.sum(axis=1).value_counts()

1    7275
2     830
3     156
4      33
5       3
6       2
7       2
Name: count, dtype: int64

In [13]:
# Remember the diet columns so they can be dropped once the tags are derived.
# (This rebinds `columns_diet` from the list of raw column names to the current column Index.)
columns_diet = df_diet.columns
# Convert the diets to health tags.
# A tag is switched on when any of its source diets is reported; for 0/1 flags the row-wise
# maximum is the same as OR-ing the columns. The dict order fixes the order of the new columns.
tag_sources = {
    'low_calorie': ['Weight loss/Low calorie diet'],
    'low_saturated_fat': ['Low fat/Low cholesterol diet'],
    'low_sodium': ['Low salt/Low sodium diet', 'Renal/Kidney diet'],
    'low_sugar': ['Sugar free/Low sugar diet', 'Diabetic diet'],
    'high_fiber': ['Diabetic diet'],
    'low_carb': ['Low carbohydrate diet'],
    'high_protein': ['High protein diet', 'Weight gain/Muscle building diet'],
    'low_protein': ['Renal/Kidney diet'],
    'low_phosphorus': ['Renal/Kidney diet'],
    'high_calorie': ['Weight gain/Muscle building diet'],
}
for tag, sources in tag_sources.items():
    df_diet[tag] = df_diet[sources].max(axis=1)
# Only the derived tags are kept.
df_diet.drop(columns_diet, axis=1, inplace=True)

In [14]:
# Combine the health tags for each user: one row per SEQN, where a tag is 1 as soon as
# any of that SEQN's rows carries it.
df_diet = df_diet.groupby(level='SEQN').max()

In [15]:
# Number of users carrying each diet-derived tag.
df_diet.sum()

low_calorie          4659
low_saturated_fat    1170
low_sodium           1063
low_sugar            1729
high_fiber           1361
low_carb              487
high_protein          408
low_protein            59
low_phosphorus         59
high_calorie          270
dtype: int64

In [16]:
def merge_with_or(df1, df2):
    """
    Left-join ``df2`` onto ``df1`` by index and combine the columns they share with a bitwise OR.

    Parameters:
    - df1: base DataFrame; all of its rows are kept. Its shared columns are expected to hold
      integer 0/1 flags without NaN.
    - df2: DataFrame with additional flags. Index labels of ``df1`` that are absent from ``df2``
      contribute 0 to the OR.

    Returns:
    - A merged DataFrame in which every shared column appears once (``df1 | df2``). The combined
      columns are appended after the other columns, in the order they have in ``df1``, so the
      column layout is the same on every run.

    Note: this is a plain left join, so an index label that occurs several times in ``df2``
    produces one output row per occurrence.
    """
    # Merge the DataFrames on their indexes; shared columns get the _df1 / _df2 suffixes.
    merged_df = pd.merge(df1, df2, left_index=True, right_index=True, how='left', suffixes=('_df1', '_df2'))

    # Walk df1's columns instead of a set intersection: the iteration order of a set of strings
    # depends on string hashing, which made the output column order vary between runs.
    shared_columns = [col for col in df1.columns if col in df2.columns]

    for col in shared_columns:
        col_df1 = f'{col}_df1'
        col_df2 = f'{col}_df2'

        # Only the df2 side can contain NaN (rows without a match); treat those as 0 before
        # casting back to int so the bitwise OR is well defined.
        merged_df[col] = merged_df[col_df1] | merged_df[col_df2].fillna(0).astype(int)

        # Drop the suffixed originals now that the combined column exists.
        merged_df = merged_df.drop(columns=[col_df1, col_df2])

    return merged_df

In [17]:
# Start the master table: demographics plus the diet-derived tags, OR-ed into the
# zero-initialised tag columns of df_demo.
df_main = merge_with_or(df_demo, df_diet)

## 3. Medical Info

In [18]:
"""
User Health Status Tagging
"""

def tag_BMI_waist_circumference(row, underweight_BMI=18.5, overweight_BMI=25,
                                waist_threshold_male=102, waist_threshold_female=88):
    """
    Derive the calorie tags of one user from BMI and waist circumference.

    Parameters:
    - row: mapping / Series with the keys 'gender', 'BMXBMI' and 'BMXWAIST'. A 'gender' of 1
      selects the male waist threshold; any other value selects the female one.
    - underweight_BMI: BMI strictly below this value yields high_calorie.
    - overweight_BMI: BMI at or above this value is one of the two overweight conditions.
    - waist_threshold_male, waist_threshold_female: waist circumference at or above the
      gender-specific value is the other overweight condition.

    Returns:
    - (high_calorie, low_calorie) as a tuple of 0/1 integers. At most one of them is 1.
      Missing (NaN) measurements fail every comparison and therefore give (0, 0).
    """
    waist_threshold = waist_threshold_male if row['gender'] == 1 else waist_threshold_female

    # In practice either metric can indicate overweight, but for confidence both must agree.
    # This test comes first so that low_calorie wins if both conditions could ever hold
    # (impossible with the default thresholds, where underweight_BMI < overweight_BMI).
    if row['BMXBMI'] >= overweight_BMI and row['BMXWAIST'] >= waist_threshold:
        return 0, 1

    if row['BMXBMI'] < underweight_BMI:
        return 1, 0

    return 0, 0


def tag_blood_pressure(row):
    """
    Derive the blood-pressure tags of one user.

    Parameters:
    - row: mapping / Series with the keys 'Average_Systolic' and 'Average_Diastolic'.

    Returns:
    - (low_sodium, high_potassium): both 1 when the systolic average is at least 130 or the
      diastolic average is at least 80, otherwise both 0. NaN averages compare as False.
    """
    systolic_cutoff, diastolic_cutoff = 130, 80

    # Either reading at or above its cutoff is enough to raise both tags.
    elevated = (row['Average_Systolic'] >= systolic_cutoff
                or row['Average_Diastolic'] >= diastolic_cutoff)
    flag = 1 if elevated else 0
    return flag, flag

### 3.1. BMI & Waist Circumference

In [19]:
# `type_table` is a notebook-wide name; the blood-pressure cell below reuses this value.
type_table = 'examination'
# Body measures: only the id, BMI and waist circumference are needed.
df_BMI = concat_data_across_years(type_table, 'BMX', years, year_char)[['SEQN', 'BMXBMI', 'BMXWAIST']].copy()
df_BMI['SEQN'] = df_BMI['SEQN'].astype(int).astype(str)
# Bring in the demographic columns of df_main (its index is named SEQN); the tagger needs 'gender'.
df_BMI = df_BMI.merge(df_main, on='SEQN', how='left')
# Overwrite the calorie tags carried over from df_main with the measurement-based ones;
# they are OR-ed back into df_main in the next step.
calorie_flags = df_BMI.apply(tag_BMI_waist_circumference, axis=1, result_type='expand')
df_BMI[['high_calorie', 'low_calorie']] = calorie_flags
df_BMI = df_BMI.set_index('SEQN')


In [20]:
# Quick look for now at the measurement-based calorie tags among users with age > 18
# (18-year-olds are excluded by this filter). Later we will only pay attention to the adults.
df_BMI.loc[df_BMI['age'] > 18][['high_calorie', 'low_calorie']].sum()


high_calorie      899
low_calorie     26906
dtype: int64

In [21]:
# OR the BMI / waist based calorie tags into the master table.
df_main = merge_with_or(df_main, df_BMI[['high_calorie', 'low_calorie']])

### 3.2. Blood Pressure

In [22]:
# The BPX table is loaded for the cycles up to '1718'; the '1720' cycle uses the BPXO table below.
# Note that this rebinds the notebook-wide `years`.
years = ['0304', '0506', '0708', '0910', '1112', '1314', '1516', '1718']
bp_result_columns = ['SEQN', 'Average_Systolic', 'Average_Diastolic']

# BPX: up to four systolic / diastolic readings per user, averaged over the non-missing ones.
df_BP = concat_data_across_years(type_table, 'BPX', years, year_char)
systolic_readings = ['BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']
diastolic_readings = ['BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']
df_BP['Average_Systolic'] = df_BP[systolic_readings].mean(axis=1, skipna=True)
df_BP['Average_Diastolic'] = df_BP[diastolic_readings].mean(axis=1, skipna=True)

# BPXO: three readings per user, averaged the same way.
df_BP_O = concat_data_across_years(type_table, 'BPXO', ['1720'], year_char)
systolic_readings_o = ['BPXOSY1', 'BPXOSY2', 'BPXOSY3']
diastolic_readings_o = ['BPXODI1', 'BPXODI2', 'BPXODI3']
df_BP_O['Average_Systolic'] = df_BP_O[systolic_readings_o].mean(axis=1, skipna=True)
df_BP_O['Average_Diastolic'] = df_BP_O[diastolic_readings_o].mean(axis=1, skipna=True)

# Stack both sources and key them by SEQN as a string.
df_BP_concat = pd.concat([df_BP[bp_result_columns], df_BP_O[bp_result_columns]])
df_BP_concat['SEQN'] = df_BP_concat['SEQN'].astype(int).astype(str)

bp_flags = df_BP_concat.apply(tag_blood_pressure, axis=1, result_type='expand')
df_BP_concat[['low_sodium', 'high_potassium']] = bp_flags
df_BP_concat = df_BP_concat.set_index('SEQN')

In [23]:
# OR the blood-pressure based tags into the master table.
df_main = merge_with_or(df_main, df_BP_concat[['low_sodium', 'high_potassium']])

### 3.3. Low-Density Lipoprotein

In [24]:
# The LDL value (LBDLDLSI) is collected from three loads that differ in table name and year_char.
# This cell rebinds the notebook-wide `type_table`, `year_char` and `years`.
type_table = 'laboratory'
ldl_columns = ['SEQN', 'LBDLDLSI']

# TRIGLY table, cycles '0506' to '1516'.
year_char = 'D'
years = ['0506', '0708', '0910', '1112', '1314', '1516']
df_LDL = concat_data_across_years(type_table, 'TRIGLY', years, year_char)

# TRIGLY table, cycles '1718' and '1720'.
year_char = 'J'
years = ['1718', '1720']
df_LDL_2 = concat_data_across_years(type_table, 'TRIGLY', years, year_char)

# The '0304' cycle takes the value from the L13AM table instead.
df_LDL_3 = concat_data_across_years(type_table, 'L13AM', ['0304'], 'C')

# Stack the three loads and key them by SEQN as a string.
df_LDL = pd.concat([frame[ldl_columns] for frame in (df_LDL, df_LDL_2, df_LDL_3)])
df_LDL['SEQN'] = df_LDL['SEQN'].astype(int).astype(str)

def tag_LDL(row):
    """
    Derive the LDL tags of one user.

    Parameters:
    - row: mapping / Series with the key 'LBDLDLSI'.

    Returns:
    - (low_cholesterol, high_fiber, low_saturated_fat): all 1 when the value is truthy and
      strictly above 3.3, otherwise all 0. NaN is truthy but fails the comparison, so it gives 0s;
      None and 0 are stopped by the truthiness guard.
    """
    threshold = 3.3
    ldl = row['LBDLDLSI']

    # Guard first: a falsy value (None, 0) is never compared against the threshold.
    if not ldl or not (ldl > threshold):
        return 0, 0, 0
    return 1, 1, 1

# Tag every LDL record, then key the table by SEQN for the merge into df_main.
ldl_flags = df_LDL.apply(tag_LDL, axis=1, result_type='expand')
df_LDL[['low_cholesterol', 'high_fiber', 'low_saturated_fat']] = ldl_flags
df_LDL = df_LDL.set_index('SEQN')

In [25]:
# OR the LDL based tags into the master table.
df_main = merge_with_or(df_main, df_LDL[['low_cholesterol', 'high_fiber', 'low_saturated_fat']])

### 3.4. Blood Urea Nitrogen

In [26]:
type_table = 'laboratory'
# LBDSBUSI is the value thresholded by tag_protein (presumably blood urea nitrogen, given the
# section title - confirm). It comes from BIOPRO for the later cycles and from L40 for '0304'.
bun_columns = ['SEQN', 'LBDSBUSI']
bun_cycles = ['0506', '0708', '0910', '1112', '1314', '1516', '1718', '1720']
df_SBP = concat_data_across_years(type_table, 'BIOPRO', bun_cycles, 'D')
df_temp = concat_data_across_years(type_table, 'L40', ['0304'], 'C')
# Stack both loads, keep the two needed columns and key them by SEQN as a string.
df_SBP = pd.concat([df_SBP, df_temp])[bun_columns]
df_SBP['SEQN'] = df_SBP['SEQN'].astype(int).astype(str)

def tag_protein(row):
    """
    Derive the low_protein tag of one user.

    Parameters:
    - row: mapping / Series with the key 'LBDSBUSI'.

    Returns:
    - 1 when the value is at least 7.1, otherwise 0 (a NaN value also gives 0).
    """
    bun_cutoff = 7.1
    if row['LBDSBUSI'] >= bun_cutoff:
        return 1
    return 0

# Tag every record, then key the table by SEQN for the merge into df_main.
bun_flags = df_SBP.apply(tag_protein, axis=1, result_type='expand')
df_SBP['low_protein'] = bun_flags
df_SBP = df_SBP.set_index('SEQN')

In [27]:
# OR the low_protein tag into the master table.
df_main = merge_with_or(df_main, df_SBP[['low_protein']])

### 3.5. Opioid Misuse

In [28]:
type_questionnaire = 'questionnaire'
# No specific illicit drug info in 03-04 data. No such table in 17-20 data.
df_DU = concat_data_across_years(type_questionnaire, 'DUQ',
                                 ['0506', '0708', '0910', '1112', '1314', '1516', '1718'], 'D')
# Keep only the respondents with DUQ290 == 1 (per the notes below: heroin used at least once).
df_DU = df_DU.loc[df_DU['DUQ290'] == 1]
df_DU = df_DU[['SEQN', 'DUQ270U', 'DUQ350U', 'DUQ300', 'DUQ310Q', 'DUQ310U']]
# A missing answer is encoded as -1 from here on.
df_DU = df_DU.fillna(-1).astype(int)

# We only care about those who have used heroin at least once.
# If, within the last year, the user has been using any illicit drug (heroin, meth, cocaine),
# we identify the user as an active user.
# Otherwise, we consider this user a recovered user.
#
# To be specific, -1 means missing and 4 means it has been years since the user last used a drug.
# If any of the three "last used" unit columns holds a value that is neither -1 nor 4,
# the user is an active user.
#
# We label an active user as 1 and a recovered user as 2.
# Everyone else, who has not even used heroin or other opioid prescription drugs,
# is labelled 0 (non-opioid user) later on.
recency_unit_columns = ['DUQ270U', 'DUQ350U', 'DUQ310U']
used_within_a_year = ~df_DU[recency_unit_columns].isin([4, -1])
df_DU['active_user'] = np.where(used_within_a_year.any(axis=1), 1, 2)
df_DU = df_DU.rename(columns={'DUQ300': 'age_first_use_heroin', 'DUQ310U': 'last_time_unit_used_heroin',
                              'DUQ310Q': 'last_time_used_heroin', 'DUQ270U': 'last_time_unit_used_cocaine',
                              'DUQ350U': 'last_time_unit_used_meth'})
df_DU['SEQN'] = df_DU['SEQN'].astype(str)
df_DU = df_DU.set_index('SEQN')

In [29]:
# Restore the notebook-wide cycle list and year_char, which earlier cells rebound.
years = ['0304', '0506', '0708', '0910', '1112', '1314', '1516', '1718', '1720']
year_char = 'C'
df_PM = concat_data_across_years(type_questionnaire, 'RXQ_RX', years, year_char)
# Keep the records with RXDUSE == 1.
df_PM = df_PM[df_PM['RXDUSE'] == 1]
# Prescriptions whose first reason code is F11.2 / F11.23
# (presumably the opioid-dependence reason codes - confirm).
df_PM_1 = df_PM.loc[df_PM['RXDRSC1'].isin(['F11.2', 'F11.23'])]

# Drug lookup table: select the drug ids in category 57 / 58 with third-level class 60 or 191.
# NOTE(review): these look like the opioid analgesic classes - confirm against the RXQ_DRUG docs.
drugs = pd.read_sas('../data/RXQ_DRUG.xpt', encoding='ISO-8859-1')
in_category_57_58 = (drugs['RXDDCI1A'] == 57) & (drugs['RXDDCI1B'] == 58)
drug_60 = drugs[in_category_57_58 & (drugs['RXDDCI1C'] == 60)]
drug_191 = drugs[in_category_57_58 & (drugs['RXDDCI1C'] == 191)]
drug = pd.concat([drug_60, drug_191])
drug_id = set(drug['RXDDRGID'].tolist())
# Union of "drug is in one of the selected classes" and "reason code is F11.2 / F11.23".
df_PM = df_PM[df_PM['RXDDRGID'].isin(drug_id)]
df_PM = pd.concat([df_PM, df_PM_1])
df_PM = df_PM.drop_duplicates()

df_PM = df_PM[['SEQN', 'RXDDRUG', 'RXDDRGID', 'RXDDAYS']]
# We define long-term opioid users as those who have taken opioid prescriptions for over 90 days.
# Note that each user can take multiple opioid prescriptions.
# NOTE(review): confirm whether RXDDAYS uses large sentinel codes for refused / don't know;
# such codes would pass this filter.
# .copy() makes the filtered frame independent, so the column assignments below are not
# writes into a slice of the previous frame.
df_PM = df_PM[df_PM['RXDDAYS'] > 90].copy()
df_PM[['SEQN', 'RXDDAYS']] = df_PM[['SEQN', 'RXDDAYS']].astype(int)
df_PM['SEQN'] = df_PM['SEQN'].astype(str)

# In an earlier study tracking long-term opioid users, the author excluded medications
# containing buprenorphine, since they are used to treat opioid use disorder.
# However, we found multiple cases where this medicine is used to treat opioid dependence,
# so technically it also implies that the user is a long-term opioid user.
df_PM = df_PM.rename(columns={'RXDDRUG': 'drug_name', 'RXDDRGID': 'drug_id', 'RXDDAYS': 'days_using'})
df_PM = df_PM.set_index('SEQN')

In [30]:
# Create labels for opioid users
opioid_user_set = set(df_PM.index.tolist())
# Following the labeling scheme, we get the label in the opioid table. 0: user, 1: active user, 2: recovered_user
df_opioid = df_demo.copy()
df_opioid['opioid_label'] = 0
df_opioid['opioid_label'] = df_opioid.index.map(df_DU['active_user']).fillna(0)
df_opioid.loc[df_opioid.index.isin(opioid_user_set), 'opioid_label'] = 1

df_opioid['opioid_label'].loc[df_opioid['opioid_label'] == 2] = 0
df_opioid = df_opioid[df_opioid['opioid_label'] == 1]
df_opioid[['low_sugar', 'high_protein', 'high_fiber']] = 1


In [31]:
# OR the tags of the active opioid users into the master table.
df_main = merge_with_or(df_main, df_opioid[['low_sugar', 'high_protein', 'high_fiber']])

### 3.6. Diabetes

In [32]:
type_table = 'laboratory'
diabetes_cycles = ['0506', '0708', '0910', '1112', '1314', '1516', '1718', '1720']

# Glucose value (LBDSGLSI): BIOPRO for the later cycles, L40 for '0304'.
df_glucose_1 = concat_data_across_years(type_table, 'BIOPRO', diabetes_cycles, 'D')
df_glucose_2 = concat_data_across_years(type_table, 'L40', ['0304'], 'C')
df_glucose = pd.concat([df_glucose_1, df_glucose_2])[['SEQN', 'LBDSGLSI']]

# Glycohemoglobin value (LBXGH): GHB for the later cycles, L10 for '0304'.
df_ghb_1 = concat_data_across_years(type_table, 'GHB', diabetes_cycles, 'D')
df_ghb_2 = concat_data_across_years(type_table, 'L10', ['0304'], 'C')
df_ghb = pd.concat([df_ghb_1, df_ghb_2])[['SEQN', 'LBXGH']]

# Outer join keeps users who have only one of the two measurements (the other one is NaN).
df_diabete = pd.merge(df_glucose, df_ghb, on='SEQN', how='outer')
df_diabete['SEQN'] = df_diabete['SEQN'].astype(int).astype(str)
df_diabete = df_diabete.set_index('SEQN')

def tag_diabetes(row):
    """
    Flag one user as diabetic from the two lab values.

    Parameters:
    - row: mapping / Series with the keys 'LBDSGLSI' and 'LBXGH'.

    Returns:
    - 1 when LBDSGLSI is at least 7.0 and LBXGH is at least 6.5, otherwise 0.
      Both conditions are required, so a missing (NaN) value on either side gives 0.
    """
    glucose_high = row['LBDSGLSI'] >= 7.0
    ghb_high = row['LBXGH'] >= 6.5
    if glucose_high & ghb_high:
        return 1
    return 0

# Both tags come from the same diabetes flag, so run the row-wise apply once and reuse it
# (tag_diabetes only reads LBDSGLSI and LBXGH, so the result is identical for both columns).
diabetes_flag = df_diabete.apply(tag_diabetes, axis=1, result_type='expand')
df_diabete['low_sugar'] = diabetes_flag
df_diabete['high_fiber'] = diabetes_flag
# OR the diabetes based tags into the master table.
df_main = merge_with_or(df_main, df_diabete[['low_sugar', 'high_fiber']])

### 3.7. Red Blood Cell Count & Hemoglobin

In [33]:
type_table = 'laboratory'
blood_cycles = ['0506', '0708', '0910', '1112', '1314', '1516', '1718', '1720']
# Blood count values: CBC for the later cycles, L25 for '0304'.
df_blood_1 = concat_data_across_years(type_table, 'CBC', blood_cycles, 'D')
df_blood_2 = concat_data_across_years(type_table, 'L25', ['0304'], 'C')
df_blood = pd.concat([df_blood_1, df_blood_2])[['SEQN', 'LBXRBCSI', 'LBXHGB']]
df_blood['SEQN'] = df_blood['SEQN'].astype(int).astype(str)
# Key by SEQN and attach the demographic columns by index; tag_RBC needs 'gender'.
df_blood = df_blood.set_index('SEQN').join(df_demo, how='left')

def tag_RBC(row):
    """
    Flag one user from the red blood cell count and hemoglobin values.

    Parameters:
    - row: mapping / Series with the keys 'gender', 'LBXRBCSI' and 'LBXHGB'. A 'gender' of 1
      selects the male hemoglobin threshold (13.2); any other value selects the female one (11.6).

    Returns:
    - 1 when LBXRBCSI is at most 4 and LBXHGB is strictly below the gender-specific threshold,
      otherwise 0. Missing (NaN) values fail the comparisons and give 0.
    """
    low_threshold_male, low_threshold_female = 13.2, 11.6
    if row['gender'] == 1:
        hgb_cutoff = low_threshold_male
    else:
        hgb_cutoff = low_threshold_female

    low_rbc = row['LBXRBCSI'] <= 4
    low_hgb = row['LBXHGB'] < hgb_cutoff
    if low_rbc & low_hgb:
        return 1
    return 0

# The same flag drives all four tags: compute it once per row and copy it into each column.
anemia_tags = ['high_iron', 'high_vitamin_c', 'high_folate_acid', 'high_vitamin_b12']
anemia_flag = df_blood.apply(tag_RBC, axis=1)
for tag_name in anemia_tags:
    df_blood[tag_name] = anemia_flag
# OR the blood-count based tags into the master table.
df_main = merge_with_or(df_main, df_blood[anemia_tags])

### 3.8. Osteoporosis

In [34]:
type_table = 'questionnaire'
osteoporosis_tags = ['high_calcium', 'high_vitamin_d', 'high_vitamin_c']
# Note that we don't actually have data for 11-12 and 15-16, although both cycles are requested
# here (presumably concat_data_across_years copes with the missing tables - confirm).
ost_cycles = ['0304', '0506', '0708', '0910', '1112', '1314', '1516', '1718', '1720']
df_ost = concat_data_across_years(type_table, 'OSQ', ost_cycles, 'C').drop_duplicates()
df_ost['SEQN'] = df_ost['SEQN'].astype(int).astype(str)
df_ost = df_ost.set_index('SEQN')
# Keep the respondents with OSQ060 == 1 (presumably "told they have osteoporosis" - confirm)
# and switch on all three tags for them.
df_ost = df_ost.loc[df_ost['OSQ060'] == 1].assign(**dict.fromkeys(osteoporosis_tags, 1))
# OR the osteoporosis based tags into the master table.
df_main = merge_with_or(df_main, df_ost[osteoporosis_tags])

In [35]:
# Number of health tags switched on for each user (row-wise sum over the 0/1 tag columns).
df_main['tag_num'] = df_main[column_tag].sum(axis=1)

In [36]:
# Distribution of the number of tags per user.
df_main['tag_num'].value_counts()

tag_num
0     34798
1     28697
3     11063
2      6760
4      5301
5      3777
6      3482
7      1261
8       625
9       432
10      222
11       64
12       24
13       10
Name: count, dtype: int64

In [37]:
# Preview the final tagged table (pandas renders a truncated view).
df_main

Unnamed: 0_level_0,gender,age,race,household_income,education,years,age_group,low_carb,low_phosphorus,low_calorie,...,high_protein,low_sugar,high_fiber,high_iron,high_folate_acid,high_vitamin_b12,high_calcium,high_vitamin_d,high_vitamin_c,tag_num
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000,2,21,4,0,4,1718,3,0,0,1,...,0,0,0,0,0,0,0,0,0,1
100001,1,39,3,0,4,1718,4,0,0,1,...,0,0,0,0,0,0,0,0,0,3
100002,2,9,4,0,0,1718,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100003,2,19,2,0,0,1718,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100004,2,65,2,0,1,1718,7,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2,14,4,0,0,1718,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,2,51,4,0,4,1718,6,0,0,1,...,0,1,1,0,0,0,0,0,0,5
99997,2,64,1,0,2,1718,7,0,0,1,...,0,0,0,1,1,1,0,0,1,6
99998,2,10,1,0,0,1718,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [38]:
# Prefix every tag column with 'user_' in a single rename; all other columns keep their names.
df_main = df_main.rename(columns={column: 'user_' + column for column in column_tag})

In [40]:
# Persist the tagged user table; the SEQN index is written as the first CSV column.
df_main.to_csv('../processed_data/user_tagging.csv')