In [1]:
import pandas as pd
import numpy as np

# Converting Religion Columns

In [2]:
# Load the 2012 survey file as a worked example. convert_categoricals=False is
# passed so the coded values stay numeric; rows are then keyed by CASEID.
drugs_2012 = (
    pd.read_stata('Data/2012.dta', convert_categoricals=False)  # 2012 example
    .set_index('CASEID')
)
drugs_2012.head()

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CG30EST,...,IIEMPSTY,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,50886467,2,4,4,991,9991,91,91,91,91,...,1,1,99,9,9,2,2,1275.6,30054,2
2,13766883,2,99,99,991,9991,91,91,91,91,...,1,1,1,1,1,2,2,5191.07,30031,1
3,17772877,2,99,99,991,9991,91,91,91,91,...,1,1,1,1,1,3,3,419.74,30056,2
4,45622817,1,99,99,13,9999,99,2,93,93,...,1,1,2,1,1,2,2,1449.3,30054,1
5,17239390,1,99,99,11,9999,99,4,93,93,...,1,1,1,1,1,1,1,15344.29,30012,2


Religion-related columns examined in this notebook:

1. `SNRLGSVC`: PAST 12 MOS, HOW MANY RELIG. SERVICES
2. `SNRLGIMP`: MY RELIGIOUS BELIEFS ARE VERY IMPORTANT
3. `SNRLDCSN`: MY RELIGIOUS BELIEFS INFLUENCE MY DECISIONS
4. `SNRLFRND`: IT IS IMPORTANT THAT MY FRIENDS SHARE REL BELIEFS

In [3]:
# Lookup tables: column name -> {numeric survey code -> readable label}.
# The three belief columns use the same four-point agreement scale, so their
# tables are generated together; each column still gets its own dict object.
religion_dict = {
    'SNRLGSVC': {
        1: '0',
        2: '1-2',
        3: '3-5',
        4: '6-24',
        5: '25-52',
        6: '52+',
    },
    **{
        belief_col: {
            1: 'Strongly Disagree',
            2: 'Disagree',
            3: 'Agree',
            4: 'Strongly Agree',
        }
        for belief_col in ('SNRLGIMP', 'SNRLDCSN', 'SNRLFRND')
    },
}

In [4]:
def endcode_col(df, col, mapping=None):
    '''
    Encodes the column values of the dataframe to more readable description.

    The column is overwritten in place on df. Codes present in the mapping are
    replaced by their label. The missing-data code 99 and NaN both become the
    string '99'.

    param df: DataFrame (its column `col` is replaced in place)
    param col: Column name
    param mapping: optional dict of code -> label; when omitted, the table for
                   this column is taken from religion_dict
    return: None
    raises KeyError: if the column holds a code that is not in the mapping and
                     is neither 99 nor NaN
    '''
    if mapping is None:
        mapping = religion_dict[col]

    codes = df[col]
    is_missing = codes.isna() | (codes == 99)

    # Fail loudly and name the offending codes instead of surfacing a bare
    # KeyError from deep inside an element-wise lookup.
    unexpected = codes[~is_missing & ~codes.isin(list(mapping))]
    if not unexpected.empty:
        raise KeyError(
            'Unexpected codes in column {!r}: {}'.format(col, unexpected.unique().tolist())
        )

    # Vectorised lookup; missing entries are forced to '99' afterwards so that
    # 99 always wins even if the mapping happens to contain it.
    df[col] = codes.map(mapping).where(~is_missing, '99')

In [5]:
def convert_religion_cols(df):
    '''
    Converts the religion columns into a more readable description and returns a new DataFrame.

    The input frame is not modified: the religion columns are selected into a
    separate frame, codes from 85 to 99 are collapsed to the single code 99,
    and each column is then decoded with endcode_col.

    param df: DataFrame containing the SNRLGSVC, SNRLGIMP, SNRLDCSN and SNRLFRND columns
    return: DataFrame with only religious columns, with the input's index
            moved into a regular column by reset_index()
    '''
    religion_cols = ['SNRLGSVC','SNRLGIMP','SNRLDCSN','SNRLFRND'] # columns related to religion
    religion_df = df[religion_cols] # creating new df with only religion_cols

    # converting bad/missing data (codes 85-99) to one value.
    # Done as a vectorised mask rather than element-wise DataFrame.applymap,
    # which recent pandas releases deprecate. NaN compares False on both sides
    # and is therefore left as-is, exactly as the element-wise test did.
    is_bad_code = (religion_df >= 85) & (religion_df <= 99)
    religion_df = religion_df.mask(is_bad_code, 99)

    # converting religion column values
    for col in religion_cols:
        endcode_col(religion_df, col)

    return religion_df.reset_index()

In [6]:
# Try the conversion on the 2012 frame; the returned DataFrame is shown as the cell output.
convert_religion_cols(drugs_2012)

Unnamed: 0,CASEID,SNRLGSVC,SNRLGIMP,SNRLDCSN,SNRLFRND
0,1,99,99,99,99
1,2,1-2,Agree,Agree,Disagree
2,3,52+,Strongly Agree,Strongly Agree,Disagree
3,4,6-24,Agree,Agree,Strongly Disagree
4,5,1-2,Strongly Disagree,Strongly Disagree,Strongly Disagree
...,...,...,...,...,...
55263,55264,0,Agree,Agree,Agree
55264,55265,99,99,99,99
55265,55266,0,99,99,99
55266,55267,6-24,Agree,Strongly Agree,Agree


## Converting all Data and Combining

In [7]:
# loading in data
# One Stata file per survey year, all read the same way: convert_categoricals=False
# is passed to read_stata and each frame is indexed by CASEID. A single generator
# replaces five copy-pasted read calls; the five names bound are unchanged.
drugs_2010, drugs_2011, drugs_2012, drugs_2013, drugs_2014 = (
    pd.read_stata('Data/{}.dta'.format(year), convert_categoricals=False).set_index('CASEID')
    for year in range(2010, 2015)
)

In [8]:
# Reduce each yearly frame to its decoded religion columns and tag every row
# with the survey year it came from (YEAR is appended as the last column).
drugs_2010 = convert_religion_cols(drugs_2010).assign(YEAR=2010)
drugs_2011 = convert_religion_cols(drugs_2011).assign(YEAR=2011)
drugs_2012 = convert_religion_cols(drugs_2012).assign(YEAR=2012)
drugs_2013 = convert_religion_cols(drugs_2013).assign(YEAR=2013)
drugs_2014 = convert_religion_cols(drugs_2014).assign(YEAR=2014)

In [9]:
# Stack the five yearly frames into one; ignore_index=True gives the combined
# frame a fresh 0..n-1 row index instead of repeating each year's own.
all_drugs = pd.concat(
    objs=[drugs_2010, drugs_2011, drugs_2012, drugs_2013, drugs_2014],
    ignore_index=True,
)

In [10]:
# Report the (rows, columns) of the combined frame, then display its first rows.
print(all_drugs.shape)
all_drugs.head()

(281409, 6)


Unnamed: 0,CASEID,SNRLGSVC,SNRLGIMP,SNRLDCSN,SNRLFRND,YEAR
0,1,6-24,Agree,Agree,Agree,2010
1,2,3-5,Disagree,Disagree,Strongly Disagree,2010
2,3,99,99,99,99,2010
3,4,1-2,Agree,Agree,Disagree,2010
4,5,6-24,Strongly Agree,Strongly Agree,Agree,2010
