In [1]:
%matplotlib inline

from misc import *

from ipfn import *

import pandas as pd
import matplotlib.pyplot as plt


In [2]:
#Perform one hot encoding on the possible answers for any given question

def ohe_question(index, question):
    col_name = ' '.join(question.split(' ')[:4])
    index = pd.concat([index1,
                       pd.get_dummies(index1[question],
                                      prefix=col_name,
                                      prefix_sep='|')],
                      axis=1)
    
    return index


In [3]:
#Load the survey data

index1 = preprocess(pd.read_excel('data/RawData/IN10001.xls'))
index4 = preprocess(pd.read_excel('data/RawData/IN10004.xls'))
index5 = preprocess(pd.read_excel('data/RawData/IN10005.xls'))
index6 = preprocess(pd.read_excel('data/RawData/IN10006.xls'))
index7 = preprocess(pd.read_excel('data/RawData/IN10007.xls'))

#Create a new dataframe that only contains the demographic information that's included in each survey

common_columns = list(set(index1.columns).intersection(set(index7.columns)))

common_data = pd.concat([index1[common_columns],
                         index4[common_columns],
                         index5[common_columns],
                         index6[common_columns],
                         index7[common_columns]
                        ])

#Drop the columns that don't contain demographic information

common_data.drop(['ID', 'ADID IDFA', 'Time Started', 'Time Finished'], axis=1, inplace=True)

#Drop the columns where area is missing since the distributions for each demographic doesn't differ much

index1 = index1.loc[index1['Area'] != 'Unknown', :]

question = 'How many of your close friends and family are studying at, or have graduated from, a four year college?'
index1 = ohe_question(index1, question)


In [4]:
"""
Raking using the ipfn library requires the following steps demonstrated below:

1. The survey data must be grouped by the variables of interest to obtain the sample marginal frequencies.
2. The resulting marginal frequency column must be named "total" or ipfn will raise an error.
3. Marginal frequencies for each individual variable should be saved as a pandas series.
4. Joint frequencies for pairs of variables should be saved as a pandas series
5. Create a list of the pandas series that contain all of the joint and marginal frequencies.
6. Create a list of lists, with each list containing the dataframe column names corresponding to a joint/marginal frequency.
"""

#Group each survey by gender, race, and age range to obtain the sample marginal frequencies

freqs = pd.DataFrame(index1.groupby(['Gender',
                                     'Age',
                                     "How many of your|All of them",
                                     "How many of your|Don't know",
                                     "How many of your|Less than half",
                                     "How many of your|Most of them",
                                     "How many of your|None",
                                     "How many of your|Only a few"]).size())
freqs = freqs.reset_index()
freqs.columns = ['Gender',
                 'Age',
                 "How many of your|All of them",
                 "How many of your|Don't know",
                 "How many of your|Less than half",
                 "How many of your|Most of them",
                 "How many of your|None",
                 "How many of your|Only a few",
                 'total']
freqs


Unnamed: 0,Gender,Age,How many of your|All of them,How many of your|Don't know,How many of your|Less than half,How many of your|Most of them,How many of your|None,How many of your|Only a few,total
0,female,18 - 24,0,0,0,0,0,1,10
1,female,18 - 24,0,0,0,0,1,0,3
2,female,18 - 24,0,0,0,1,0,0,9
3,female,18 - 24,0,0,1,0,0,0,6
4,female,18 - 24,0,1,0,0,0,0,4
5,female,18 - 24,1,0,0,0,0,0,5
6,female,25 - 34,0,0,0,0,0,1,17
7,female,25 - 34,0,0,0,0,1,0,7
8,female,25 - 34,0,0,0,1,0,0,21
9,female,25 - 34,0,0,1,0,0,0,13


In [5]:
"""
Demographic data is taken from the U.S. Census Bureau's November 2016 edition of the 
Voting and Registration Supplement to the Current Population Survey:

https://www.census.gov/data/tables/time-series/demo/voting-and-registration/p20-580.html

All numbers taken from these files are expressed in thousands.
"""

#Load the marginal age + gender data, taken from:
#https://www2.census.gov/programs-surveys/cps/tables/p20/580/table01.xls


census_age_gender = pd.read_excel('demographics/table01.xls')
census_age_gender.loc[census_age_gender.iloc[:, 0].isnull()]


Unnamed: 0,Table with row headers in column A and column headers in rows 4 through 6.,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
3,,,,Total Citizen Population,Reported registered,,Reported not registered,,No response to registration 1,,Reported voted,,Reported did not vote,,No response to voting 2,,Reported registered,Reported voted
4,,,,,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Percent,Percent
6,,18 to 24 years,29320,26913,14905,55.4,6650,24.7,5358,19.9,11560,43,10171,37.8,5182,19.3,50.8,39.4
7,,25 to 34 years,43794,38283,24682,64.5,7186,18.8,6415,16.8,20332,53.1,11902,31.1,6049,15.8,56.4,46.4
8,,35 to 44 years,39905,34327,23948,69.8,5280,15.4,5099,14.9,20662,60.2,8780,25.6,4885,14.2,60,51.8
9,,45 to 54 years,42259,38301,28001,73.1,4682,12.2,5619,14.7,25012,65.3,7829,20.4,5460,14.3,66.3,59.2
10,,55 to 64 years,41540,39242,29393,74.9,4381,11.2,5468,13.9,26657,67.9,7236,18.4,5350,13.6,70.8,64.2
11,,65 to 74 years,28832,27839,21908,78.7,2502,9,3429,12.3,20219,72.6,4239,15.2,3381,12.1,76,70.1
12,,75 years and over,19852,19154,14759,77.1,1941,10.1,2454,12.8,13095,68.4,3703,19.3,2356,12.3,74.3,66
13,,18 years,3999,3754,1606,42.8,1173,31.2,975,26,1293,34.4,1539,41,922,24.6,40.2,32.3


In [6]:
#Calculate the frequencies for gender, race, and age

gender_freq = pd.Series(data=[census_age_gender.iloc[149, 3], census_age_gender.iloc[77, 3]],
                        index=['female', 'male'])

age_freq = pd.Series(data=[census_age_gender.iloc[6, 3],
                       census_age_gender.iloc[7, 3],
                       census_age_gender.iloc[8, 3],
                       census_age_gender.iloc[9, 3],
                       census_age_gender.iloc[10:13, 3].sum()],
                     index=['18 - 24', '25 - 34', '35 - 44', '45 - 54', '> 54'])


In [7]:
#Group the sample data by age and gender

age_gender_freq = index1.groupby(['Age', 'Gender'])['ID'].count()

#Replace each marginal frequency cell with data taken from the Census Bureau data

age_gender_freq.loc['18 - 24'] = [census_age_gender.iloc[150, 3], census_age_gender.iloc[78, 3]]
age_gender_freq.loc['25 - 34'] = [census_age_gender.iloc[151, 3], census_age_gender.iloc[79, 3]]
age_gender_freq.loc['35 - 44'] = [census_age_gender.iloc[152, 3], census_age_gender.iloc[80, 3]]
age_gender_freq.loc['45 - 54'] = [census_age_gender.iloc[153, 3], census_age_gender.iloc[81, 3]]

#Combine the marginal frequencies for 55-64, 65-74, and >75 to form the >54 marginal frequencies

age_gender_freq.loc['> 54'] = [census_age_gender.iloc[154:157, 3].sum(),
                    census_age_gender.iloc[82:85, 3].sum()]


In [8]:
#Create a list of the marginal/joint frequencies obtained from the census data

aggregates = [gender_freq, age_freq, age_gender_freq]

#Create a list of column names that correspond to the frequency tables in the previous list

dimensions = [['Gender'], ['Age'], ['Age', 'Gender']]

#Perform raking on the demographic data up to the specified maximum number of iterations

IPF = ipfn.ipfn(freqs, aggregates, dimensions, max_iteration=5000)
df = IPF.iteration()

df.columns = ['Gender',
              'Age',
              "How many of your|All of them",
              "How many of your|Don't know",
              "How many of your|Less than half",
              "How many of your|Most of them",
              "How many of your|None",
              "How many of your|Only a few",
              'Population (in thousands)']
df.to_csv('raking_gender_age.csv', index=False)
print(df)


ipfn converged
     Gender     Age  How many of your|All of them  \
0   18 - 24  female                             0   
1   18 - 24  female                             0   
2   18 - 24  female                             0   
3   18 - 24  female                             0   
4   18 - 24  female                             0   
5   18 - 24  female                             1   
6   25 - 34  female                             0   
7   25 - 34  female                             0   
8   25 - 34  female                             0   
9   25 - 34  female                             0   
10  25 - 34  female                             0   
11  25 - 34  female                             1   
12  35 - 44  female                             0   
13  35 - 44  female                             0   
14  35 - 44  female                             0   
15  35 - 44  female                             0   
16  35 - 44  female                             0   
17  35 - 44  female            

  return self._getitem_tuple(key)
  return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  if self.run_code(code, result):
