In [11]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt

from ipfn import *


In [12]:
def preprocess(data):
    """
    Drop irrelevant columns and combine categories to match the suggested demographics.

    Suggested Demographics
    ----------------------
    Party: Democratic, Independent, Republican
    Gender: Male, Female
    Age: 18-24, 25-34, 35-44, 45-54, >54
    Race: White, Black, Hispanic, Other Race
    Education: No Bachelor, Bachelors
    """
    index = data.copy()
    index.drop(['Country'], axis=1, inplace=True)
    
    #Remove rows where the respondent is in the 14-17 age range
    
    index = index[index.loc[:, 'Age'] != '14 - 17']
    
    #Combine the hispanic and latino categories
    
    index['Race'] = index['Race'].str.replace('latino', 'hispanic')
    
    #Combine other races into the "other" category
        
    races = ['white', 'black', 'hispanic', 'other']
    index.loc[~index['Race'].isin(races), 'Race'] = 'other'
    
    #Seperate education levels into "bachelors" and "no bachelors"
    
    bachelors = ['university', 'postgraduate']
    no_bachelors = ['high_school', 'other', 'vocational_technical_college', 'middle_school']

    index.loc[index['Education'].isin(bachelors), 'Education'] = 'bachelors'
    index.loc[index['Education'].isin(no_bachelors), 'Education'] = 'no bachelors'
    
    #Combine political affiliations to compensate for respondents who claim to be independent but aren't
    
    republican = ['Strong Republican', 'Weak Republican', 'Lean Republican/Independent']
    democrat = ['Strong Democrat', 'Weak Democrat', 'Lean Democrat/Independent']
    
    index.loc[index['What is your political party affiliation?'].isin(republican), 
              'What is your political party affiliation?'] = 'Republican'
    index.loc[index['What is your political party affiliation?'].isin(democrat),
              'What is your political party affiliation?'] = 'Democrat'
    
    return index

def filter(data, party=None, gender=None, age=None, race=None, education=None):
    """
    Filter a survey dataframe based on demographics, ignoring columns that have been dropped.
    """
    results = data
    
    for index, col in enumerate(data.columns):
        if col == 'What is your political party affiliation?':
            party_col = index
        elif col == 'Gender':
            gender_col = index
        elif col == 'Age':
            age_col = index
        elif col == 'Race':
            race_col = index
        elif col == 'Education':
            edu_col = index
    
    if party != None and 'What is your political party affiliation?' in data.columns:
        results = results[results.iloc[:, party_col].str.find(party) > -1]
        
    if gender != None and 'Gender' in data.columns:
        results = results[results.iloc[:, gender_col].str.startswith(gender)]
        
    if age != None and 'Age' in data.columns:
        results = results[results.iloc[:, age_col].str.startswith(age)]
        
    if race != None and 'Race' in data.columns:
        results = results[results.iloc[:, race_col].str.startswith(race)]
        
    if education !=None and 'Education' in data.columns:
        results = results[results.iloc[:, edu_col].str.contains(education)]
        
    return results
    

In [13]:
#Load the survey data

index1 = preprocess(pd.read_excel('data/RawData/IN10001.xls'))
index4 = preprocess(pd.read_excel('data/RawData/IN10004.xls'))
index5 = preprocess(pd.read_excel('data/RawData/IN10005.xls'))
index6 = preprocess(pd.read_excel('data/RawData/IN10006.xls'))
index7 = preprocess(pd.read_excel('data/RawData/IN10007.xls'))

#Create a new dataframe that only contains the demographic information that's included in each survey

common_columns = list(set(index1.columns).intersection(set(index7.columns)))

common_data = pd.concat([index1[common_columns],
                         index4[common_columns],
                         index5[common_columns],
                         index6[common_columns],
                         index7[common_columns]
                        ])

#Drop the columns that don't contain demographic information

common_data.drop(['ID', 'ADID IDFA', 'Time Started', 'Time Finished'], axis=1, inplace=True)


In [14]:
"""
Raking using the ipfn library requires the following steps demonstrated below:

1. The survey data must be grouped by the variables of interest to obtain the sample marginal frequencies.
2. The resulting marginal frequency column must be named "total" or ipfn will raise an error.
3. Marginal frequencies for each individual variable should be saved as a pandas series.
4. Joint frequencies for pairs of variables should be saved as a pandas series
5. Create a list of the pandas series that contain all of the joint and marginal frequencies.
6. Create a list of lists, with each list containing the dataframe column names corresponding to a joint/marginal frequency.
"""

#Group each survey by gender, race, and age range to obtain the sample marginal frequencies

freqs = pd.DataFrame(index1.groupby(['Gender', 'Race', 'Age', 'Education']).size())
freqs = freqs.reset_index()
freqs.columns = ['Gender', 'Race', 'Age', 'Education', 'total']
freqs


Unnamed: 0,Gender,Race,Age,Education,total
0,female,black,18 - 24,no bachelors,2
1,female,black,25 - 34,bachelors,1
2,female,black,25 - 34,no bachelors,3
3,female,black,35 - 44,bachelors,1
4,female,black,35 - 44,no bachelors,1
5,female,hispanic,18 - 24,bachelors,2
6,female,hispanic,18 - 24,no bachelors,12
7,female,hispanic,25 - 34,bachelors,5
8,female,hispanic,25 - 34,no bachelors,8
9,female,hispanic,35 - 44,bachelors,2


In [15]:
"""
Demographic data is taken from the U.S. Census Bureau's November 2016 edition of the 
Voting and Registration Supplement to the Current Population Survey:

https://www.census.gov/data/tables/time-series/demo/voting-and-registration/p20-580.html

All numbers taken from these files are expressed in thousands.
"""

#Load the marginal age + gender data 

census_age_gender = pd.read_excel('demographics/table01.xls')
census_age_gender.loc[census_age_gender.iloc[:, 0].isnull()]


Unnamed: 0,Table with row headers in column A and column headers in rows 4 through 6.,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
3,,,,Total Citizen Population,Reported registered,,Reported not registered,,No response to registration 1,,Reported voted,,Reported did not vote,,No response to voting 2,,Reported registered,Reported voted
4,,,,,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Percent,Percent
6,,18 to 24 years,29320,26913,14905,55.4,6650,24.7,5358,19.9,11560,43,10171,37.8,5182,19.3,50.8,39.4
7,,25 to 34 years,43794,38283,24682,64.5,7186,18.8,6415,16.8,20332,53.1,11902,31.1,6049,15.8,56.4,46.4
8,,35 to 44 years,39905,34327,23948,69.8,5280,15.4,5099,14.9,20662,60.2,8780,25.6,4885,14.2,60,51.8
9,,45 to 54 years,42259,38301,28001,73.1,4682,12.2,5619,14.7,25012,65.3,7829,20.4,5460,14.3,66.3,59.2
10,,55 to 64 years,41540,39242,29393,74.9,4381,11.2,5468,13.9,26657,67.9,7236,18.4,5350,13.6,70.8,64.2
11,,65 to 74 years,28832,27839,21908,78.7,2502,9,3429,12.3,20219,72.6,4239,15.2,3381,12.1,76,70.1
12,,75 years and over,19852,19154,14759,77.1,1941,10.1,2454,12.8,13095,68.4,3703,19.3,2356,12.3,74.3,66
13,,18 years,3999,3754,1606,42.8,1173,31.2,975,26,1293,34.4,1539,41,922,24.6,40.2,32.3


In [16]:
#Load the race data

census_race = pd.read_excel('demographics/table04b.xls')
census_race


Unnamed: 0,Table with row headers in column A and column headers in rows 4 through 5.,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,"Table 4b. Reported Voting and Registration, b...",,,,,,,,,,,,,
1,(In thousands),,,,,,,,,,,,,
2,STATE,"Sex, Race and Hispanic-Origin",Total Population,Total Citizen Population,Registered,,,,,Voted,,,,
3,,,,,Total registered,Percent registered\n(Total),Margin of Error 1,Percent registered\n(Citizen),Margin of Error 1,Total voted,Percent voted\n(Total),Margin of Error 1,Percent voted\n(Citizen),Margin of Error 1
4,US,Total,245502,224059,157596,64.2,0.3,70.3,0.3,137537,56,0.3,61.4,0.3
5,,Male,118488,107554,73761,62.3,0.4,68.6,0.4,63801,53.8,0.4,59.3,0.5
6,,Female,127013,116505,83835,66,0.4,72,0.4,73735,58.1,0.4,63.3,0.4
7,,White alone,192129,177865,127463,66.3,0.3,71.7,0.3,111891,58.2,0.3,62.9,0.4
8,,White non-Hispanic alone,157395,154450,114151,72.5,0.3,73.9,0.3,100849,64.1,0.4,65.3,0.4
9,,Black alone,30608,28808,19984,65.3,1,69.4,1,17119,55.9,1.1,59.4,1.1


In [17]:
#Load the marginal race + gender data for each race

census_white = pd.read_excel('demographics/table02_3.xls')
census_black = pd.read_excel('demographics/table02_4.xls')
census_hispanic = pd.read_excel('demographics/table02_6.xls')
census_white


Unnamed: 0,Table with row headers in column A and column headers in rows 4 through 6.,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,"Table 2. Reported Voting and Registration, by...",,,,,,,,,,,,,,,,,
1,(In thousands),,,,,,,,,,,,,,,,,
2,Non-Hispanic White alone,,Total Population,US Citizen,,,,,,,,,,,,,Total Population,
3,,,,Total Citizen Population,Reported registered,,Reported not registered,,No response to registration 1,,Reported voted,,Reported did not vote,,No response to voting 2,,Reported registered,Reported voted
4,,,,,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Percent,Percent
5,BOTH SEXES,Total 18 years and over,157395,154450,114151,73.9,19210,12.4,21089,13.7,100849,65.3,33310,21.6,20290,13.1,72.5,64.1
6,,18 to 24 years,16008,15734,9290,59,3522,22.4,2922,18.6,7334,46.6,5560,35.3,2841,18.1,58,45.8
7,,25 to 44 years,47385,46106,32474,70.4,6964,15.1,6667,14.5,27652,60,12171,26.4,6283,13.6,68.5,58.4
8,,45 to 64 years,56530,55556,42552,76.6,5755,10.4,7249,13,38650,69.6,9860,17.7,7046,12.7,75.3,68.4
9,,65 to 74 years,21897,21660,17607,81.3,1614,7.5,2439,11.3,16320,75.3,2935,13.5,2406,11.1,80.4,74.5


In [18]:
#Load the marginal education + gender data for each race

census_edu_gender = pd.read_excel('demographics/table05_1.xls')
census_edu_gender


Unnamed: 0,Table with row headers in column A and column headers in rows 4 through 6.,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,"Table 5. Reported Voting and Registration, by...",,,,,,,,,,,,,,,,,
1,(In thousands),,,,,,,,,,,,,,,,,
2,18 years and over,,Total Population,US Citizen,,,,,,,,,,,,,Total Population,
3,,,,Total Citizen Population,Reported registered,,Reported not registered,,No response to registration 1,,Reported voted,,Reported did not vote,,No response to voting 2,,Reported registered,Reported voted
4,,,,,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Number,Percent,Percent,Percent
5,Both Sexes,Total,245502,224059,157596,70.3,32622,14.6,33841,15.1,137537,61.4,53860,24,32662,14.6,64.2,56
6,,Less than 9th grade,9761,5643,2389,42.3,2081,36.9,1173,20.8,1788,31.7,2732,48.4,1123,19.9,24.5,18.3
7,,"9th to 12th grade, no diploma",17727,14715,6906,46.9,4862,33,2947,20,5202,35.3,6746,45.8,2767,18.8,39,29.3
8,,High school graduate,71322,65518,40983,62.6,13520,20.6,11015,16.8,33774,51.5,21365,32.6,10379,15.8,57.5,47.4
9,,Some college or associate degree,69935,66809,48845,73.1,8198,12.3,9766,14.6,42296,63.3,15057,22.5,9456,14.2,69.8,60.5


In [19]:
#Calculate the frequencies for gender, race, and age

gender_freq = pd.Series(data=[census_age_gender.iloc[149, 2], census_age_gender.iloc[77, 2]],
                        index=['female', 'male'])

race_freq = pd.Series(data=[census_race.iloc[8, 2],
                       census_race.iloc[4, 2] - census_race.iloc[[8, 9, 11], 2].sum(),
                       census_race.iloc[11, 2],
                       census_race.iloc[9, 2]],
                      index=['white', 'other', 'hispanic', 'black'])

age_freq = pd.Series(data=[census_age_gender.iloc[6, 2],
                       census_age_gender.iloc[7, 2],
                       census_age_gender.iloc[8, 2],
                       census_age_gender.iloc[9, 2],
                       census_age_gender.iloc[10:13, 2].sum()],
                     index=['18 - 24', '25 - 34', '35 - 44', '45 - 54', '> 54'])

edu_freq = pd.Series(data=[census_edu_gender.iloc[6:10, 2].sum(),
                           census_edu_gender.iloc[10:12, 2].sum()],
                     index=['bachelors', 'no bachelors'])


In [20]:
#Group the data by race and gender

race_gender_freq = index1.groupby(['Race', 'Gender'])['ID'].count()

#Replace each marginal frequency cell with data taken from the Census Bureau data

race_gender_freq['white'] = census_white.iloc[[11, 17], 2]
race_gender_freq['black'] = census_black.iloc[[11, 17], 2]
race_gender_freq['hispanic'] = census_hispanic.iloc[[11, 17], 2]

#Add the marginal frequencies from the white, black, and hispanic populations together

non_other_gender = census_hispanic.iloc[[11, 17], 2] + census_black.iloc[[11, 17], 2] + census_white.iloc[[11, 17], 2]
non_other_gender.index = ['female', 'male']

#Find the marginal frequencies for race "other" by subtracting the "non-other" marginal frequencies from the gender totals

race_gender_freq['other'] = gender_freq - non_other_gender


In [21]:
#Group the sample data by age and gender

age_gender_freq = index1.groupby(['Age', 'Gender'])['ID'].count()

#Replace each marginal frequency cell with data taken from the Census Bureau data

age_gender_freq.loc['18 - 24'] = [census_age_gender.iloc[150, 2], census_age_gender.iloc[78, 2]]
age_gender_freq.loc['25 - 34'] = [census_age_gender.iloc[151, 2], census_age_gender.iloc[79, 2]]
age_gender_freq.loc['35 - 44'] = [census_age_gender.iloc[152, 2], census_age_gender.iloc[80, 2]]
age_gender_freq.loc['45 - 54'] = [census_age_gender.iloc[153, 2], census_age_gender.iloc[81, 2]]

#Combine the marginal frequencies for 55-64, 65-74, and >75 to form the >54 marginal frequencies

age_gender_freq.loc['> 54'] = [census_age_gender.iloc[154:157, 2].sum(),
                    census_age_gender.iloc[82:85, 2].sum()]


In [26]:
#Group the sample data by education and gender

edu_gender_freq = index1.groupby(['Education', 'Gender'])['ID'].count()

#Combine the marginal frequencies for "Less than 9th grade" to "Some college or associate degree" to form the "no bachelors" marginal frequencies

edu_gender_freq.loc['bachelors'] = [census_edu_gender.iloc[24:26, 2].sum(), census_edu_gender.iloc[17:19, 2].sum()]

#Combine the marginal frequencies for "Bachelor's degree" and "Advanced degree" to form the "bachelors" marginal frequencies

edu_gender_freq.loc['no bachelors'] = [census_edu_gender.iloc[20:24, 2].sum(), census_edu_gender.iloc[13:17, 2].sum()]

edu_gender_freq

Education     Gender
bachelors     female    40414
              male      36343
no bachelors  female    86599
              male      82145
Name: ID, dtype: int64

In [28]:
#Create a list of the marginal/joint frequencies obtained from the census data

aggregates = [gender_freq, race_freq, age_freq, age_gender_freq, race_gender_freq, edu_gender_freq]

#Create a list of column names that correspond to the frequency tables in the previous list

dimensions = [['Gender'], ['Race'], ['Age'], ['Age', 'Gender'], ['Race', 'Gender'], ['Education', 'Gender']]

#Perform raking on the demographic data up to the specified maximum number of iterations

IPF = ipfn.ipfn(freqs, aggregates, dimensions, max_iteration=5000)
df = IPF.iteration()

df.to_csv('python_raking_race_gender_age_edu.csv', 
          columns=['Race', 'Gender', 'Education', 'Age', 'Population (in thousands)'],
          index=False)
print(df)


ipfn converged
       Education  Gender      Race      Age         total
0   no bachelors  female     black  18 - 24   3587.700581
1      bachelors  female     black  25 - 34   1600.627005
2   no bachelors  female     black  25 - 34   4424.826545
3      bachelors  female     black  35 - 44   2227.374783
4   no bachelors  female     black  35 - 44   2052.476318
5      bachelors  female  hispanic  18 - 24    716.957276
6   no bachelors  female  hispanic  18 - 24   3963.961095
7      bachelors  female  hispanic  25 - 34   1473.744123
8   no bachelors  female  hispanic  25 - 34   2172.835885
9      bachelors  female  hispanic  35 - 44    820.323657
10  no bachelors  female  hispanic  35 - 44   3023.640012
11  no bachelors  female  hispanic     > 54   7204.565805
12     bachelors  female     other  18 - 24    523.504801
13  no bachelors  female     other  18 - 24    482.398030
14     bachelors  female     other  25 - 34    430.436875
15  no bachelors  female     other  25 - 34    396.638006

  return self._getitem_tuple(key)
  return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  if self.run_code(code, result):
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  self.obj = self.obj.loc[:, cols]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [29]:
crosstab = pd.crosstab(index=[df['Age'], df['Gender']], columns=df['Race'],
                       values=df['Population (in thousands)'], aggfunc='sum')
crosstab.to_csv('python_raking_crosstab.csv')