In [48]:
import re
import json
import PyPDF2

# Parallel accumulators: entry i of column_names belongs with entry i of
# column_indices. Both start empty and are filled while the PDF is scanned.
column_names, column_indices = [], []

# Half-open range of zero-based page indices to scan in the PDF.
PAGE_START, PAGE_END = 21, 150

# Input files: the PDF to scan and the JSON file listing the variable names.
PDF_PATH = './data/2015/cpsjul15.pdf'
JSON_PATH = './data/2015/cspjul15variables.json'

# The variable names are the keys of the top-level "variables" object.
with open(JSON_PATH, 'r') as file:
    payload = json.load(file)
    variable_names = payload['variables'].keys()

# Drop the keys that should not be searched for in the PDF.
excluded_names = {"for", "in", "ucgid"}
variables = list(set(variable_names) - excluded_names)
print("The variables are:  ", variables)

# Scan the selected PDF pages for lines that name exactly one known variable
# and end with a "<start> - <end>" location, recording the variable name in
# column_names and the two location strings in column_indices.

# Location at the very end of a line, e.g. "31 - 32" or "104-104".
# Compiled once here instead of once per page.
location_pattern = re.compile(r'\d{1,4}[ ]?-[ ]?\d{1,4}$')

# Per-variable patterns, built once instead of for every line:
#   * name pattern: the variable followed by a space (some variables are
#     prefixes of other variables) and NOT preceded by a word character
#     (so a variable that is the tail of a longer token does not match);
#   * description pattern: the variable followed by "=", which marks a
#     description line rather than a column definition.
variable_patterns = [
    (
        variable,
        re.compile(rf'(?<!\w){re.escape(variable)} '),
        re.compile(rf'{re.escape(variable)}\s*='),
    )
    for variable in variables
]

with open(PDF_PATH, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)

    for page_number in range(PAGE_START, PAGE_END):
        page_text = pdf_reader.pages[page_number].extract_text()

        for line in page_text.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Only lines ending with a location can define a column; checking
            # this first avoids testing every variable against other lines.
            location_match = location_pattern.search(line)
            if location_match is None:
                continue

            # Variables named on this line, ignoring "NAME = ..." descriptions.
            matched_variables = [
                variable
                for variable, name_pattern, description_pattern in variable_patterns
                if name_pattern.search(line) and not description_pattern.search(line)
            ]

            if len(matched_variables) > 1:
                # Ambiguous line: report it (listing exactly the variables
                # that were counted) and record nothing.
                print("More than one variable in line: " + line + " " + str(matched_variables))
            elif matched_variables:
                column_names.append(matched_variables[0])
                # "31 - 32" -> ['31', '32']
                column_indices.append(
                    [part.strip() for part in location_match.group().split('-')]
                )

The variables are:   ['HUBUSL4', 'PEDISDRS', 'HEFAMINC', 'PTIO2OCD', 'PRDTHSP', 'HEBUNSV2', 'HETENURE', 'PTDTRACE', 'PEAFWHN1', 'PXABSRSN', 'PTERNWA', 'PULAYCK1', 'PRFAMTYP', 'PRINUYER', 'PULKDK6', 'PXIO1OCD', 'PTERNH1C', 'HEOUTMOB', 'GTCBSAST', 'PXDWAVR', 'PRDTCOW2', 'HEPSCON6', 'PXERNH2', 'PXMJNUM', 'PRHRUSL', 'PESPOUSE', 'PRPTHRS', 'PRIOELG', 'PELAYLK', 'HEBUNDLE', 'PULAY', 'HENOOU7', 'PEVIDEO', 'PXLKLL1O', 'PEECOMME', 'PTHR', 'PXLAYLK', 'HENOOU2', 'PREMPHRS', 'PXDWLKO', 'HECYBA', 'PERRP', 'PEHRWANT', 'HENOHM3', 'PEIO2COW', 'PULKDK5', 'PELNMOM', 'PRMJOCC2', 'PEMARITL', 'HUBUSL1', 'GTCSA', 'PECOHAB', 'PTERNH2', 'PETVBOX', 'PEEDUCA', 'HEEVRHOM', 'HENOOU10', 'PXIO1COW', 'PXDISDRS', 'HELAPTOP', 'PEEDTRAI', 'HEPRINOH', 'PUBUSCK2', 'HEPSCON4', 'PESEX', 'PUJHCK5', 'HENOHM2', 'PUHRCK12', 'PUHRCK7', 'HENOOU3', 'HEOUTCK', 'PUBUS1', 'PXLKM1', 'HXTELAVL', 'PULKM4', 'PEHRACT2', 'HEBUNSV1', 'PUDWCK3', 'PRCIVLF', 'PUHROT1', 'PWPRMWGT', 'PUSLFPRX', 'PRWKSTAT', 'PXDISPHY', 'PEDWAVL', 'PELAPTOP', 'PU

In [49]:
# Pair each collected column name with its list of location strings; if a
# name was collected more than once, the last occurrence wins.
column_index_mapping = {
    name: location for name, location in zip(column_names, column_indices)
}
# Bare expression so the notebook displays the mapping.
column_index_mapping


{'HEHOUSUT': ['31', '32'],
 'HETELHHD': ['33', '34'],
 'HETELAVL': ['35', '36'],
 'HEPHONEO': ['37', '38'],
 'HUTYPEA': ['41', '42'],
 'HUTYPB': ['43', '44'],
 'HUTYPC': ['45', '46'],
 'HWHHWGT': ['47', '56'],
 'HRINTSTA': ['57', '58'],
 'HRHTYPE': ['61', '62'],
 'HRMIS': ['63', '64'],
 'HUPRSCNT': ['67', '68'],
 'HRLONGLK': ['69', '70'],
 'HRHHID2': ['71', '75'],
 'HUBUS': ['79', '80'],
 'HUBUSL2': ['83', '84'],
 'HUBUSL3': ['85', '86'],
 'HUBUSL4': ['87', '88'],
 'GESTFIPS': ['93', '94'],
 'GTCO': ['101', '103'],
 'GTCBSAST': ['104', '104'],
 'GTMETSTA': ['105', '105'],
 'GTINDVPC': ['106', '106'],
 'GTCBSASZ': ['107', '107'],
 'PERRP': ['118', '119'],
 'PEPARENT': ['120', '121'],
 'PRTAGE': ['122', '123'],
 'PRTFAGE': ['124', '124'],
 'PEMARITL': ['125', '126'],
 'PESPOUSE': ['127', '128'],
 'PEAFEVER': ['131', '132'],
 'PEAFNOW': ['135', '136'],
 'PRDTHSP': ['141', '142'],
 'PRFAMREL': ['153', '154'],
 'PRFAMTYP': ['155', '156'],
 'PEHSPNON': ['157', '158'],
 'PEFNTVTY': ['169', '1