### Day 4
#### part A + B (general part)

In [1]:
import pandas as pd
import re

In [2]:
# name of the puzzle input file that the next cell opens
infile = "04a_input.txt"

In [3]:
# read the whole input file into one string
with open(infile) as handle:
    textraw = handle.read()

In [4]:
# peek at the first 100 characters of the raw input text
textraw[0:100]

'iyr:2015 cid:189 ecl:oth byr:1947 hcl:#6c4ab1 eyr:2026\nhgt:174cm\npid:526744288\n\npid:688706448 iyr:20'

In [5]:
# cut the raw text at empty lines (two consecutive newlines): each chunk is one
# passport (or invalid doc). The pattern contains a capturing group, so re.split
# also returns the captured "\n" between the chunks (visible in the output below).
textstructured = re.compile(r"(\n){2}").split(textraw)
textstructured[:10]

['iyr:2015 cid:189 ecl:oth byr:1947 hcl:#6c4ab1 eyr:2026\nhgt:174cm\npid:526744288',
 '\n',
 'pid:688706448 iyr:2017 hgt:162cm cid:174 ecl:grn byr:1943 hcl:#808e9e eyr:2025',
 '\n',
 'ecl:oth hcl:#733820 cid:124 pid:111220591\niyr:2019 eyr:2001\nbyr:1933 hgt:159in',
 '\n',
 'pid:812929897 hgt:159cm hcl:#fffffd byr:1942 iyr:2026 cid:291\necl:oth\neyr:2024',
 '\n',
 'cid:83 pid:524032739 iyr:2013 ecl:amb byr:1974\nhgt:191cm hcl:#ceb3a1 eyr:2028',
 '\n']

In [7]:
# Turn every chunk into a single-line record with fields separated by one space.
items = []
for chunk in textstructured:
    # split() without arguments breaks on any run of whitespace and ignores
    # leading/trailing whitespace, so newlines inside a chunk, stray leading or
    # trailing blanks and repeated spaces are all normalised in one step.
    record = " ".join(chunk.split())
    # The "\n" separator entries and any empty chunk (e.g. from a blank line at
    # the end of the file) collapse to "" and are dropped here; indexing such an
    # empty string with [-1] would raise IndexError.
    if record:
        items.append(record)

items[:5]

['iyr:2015 cid:189 ecl:oth byr:1947 hcl:#6c4ab1 eyr:2026 hgt:174cm pid:526744288',
 'pid:688706448 iyr:2017 hgt:162cm cid:174 ecl:grn byr:1943 hcl:#808e9e eyr:2025',
 'ecl:oth hcl:#733820 cid:124 pid:111220591 iyr:2019 eyr:2001 byr:1933 hgt:159in',
 'pid:812929897 hgt:159cm hcl:#fffffd byr:1942 iyr:2026 cid:291 ecl:oth eyr:2024',
 'cid:83 pid:524032739 iyr:2013 ecl:amb byr:1974 hgt:191cm hcl:#ceb3a1 eyr:2028']

#### part A

In [8]:
# field names every valid document must contain
ids_req = "byr iyr eyr hgt hcl ecl pid".split()
# field name that may be missing
ids_opt = "cid".split()

In [9]:
# check all document items
count_passport = 0  # valid documents that also contain every optional field (cid)
count_valid = 0     # documents containing every required field (passport or northpole card)

for item in items:
    # Look at the actual field names (text before the first ":" of each token)
    # rather than substring-searching the whole record; otherwise a field name
    # that merely occurs inside some value would count as present.
    keys = {token.split(":", 1)[0] for token in item.split()}
    if all(descr in keys for descr in ids_req):
        count_valid += 1
        if all(descr in keys for descr in ids_opt):
            count_passport += 1

In [10]:
# report both part A counters on one line
print(
    "passports:",
    count_passport,
    ", valid (passport or northpole card):",
    count_valid,
)

passports: 135 , valid (passport or northpole card): 264


#### part B

In [11]:
def itemtodict(it):
    """Convert a single item (string) into a dict of field name -> value.

    ``it`` is a record made of whitespace-separated ``key:value`` tokens.
    Tokens are split on any run of whitespace, so repeated, leading or
    trailing blanks do not produce empty tokens.

    Raises ValueError (from ``dict``) if a token contains no colon.
    """
    return dict(pairtotuple(token) for token in it.split())


def pairtotuple(string):
    """Split a ``key:value`` token into a ``(key, value)`` tuple.

    Only the first colon separates key from value, so a value that itself
    contains a colon stays intact.
    """
    return tuple(string.split(":", 1))


In [13]:
# turn every item string into its field dictionary
items_w_dicts = list(map(itemtodict, items))

items_w_dicts[0]

{'iyr': '2015',
 'cid': '189',
 'ecl': 'oth',
 'byr': '1947',
 'hcl': '#6c4ab1',
 'eyr': '2026',
 'hgt': '174cm',
 'pid': '526744288'}

In [14]:
# one row per document, one column per field name; absent fields become NaN
df = pd.DataFrame(data=items_w_dicts)
df

Unnamed: 0,iyr,cid,ecl,byr,hcl,eyr,hgt,pid
0,2015,189,oth,1947,#6c4ab1,2026,174cm,526744288
1,2017,174,grn,1943,#808e9e,2025,162cm,688706448
2,2019,124,oth,1933,#733820,2001,159in,111220591
3,2026,291,oth,1942,#fffffd,2024,159cm,812929897
4,2013,83,amb,1974,#ceb3a1,2028,191cm,524032739
...,...,...,...,...,...,...,...,...
290,2020,329,grn,1946,#a97842,2025,158cm,636649774
291,2020,97,blu,1951,#341e13,2023,161cm,461889565
292,2013,150,hzl,1980,#cfa07d,2029,168cm,492241189
293,2016,153,gry,1998,#733820,2024,150cm,401735295


In [15]:
# keep only the required columns, then discard every row that lacks one of them
df = df.loc[:, ids_req].dropna()
df.shape[0]  # number of remaining rows; this is also the answer to A

264

#### conditions 
```
byr (Birth Year) - four digits; at least 1920 and at most 2002.
iyr (Issue Year) - four digits; at least 2010 and at most 2020.
eyr (Expiration Year) - four digits; at least 2020 and at most 2030.
hgt (Height) - a number followed by either cm or in:
If cm, the number must be at least 150 and at most 193.
If in, the number must be at least 59 and at most 76.
hcl (Hair Color) - a # followed by exactly six characters 0-9 or a-f.
ecl (Eye Color) - exactly one of: amb blu brn gry grn hzl oth.
pid (Passport ID) - a nine-digit number, including leading zeroes.
cid (Country ID) - ignored, missing or not.
```

In [16]:
# checks for every individual category

def check_yr(string, start, end):
    """Return True if ``string`` is a four-digit year within [start, end].

    Used for byr, iyr and eyr. Both bounds are inclusive.

    Parameters:
        string: the field value to test.
        start: smallest accepted year.
        end: largest accepted year.
    """
    # [0-9] instead of \d: for str patterns \d also matches non-ASCII Unicode
    # digits, which int() would happily convert as well.
    if re.fullmatch(r"[0-9]{4}", string) is None:
        return False
    return start <= int(string) <= end
    
def check_hgt(string):
    """Return True if ``string`` is a valid height.

    Accepted forms are two digits followed by "in" (59 to 76 inclusive) or
    three digits followed by "cm" (150 to 193 inclusive).
    """
    # [0-9] instead of \d so that only ASCII digits are accepted; the groups
    # capture the number so it does not have to be sliced out by position.
    match = re.fullmatch(r"([0-9]{2})in|([0-9]{3})cm", string)
    if match is None:
        return False
    inches, centimetres = match.groups()
    if centimetres is not None:
        return 150 <= int(centimetres) <= 193
    return 59 <= int(inches) <= 76

def check_hcl(string):
    """Return True if ``string`` is "#" followed by exactly six characters 0-9 or a-f."""
    # [0-9a-f] instead of [\da-f]: \d would also accept non-ASCII Unicode digits.
    return re.fullmatch(r"#[0-9a-f]{6}", string) is not None
    
def check_ecl(string):
    """Return True if ``string`` is exactly one of the listed eye colour codes."""
    hit = re.fullmatch(r"amb|blu|brn|gry|grn|hzl|oth", string)
    return hit is not None
    
def check_pid(string):
    """Return True if ``string`` consists of exactly nine digits (leading zeroes allowed)."""
    # [0-9] instead of \d: \d would also accept non-ASCII Unicode digits.
    return re.fullmatch(r"[0-9]{9}", string) is not None



In [18]:
from functools import partial

In [19]:
# each required column paired with the function that validates its values
_column_checks = [
    ("byr", partial(check_yr, start=1920, end=2002)),
    ("iyr", partial(check_yr, start=2010, end=2020)),
    ("eyr", partial(check_yr, start=2020, end=2030)),
    ("hgt", check_hgt),
    ("hcl", check_hcl),
    ("ecl", check_ecl),
    ("pid", check_pid),
]

# parallel lists: ids_req[i] is validated by check_fcts[i]
ids_req = [column for column, _ in _column_checks]
check_fcts = [check for _, check in _column_checks]


In [27]:
# run every column's check and AND the results together:
# a row stays True only if it passed all tests
checked_all = None
for column, check in zip(ids_req, check_fcts):
    # boolean series, True in a row if this column's test was passed
    checked = df[column].apply(check)
    checked_all = checked if checked_all is None else checked_all & checked


In [29]:
# final result for (B): number of rows that passed every check
int(checked_all.sum())

224