In [None]:
# dict_dp = dp[0]
# dict_dp['age'] = age_teller(dict_dp['birthdate'])

In [1]:
# Import necessary modules
from datetime import date
from time import perf_counter
from datetime import datetime
from collections import namedtuple
import random
from collections import Counter
from faker import Faker
from decimal import Decimal
from numpy import mean

In [2]:
def age_teller(birthDate: 'date') -> int:
    '''
    The function takes birth date in date format i.e. date(2000, 11, 4) and returns the age in int format.
    Obviously, a crude approximation -- finding total number of days in between today and birth date and dividing
    it by 365.24. We can remove the int to return the age in float too.
    :param birthDate: in date format
    :return: int
    '''
    num_days = 365.24
    if isinstance(birthDate, date):
        age = int((date.today() - birthDate).days / num_days)
        return age
    else:
        return f'ValueError: Enter birthDate in date format, eg. date(2000, 11, 4)'

In [3]:
# Faker() has an in-built random generator and hence its just enough to generate profiles
# using an iterator. Here seed() function is used to generate repeatable instances of profiles.

Faker.seed(10)
myFaker = Faker()
sample_size = 10000
dp = [myFaker.profile() for i in range(sample_size)]

In [None]:
dp[0] # just for checking the structure / type of data, and before adding 'age' key

In [4]:
for item in dp:
    if 'age' not in item.keys():
        item['age'] = age_teller(item['birthdate'])

In [None]:
dp[0] # updates the earlier dp by adding (key, value) pair using (age, age_teller(birthDate))

In [5]:
# namedtuple operation -- Average Age
# Find Average age among profiles

Age = namedtuple('Age', 'age')
total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    ageList = [Age(item['age']) for item in dp]
    x = Age(*map(mean, zip(*ageList)))
    total += perf_counter() - start
aveAge_ntuple_time = total/num_ops
print(f'Average Age for a set of {sample_size} random profiles is {x.age}')
print(f'and the average execution time is {aveAge_ntuple_time} secs')

# Another method to get the average age using namedtuple's name operator, i.e. x.age
ave_age = mean(list(map(lambda x: x.age, ageList)))
print(f'Average Age by name operator is {ave_age}')

Average Age for a set of 10000 random profiles is 57.2926
and the average execution time is 0.009224206367005536 secs
Average Age by name operator is 57.2926


In [6]:
help(Age) # Before adding docstrings and the output of this cell is kept hidden due to large output content

Help on class Age in module __main__:

class Age(builtins.tuple)
 |  Age(age)
 |  
 |  Age(age,)
 |  
 |  Method resolution order:
 |      Age
 |      builtins.tuple
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getnewargs__(self)
 |      Return self as a plain tuple.  Used by copy and pickle.
 |  
 |  __repr__(self)
 |      Return a nicely formatted representation string
 |  
 |  _asdict(self)
 |      Return a new OrderedDict which maps field names to their values.
 |  
 |  _replace(_self, **kwds)
 |      Return a new Age object replacing specified fields with new values
 |  
 |  ----------------------------------------------------------------------
 |  Class methods defined here:
 |  
 |  _make(iterable) from builtins.type
 |      Make a new Age object from a sequence or iterable
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(_cls, age)
 |      Create new instance of Age(age,)
 |  


In [58]:
Age.__doc__ = '''\
Age is a namedtuple with a single field variable "age"'''

In [59]:
Age.age.__doc__ = '''\
"age" is a field variable for the namedtuple "Age", and "age" is not present in Faker(). 
It is added as a last (key, value) pair using age_teller function in this notebook'''

In [9]:
help(Age) # This help shows after adding docstrings to both the namedtuple and its field variables
# Again the output fot this cell is hidden due to large content.

Help on class Age in module __main__:

class Age(builtins.tuple)
 |  Age(age)
 |  
 |  Age is a namedtuple with a single field variable "age"
 |  
 |  Method resolution order:
 |      Age
 |      builtins.tuple
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getnewargs__(self)
 |      Return self as a plain tuple.  Used by copy and pickle.
 |  
 |  __repr__(self)
 |      Return a nicely formatted representation string
 |  
 |  _asdict(self)
 |      Return a new OrderedDict which maps field names to their values.
 |  
 |  _replace(_self, **kwds)
 |      Return a new Age object replacing specified fields with new values
 |  
 |  ----------------------------------------------------------------------
 |  Class methods defined here:
 |  
 |  _make(iterable) from builtins.type
 |      Make a new Age object from a sequence or iterable
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(_cls, age)
 

In [10]:
# namedtuple operation -- Oldest Person's Age
# find the maximum age among profiles

# Age = namedtuple('Age', 'age') # We can remove this -- already defined above

total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    ageList = [Age(item['age']) for item in dp]
    x = max(list(map(lambda x: x.age, ageList)))
    total += perf_counter() - start
maxAge_ntuple_time = total/num_ops
print(f'Oldest Person\'s\' age for a set of {sample_size} random profiles is {x}')
print(f'and the average execution time is {maxAge_ntuple_time} secs')

# Another method to get the oldest age using namedtuple and map
old_Person = Age(*map(max, zip(*ageList)))
print(f'Average Age by name operator is {old_Person.age}')

Oldest Person's' age for a set of 10000 random profiles is 116
and the average execution time is 0.006711062174115796 secs
Average Age by name operator is 116


In [11]:
# namedtuple operation -- Mean Current Location

# Actually this method of obtaining mean current location is flawed. Suppose we have a positive
# latitude of X, and another a latitude of negative X, mean location ends up as 0. The preferred
# method of caluclating mean current location in terms of latitude and longitude involves inclusion
# of trigonometrical functions

C_Location = namedtuple('C_Location', ('lat', 'long'))
total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    locList = [C_Location(*item['current_location']) for item in dp]
    x = C_Location(*map(mean, zip(*locList)))
    total += perf_counter() - start
curLoc_ntuple_time = total/num_ops
print(f'The mean of current location clusters for {sample_size} random profiles is {x}')
print(f'and the average execution time is {curLoc_ntuple_time} secs')

The mean of current location clusters for 10000 random profiles is C_Location(lat=Decimal('0.73033404675'), long=Decimal('0.4043659084'))
and the average execution time is 0.026524745655944572 secs


In [12]:
C_Location = namedtuple('C_Location', ('lat', 'long'))
C_Location.__doc__ = '''\
Named Tuple having 2 fields, first field for latitude and the second one for longitude'''

C_Location.lat.__doc__ = '''\
This field stores the latitude of the current location.'''

C_Location.long.__doc__ = '''\
The "long" field stores the longitude data of the current location'''

In [13]:
# namedtuple operation -- Largest Blood Type
# Slightly hard or I don't know how to use namedtuple to the best here

Bloodgroup = namedtuple('Bloodgroup', 'blood_group')
total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    bgList = [Bloodgroup(item['blood_group']) for item in dp]
    x = Counter(bgList).most_common(1)[0]
    ntime = Counter(bgList).most_common(1)[0][1]
    bg_x = x[0].blood_group
    total += perf_counter() - start
bgroup_ntuple_time = total/num_ops
print(f'The largest occurring blood_group type for {sample_size} random profiles is "{bg_x}"')
print(f'which occurred {ntime} times')
print(f'and the average execution time is {bgroup_ntuple_time} secs')

The largest occurring blood_group type for 10000 random profiles is "B-"
which occurred 1316 times
and the average execution time is 0.008271285743205226 secs


In [14]:
Bloodgroup.__doc__ = '''\
Named Tuple for storing the blood type data.'''

Bloodgroup.blood_group.__doc__ = '''\
The blood type is recorded as a string.'''

In [15]:
# Dictionary operation -- Average Age
# Find Average age among profiles

total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    x = mean([item['age'] for item in dp])
    total += perf_counter() - start
aveAge_dict_time = total/num_ops
print(f'Average Age for a set of {sample_size} random profiles is {x}')
print(f'and the average execution time is {aveAge_dict_time} secs')

Average Age for a set of 10000 random profiles is 57.2926
and the average execution time is 0.001009060434953426 secs


In [16]:
# Dictionary operation -- Oldest Person's Age

# The application of dictionary in calculating oldest person's age seems not justifiable
# as most of the time consuming operations are done in extracting maximum out a list that is
# obtained from already existing dictionary.

total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    x = max([item['age'] for item in dp])
    total += perf_counter() - start
maxAge_dict_time = total/num_ops
print(f'Oldest Person\'s\' age for a set of {sample_size} random profiles is {x}')
print(f'and the average execution time is {maxAge_dict_time} secs')

Oldest Person's' age for a set of 10000 random profiles is 116
and the average execution time is 0.0005958481500347261 secs


In [17]:
# Dictionary operation -- Oldest Person's Age

# The application of dictionary in calculating oldest person's age seems not justifiable
# as most of the time consuming operations are done in extracting maximum out a list that is
# obtained from already existing dictionary.

total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    max_age = 0
    for item in dp:
        if item['age'] > max_age:
            max_age = item['age']
    x = max_age
    total += perf_counter() - start
maxAge_dict_time2 = total/num_ops
print(f'Oldest Person\'s\' age for a set of {sample_size} random profiles is {x}')
print(f'and the average execution time is {maxAge_dict_time2} secs')

Oldest Person's' age for a set of 10000 random profiles is 116
and the average execution time is 0.0008164763520690031 secs


In [18]:
loc_dict = []
for item in dp:
    loc_dict.append(item['current_location'])  

In [19]:
list(map(mean, zip(*loc_dict)))

[Decimal('0.73033404675'), Decimal('0.4043659084')]

In [20]:
# Dictionary operation -- Mean Current Location

total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    x = C_Location(*map(mean, zip(*locList)))
    total += perf_counter() - start
curLoc_dict_time = total/num_ops
print(f'The mean of current location clusters for {sample_size} random profiles is {x}')
print(f'and the average execution time is {curLoc_dict_time} secs')

The mean of current location clusters for 10000 random profiles is C_Location(lat=Decimal('0.73033404675'), long=Decimal('0.4043659084'))
and the average execution time is 0.019658097313062173 secs


In [21]:
# Dictionary operation -- Largest Blood Type

total = 0
num_ops = 1000
for i in range(num_ops):
    start = perf_counter()
    bgDict = dict()
    for item in dp:
        key = item.get('blood_group')
        if key not in bgDict:
            bgDict[key] = 1
        else:
            bgDict[key] += 1
    rev_dict = {v:k for k,v in bgDict.items()}
    x = rev_dict[max(rev_dict)]
    ntime = max(rev_dict)
    total += perf_counter() - start
bgroup_dict_time = total/num_ops
print(f'The largest occurring blood_group type for {sample_size} random profiles is "{x}"')
print(f'which occurred {ntime} times')
print(f'and the average execution time is {bgroup_dict_time} secs')

The largest occurring blood_group type for 10000 random profiles is "B-"
which occurred 1316 times
and the average execution time is 0.002723633549132501 secs


In [22]:
# Test case 1 -- Checks if date entered into the age_teller(birthDate) function is valid
# first checks for age calculation using year 2000 with same month as now should give 21
# second checks for the output to see if we received None
# third loops over the profile, fetches the birthdate, and checks if the instance is a date type

def test_date_entry():
    assert age_teller(date(2000, 7, 4)) == 21
    assert age_teller(date(2000, 7, 4)) != None
    for item in dp:
        assert isinstance(item['birthdate'], date)

test_date_entry()

In [30]:
# Test case 2 -- check if Age is a namedtuple; likewise for C_Location and Bloodgroup
# Tests if the C_Location contains two fields, latitude and longitude

def test_namedtuple_entry():
    assert (isinstance(Age, tuple) and isinstance(getattr(Age, '__dict__', None), collections.Mapping) and
           getattr(Age, '_fields', None))
    assert (isinstance(Age, tuple) and hasattr(Age, '_asdict') and hasattr(Age, '_fields'))
    assert isinstance(C_Location, namedtuple)
    assert isinstance(Bloodgroup, namedtuple)
    assert len(C_Location._fields) == 2

test_namedtuple_entry()

AssertionError: 

In [39]:
# Test case 3 -- Test if namedtuple operations faster than dictionary operations

def test_is_namedtuple_faster():
    if (aveAge_ntuple_time > aveAge_dict_time and
            maxAge_ntuple_time > maxAge_dict_time and
            curLoc_ntuple_time > curLoc_dict_time and
            bgroup_ntuple_time > bgroup_dict_time
           ):
        print(f'Dictionaries seems faster than namedtuple operations')

test_is_namedtuple_faster()

Dictionaries seems faster than namedtuple operations


In [60]:
# Test case 4 -- Check if docstrings for the namedtuples are matching what we wrote
# This seems funny but I couldnt finish the assignment on time.

def test_docstring_for_namedtuples():
    assert (Age.__doc__ == 'Age is a namedtuple with a single field variable "age"')

test_docstring_for_namedtuples()