In [46]:
import numpy as np
import pandas as pd
import datetime


## I slutändan blir det här en tuple med ints som mappas till olika beskrivande namn
# När fine-tuning av fördelningar kan anses vara färdig går det att formulera en mycket enklare och kortare funktion
# för att med sannolikhetsfördelning generera ett nästintill identiskt resultat 

# namn (och email) behöver 'tokenifieras', annars är all annan data nominaldata (i vissa fall ordinal) med små integers

def gen_gender():
  "2% probability for non-binary"
  if np.random.uniform() < 0.98:
    return np.random.choice([1, 2], p=[0.5, 0.5])
  else:
    return 3

def gen_age():
  "Normal distribution, mean of 42 and a standard deviation of 20"
  age = int(np.random.normal(loc=42, scale=20))
  while age <= 15 or age >= 100:
    age = int(np.random.normal(loc=42, scale=20))
  return age
  # There is probably a way to avoid having to write this twice while still using normal distribution, but it works for now
#TODO: Check correlation with age for civilstånd, utbildningsnivå etc.

def gen_civilstånd(age):
  if age <= 25: np.random.choice([1, 5, 6], p=[0.01, 0.98, 0.01])
  if age >= 50: np.random.choice([1, 2, 3, 4, 5, 6,], p=[0.5, 0.05, 0.15, 0.15, 0.10, 0.05])
  return np.random.choice([1, 2, 3, 4, 5, 6], p=[0.35, 0.02, 0.09, 0.04, 0.47, 0.03])
  # TODO: age should clearly be a used more thoughtful

def gen_utbildningsnivå(age):
  if age <= 17: return 1 # Grundskolenivå garanterad för age <= 16
  elif age in range(18,19): return np.random.randint(1, high=3) # Slumpar studentexamen för 18-19 år
  elif age >= 19: return np.random.choice([1, 2, 3, 4, 5, 6], p=[0.01, 0.01, 0.26, 0.21, 0.270, 0.24])
  # allt annat random TODO: implement probability for each age

def gen_sysselsättning(age):
  if age >= 66: return 5 # Garanterar pension över 66
  elif age >= 60: return np.random.choice([1, 4, 5, 6]) # Möjlig pension över 60, ej möjliga studier
  elif age <= 17: return 2 # Garanterar studier under 17
  elif age >= 18 or age <= 30: return np.random.choice([1, 2, 3, 4], p=[0.25, 0.65, 0.05, 0.05]) #Högre sannolikhet att vara student
  return np.random.choice([1, 2, 3, 4, 5, 6], p=[0.7, 0.1, 0.088, 0.086, 0.0, 0.02]) # Speglar arbetslöshet och sjukskrivning i sverige 2022

def gen_boende():
  return np.random.choice([1, 2, 3, 4, 5], p=[0.25, 0.25, 0.49, 0.005, 0.005])
  # VERY ROUGH estimate, probably not even remotely close to actual distribution
  # TODO: add parameters for probability from age and other stuff

def gen_bor_med(age, civilstånd):
  if age <= 17: return 4 # Garanterar boende med förälder under 17
  if age >= 31 or age <= 50: return np.random.choice([1, 2, 3, 4, 5, 6], p=[0.2, 0.1, 0.2, 0.002, 0.002, 0.496]) # ökad sannolikhet att bo med familj i åldersspann 31-50
  if civilstånd == 2 or 3: return np.random.choice([1, 3, 5], p=[0.98, 0.01, 0.01])
  return np.random.choice([1, 2, 3, 5, 6], p=[0.362, 0.4, 0.02, 0.1, 0.118]) # ensamboende speglar scb 2019, resten lekmannaestimering

## ------- Health  ------- ##

def gen_vardagstillfredsställelse():
  "Normal distribution with mean 3 and std 1.6"
  val = int(np.random.normal(loc=3, scale=1.6))
  while val < 1 or val > 5: # prevent values out of range
    val = int(np.random.normal(loc=3, scale=1.6))
  return val
  #TODO: find probability from other parameters such as living situation and age

def gen_health():
  return np.random.choice([1,2,3,4,5], p=[0.05,0.2,0.5,0.2,0.05]) # lazy normal distribution

#TODO:
## ------ Daily/weekly time ------ #

# Arbete	Skötsel	Lek	Rekreation	Sömn
# Time should add up 1440 (24 hours)

# Tid_ensam	Tid_familj	Tid_vänner	Tid_övriga
# time does not have to add up to anything but sum cannot be > 1440

## ------- Name & Contact ------ ##

def gen_name():
  # TODO: conformist gender specific names + first, last name commonly matched

  # Just a test set-up. This wouldn't be an issue as is, but it doesn't take cultural matching first and last names into calculations and is currently limited to some regional variations
  first_bucket = [m_namelist_sweden_gpt, f_namelist_sweden_gpt, m_namelist_scandinavic_gpt, m_namelist_scandinavic_gpt, m_namelist_slavic_gpt, f_namelist_scandinavic_gpt, f_norway_list]
  first_name = first_bucket[np.random.randint(0,len(first_bucket))]
  first_name = first_name[np.random.randint(0,len(first_name))]

  last_bucket = [last_name_swe, last_name_gpt_eur0, last_name_gpt_eur, last_name_gpt_eur2, last_name_gpt_asia, last_name_gpt_mena]
  last_name = last_bucket[np.random.randint(0,len(last_bucket))]
  last_name = last_name[np.random.randint(0,len(last_name))]
  return first_name + ' ' + last_name

def gen_email(name, age):
  
  # 5% to spell name incorrectly in email (boomer factor)
  if age > 59 and np.random.choice([0, 1], p=[0.85, 0.15]):
    spelling_mistake(name)

  # add birth year to end of mail
  birth_year = datetime.date.today().year - age

  domän = np.random.choice(['gmail.com', 'hotmail.com',
                            'live.se', 'live.com', 'outlook.com',
                            'yahoo.se', 'icloud.com'],
                            p=[0.55, 0.05, 0.05, 0.05, 0.19, 0.01, 0.1])

  # 50% to have older domain if age > 60
  if age > 60 and np.random.randint(2) == 1:
      if np.random.randint(2) ==1: suffix = str(birth_year)[-2:] # 50% to remove first two digits from birth year
      domän = np.random.choice(['telia.se', 'spray.se', 'glocalnet.se', 'bredband.net', 'regeringen.se'],
      p=[0.30, 0.23, 0.23, 0.23, 0.01])

  # 90% to remove century from birth year if age < 60
  elif age < 60:
      if np.random.choice([0,1], p=[0.1, 0.9]) == 1: suffix = str(birth_year)[-2:]

  # 50% to remove entire birth year 25% to add a random int instead
  suffix_modifier = np.random.choice([0,1,3], p=[0.5, 0.25, 0.25])

  if suffix_modifier == 0: suffix = ''
  elif suffix_modifier == 1: suffix = np.random.randint(1,9)
  else: suffix = str(birth_year)[-2:]

	# 50% to have a first name "nickname", then 20% to only have first letter and .
  if np.random.choice([0,1], p=[0.5,0.5]) == 1:
    # TODO: there should be some mapping for this to determine likely ways to shorten names

    first, last = name.split()[0], name.split()[1]
    # If length > 7, only use first 3 letters
    if len(first) > 7:
      first = first[:3:]
    # if length > 6 only use first 4 letters 
    if len(first) > 6:
      first = first[:4:] 

    # 20% to have only first letter in first name
    if np.random.choice([0,1], p=[0.8, 0.2]) == 1:
      first, last = name.split()[0], name.split()[1]
      first = first[:1:] + np.random.choice(['_','.'])
    
    name = first + last



  # replace blank space and make lowercase
  x = np.random.choice(['_','__','-','.',''],p=[0.1,0.1,0.2,0.1,0.5])
  name = name.lower().replace(' ',x )

  # 10% to add other characters to email
  if np.random.choice([0,1], p=[0.9, 0.1]) == 1:
    prefix = '_'
  else:
    prefix = ''
  

  ################### Replace name entirely #####################

  # 5% to have a millenial email if millenial
  if birth_year in range(1989,1998) and np.random.choice([0,1], p=[0.95, 0.05]) == 1:
    return millenial_mail(birth_year)

  # 0.5% to have a completely random and anonymous email
  if np.random.choice([0,1], p=[0.995, 0.005]) == 1:
    return anonymize_mail1(is_apple=False) + '@' + domän

  # 1% to have an email hidden by apple services
  if np.random.choice([0,1], p=[0.99, 0.01]) == 1:
    return anonymize_mail1(is_apple=True)

  # 0.5% to have an "asdf" (when users just type "randomly") email:
  if np.random.choice([0,1], p=[0.995, 0.005]) == 1:
    return asdf_mail() + '@' + domän

  ################################################################
      
  return str(prefix) + str(name)+ str(suffix) + '@' + domän


def gen_phone():
  # challenge# 1: needs to be unique
  # for now it will have a probability of 2.474631929433396e-07 to be duplicare
  a = str(np.random.choice([70, 72, 73, 76, 79]))
  b = np.random.randint(100, high=999)
  c = np.random.randint(100, high=999)

  def add_zero(n):
    if n < 10: n = '00' + str(n)
    elif n < 100: n = '0' + str(n)
    return str(n)

  phone = str(0) + a + str(np.random.randint(0,9)) + add_zero(b) + add_zero(c)
  return phone


## --- Misc stuff for mail --- ##

def anonymize_mail1(is_apple):
  length = np.random.randint(8, 12)
  chars = np.array(list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'))
  if is_apple == True: return ''.join(np.random.choice(chars, size=10)) + '@privaterelay.appleid.com'
  return ''.join(np.random.choice(chars, size=length))

def asdf_mail():
	randy = np.random.choice([0,1,2])
	n = np.random.randint(1,9)

	if randy == 0: # The asdf:er
		chars = ['asdf','ddasff','assd','fffasas', 'asdasdf', 'asdasdasd']
		return (np.random.choice(chars)[-n:] + (np.random.choice(chars)[-n:]))

	elif randy == 1: # The "aaaaaa":er
		a = np.random.choice(['a','w','e','q'])
		return np.random.choice([a * n, a * (n+n)])

	else: # The obnoxious
		return np.random.choice(['bajs', 'jajajaja', 'dinmamma', 'kkk'])

def spelling_mistake(name):
    # Shuffles last and next to last letter of first name
    s = name.split()[0]
    i = len(s)-1
    j = len(s)-2
    s_list = list(s)
    s_list[i], s_list[j] = s_list[j], s_list[i]
    return(''.join(s_list) + name.split()[1])

def millenial_mail(birth_year):
	deco = np.random.choice(['xX_', '_', '', ''])
	word_list1 = ['star', 'cool', 'dark', 'moon', 'knight', 'hunter']
	word_list2 = ['boy', 'girl', 'dude', 'killer', 'fire', 'master']
	separator = ['_', '', '', '-']
	domain_mil = '@' + np.random.choice(['gmail.com','live.com','live.se'])	
	stupid_mail = str(deco) + word_list1[np.random.randint(len(word_list1))] + separator[np.random.randint(len(separator))] + word_list2[np.random.randint(len(word_list2))] + str(deco[::-1])
	if stupid_mail[-1::] == 'x':
	  return stupid_mail + domain_mil
	return stupid_mail + str(birth_year)[-2:] + domain_mil


name = gen_name()
age = gen_age()
civilstånd = gen_civilstånd(age)
print(name, age)
print(gen_email(name, age))
print(gen_phone())
print(gen_boende())
print(gen_bor_med(age, civilstånd))

Alexander Fredriksson 22
alexanderfredriksson@gmail.com
0797384794
3
6


In [2]:
# This should be zipped/compressed in other format, as of now it works but it's messy

m_namelist_sweden_gpt = ["Lars", "Anders", "Johan", "Karl", "Per", "Mikael", "Jan", "Henrik", "Björn", "Erik", "Ingvar", "Hans", "Gustaf", "Nils", "Sven", "Olof", "Tomas", "Bo", "Urban", "Stig", "Lennart", "Ruben", "Kjell", "Emil", "Oskar", "Rolf", "Fredrik", "Magnus", "Jakob", "Petter", "Andreas", "Daniel", "Pelle", "Mathias", "Jonathan", "Viktor", "Isak", "David", "Anton", "Emilio", "Joel", "Adam", "Robin", "Leif", "Max", "Filip", "Alexander", "Martin", "William", "Göran", "Carl", "Mats", "Samuel", "Peder", "Thomas", "John", "Staffan", "Markus", "Gustav", "Bengt", "Jens", "Richard", "Jim", "Mattias", "Patrik", "Jörgen", "Olivia", "Stefan", "Christoffer", "Jimmie", "Marcus", "Elias", "Maurice", "Ulf", "Christian", "Håkan", "Marcus", "Gunnar", "Bertil", "Tobias", "Ingemar", "Mikael", "Linus", "Dennis", "Alfred", "Hans", "Sebastian", "Arne", "Jesper", "Janne", "Christer"]
f_namelist_sweden_gpt = ["Maria", "Anna", "Margareta", "Elisabeth", "Eva", "Birgitta", "Ingrid", "Karin", "Kristina", "Lena", "Marianne", "Inger", "Greta", "Annika", "Ulla", "Gunilla", "Lilian", "Astrid", "Sofia", "Agneta", "Yvonne", "Monica", "Elvira", "Ingegerd", "Bodil", "Annie", "Linn", "Ester", "Carina", "Sara", "Helena", "Emma", "Therese", "Lise-Lotte", "Kicki", "Lillemor", "Lottie", "Henny", "Lena", "Maud", "Jenny", "Lissie", "Livia", "Loulou", "Luna", "Linda", "Anita", "Lotta", "Louise", "Gunvor", "Linnea", "Inga", "Irene", "Stina", "Liv", "Mimmi", "Madeleine", "Maj-Britt", "Anette", "Mona", "Pia", "Inger", "Fanny", "Gudrun", "Barbro", "Ann-Sofie", "Charlotte", "Hanna", "Ida", "Rut", "Anna-Lena", "Ellen", "Cecilia", "Ebba", "Anita", "Linda", "Maj", "Frida", "Tova", "Viktoria", "Camilla", "Emilia", "Marta", "Nina", "Märta", "Maja", "Amanda", "Rebecca", "Julia", "Johanna"]
f_norway_list = ['Nora', 'Norah', 'Emma', 'Ella', 'Maja', 'Maia', 'Maya', 'Olivia', 'Emilie', 'Sofie', 'Sophie', 'Leah', 'Sofia', 'Sophia', 'Ingrid', 'Frida', 'Sara', 'Sarah', 'Zara', 'Tiril', 'Selma', 'Ada', 'Hedda', 'Amalie', 'Anna', 'Alma', 'Hanna', 'Eva', 'Mia', 'Thea', 'Live', 'Ida', 'Astrid', 'Ellinor', 'ilde', 'Linnea', 'Iben', 'Aurora', 'Mathilde', 'Mathilde', 'Saga', 'Solveig', 'Marie', 'Eline', 'Oline', 'Maria']

m_namelist_scandinavic_gpt = ["Oliver", "Noah", "William", "Emil", "Liam", "Elias", "Oskar", "Axel", "Lucas", "Filip",
"Erik", "Isak", "Mikael", "Jakob", "Leo", "Simon", "Theodor", "Viktor", "Sebastian", "Alexander",
"Hampus", "Hugo", "Adrian", "Max", "David", "Fredrik", "Linus", "Edward", "Olle", "Arvid",
"Carl", "Henrik", "Kevin", "Elvin", "Nils", "Joel", "Magnus", "Daniel", "Mathias", "Anton",
"Linus", "Andreas", "Sondre", "Gustav", "Niklas", "Hannes", "Emilio", "Johan", "August", "Aron",
"Vemund", "Jens", "Markus", "Björn", "Johannes", "Viggo", "Ludvig", "Rasmus", "Benjamin", "Isac",
"Tomas", "Mats", "Eirik", "Olav", "Peder", "Kai", "Sander", "Anders", "Stian", "Even",
"Eirik", "Sverre", "Jakob", "Johan", "Tommy", "Truls", "Espen", "Marius", "Krister", "Geir"]

f_namelist_scandinavic_gpt = ["Emma", "Sofia", "Ella", "Maja", "Aurora", "Linn", "Nora", "Ida", "Molly", "Hanna",
"Lena", "Alma", "Violet", "Ingrid", "Frida", "Emilia", "Leah", "Livia", "Signe", "Linnea",
"Amanda", "Evy", "Tuva", "Thea", "Sara", "Camilla", "Julia", "Elise", "Amalie", "Eline",
"Ada", "Marie", "Emma", "Sofie", "Johanna", "Mia", "Ester", "Isabelle", "Martine", "Inger",
"Karoline", "Maria", "Ragnhild", "Synne", "Silje", "Annie", "Ingrid", "Tiril", "Eli", "Iben",
"Marte", "Andrea", "Tonje", "Rikke", "Anette", "Lise", "Elina", "Ellen", "Stine", "Ingvild",
"Helene", "Karen", "Guro", "Sandra", "Marianne", "Maren", "Turid", "Lene", "Bente", "Anne",
"Ingrid", "Bodil", "Grete", "Rita", "Astrid", "Berit", "Kari", "Gro", "Lillian", "Unni"]

m_namelist_slavic_gpt = ["Ivan", "Vladimir", "Sergei", "Alexander", "Dmitri", "Maxim", "Nikolai", "Andrei", "Viktor", "Pavel",
"Yuri", "Mikhail", "Boris", "Stanislav", "Oleg", "Evgeny", "Anton", "Konstantin", "Aleksandr", "Semyon"]

f_namelist_slavic_gpt = ["Anna", "Olga", "Maria", "Elena", "Natalia", "Tatiana", "Svetlana", "Irina", "Nadezhda", "Larisa",
"Yelena", "Lyubov", "Alla", "Veronika", "Irina", "Galina", "Ksenia", "Ekaterina", "Alina", "Anastasia"]


last_name_swe = ['Svensson', 'Andersson', 'Gustavsson', 'Gjertsson', 'Nilsson', 'Knutsson', 'Fredriksson', 'Lundgren', 'Fransson', 'Lundgren', 'Lindstedt', 'Mattsson', 'Ohlsson', 'Bengtsson', 'Lundberg', 'Lindberg']

last_name_gpt_eur0 =['Oliveira', 'Sandberg', 'Martin', 'Sousa', 'Garrido', 'Fonseca', 'Bjork', 'Gouveia', 'Pérez', 'Navarro', 'Silva', 'Pascual', 'Matos', 'Serrano', 'Ferreira', 'Jimenez', 'Lopez', 'Blanco', 'Romero', 'Edwards', 'Gonzalez', 'Santana', 'Coelho', 'Barros', 'Vieira', 'Larsson', 'Gomes', 'Castillo', 'Lundberg', 'Karlsson', 'Rodriguez', 'Berglund', 'Olsson', 'Cuesta', 'Diaz', 'Alonso', 'Andersson', 'Gomez', 'Bergstrom', 'Bengtsson', 'del Rio', 'Jonsson', 'Sanchez', 'Vazquez', 'Ribeiro', 'Jansson', 'Hidalgo', 'Castro', 'Petersson', 'Hansson', 'Carlsson', 'Vidal', 'Morales', 'Rivera', 'Santos', 'Faria', 'Gimenez', 'Sjogren', 'Perez', 'Andrade', 'Pacheco', 'Jakobsson', 'Berggren', 'Forsberg', 'Mendes', 'Ruiz', 'Jesus', 'Cardoso', 'Rodríguez', 'Eklund', 'Nunes', 'Ramalho', 'Persson', 'Eriksson', 'Gustafsson', 'Wallin', 'Soderberg', 'Molina', 'Garcia', 'Moreno', 'Alves', 'Rodrigues', 'Rocha', 'Barbosa', 'Alvarez', 'Martinez', 'Johansson', 'Svensson', 'Moura', 'Monteiro', 'Fernandez', 'Lopes', 'Pettersson', 'Suarez', 'Ortega', 'Costa', 'Pereira', 'Dominguez', 'Ramirez', 'Gómez', 'Teixeira', 'Ramos', 'Reyes', 'Lundqvist', 'Sundberg', 'Carvalho', 'Cabrera', 'Fernández', 'Martins', 'Torres', 'Pinto', 'Nilsson', 'Almeida', 'Lima', 'Neves', 'Hernandez', 'Vera', 'Marques']

last_name_gpt_eur = ["Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson", "Moore", "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin", "Thompson", "Garcia", "Martinez", "Robinson"] # This doesn't seem very scandinavian
last_name_gpt_eur2 = ["Schmidt", "Schneider", "Fischer", "Müller", "Weber", "Schulz", "Wagner", "Becker", "Hoffmann", "Schäfer", "Koch", "Bauer", "Richter", "Klein", "Wolf", "Schröder", "Neumann", "Schwarz", "Braun", "Hofmann"] # This is also very weirdly distributed for a 'european name' prompt
last_name_gpt_asia = ["Kim", "Lee", "Park", "Wong", "Choi", "Li", "Zhang", "Lin", "Wu", "Chen", "Yuan", "Zhao", "Yang", "Huang", "Liu", "Guo", "Wang", "Sun", "Xu", "Gao"]
last_name_gpt_mena = ["Al-Saud", "Al-Shaikh", "Al-Maktoum", "Al-Sabah", "Al-Thani", "Bin Laden", "Al-Nasser", "Al-Hussein", "Al-Assad", "Al-Khalifa", "Al-Mahmoud", "Al-Hajri", "Al-Dhabi", "Al-Otaibi", "Al-Rumaihi", "Al-Ghanim", "Al-Mulla", "Al-Hariri", "Al-Mutairi", "Al-Sayed", "Hussein", "Mohammed", "Ahmed", "Ali", "Mahmoud", "Omar", "Abdullah", "Youssef", "Sayed", "Ibrahim", "Khan", "Hasan", "Qassem", "Rashid", "Tariq", "Jamal", "Faris", "Nashat", "Taleb", "Ghanem"]

In [47]:
def generate_person(): # should make a class
    # Pure ints
    age = gen_age() + 10 # add 10 years to generation for testing older people
    phone = gen_phone()
    gendr = gen_gender()
    civil = gen_civilstånd(age)
    utbil = gen_utbildningsnivå(age)
    syssl = gen_sysselsättning(age)
    boend = gen_boende()
    bormd = gen_bor_med(age, civil)
    vardt = gen_vardagstillfredsställelse()
    hälsa = gen_health()

    # Token this
    name = gen_name()
    mail = gen_email(name, age)
    return (age,name,mail,phone,gendr,civil,utbil,syssl,boend,bormd,vardt,hälsa)

iterations = 1000

person_list = []
for n in range(iterations):
    p = generate_person()
    person_list.append(p)

person_list[0]

df = pd.DataFrame(person_list,columns=['Ålder','Namn','Email', 'Telefon', 'Kön', 'Civilstånd', 'Utbildningsnivå', 'Sysselsättning', 'Boende', 'Tillsammans_med', 'Vardagstillfredsställelse', 'Hälsa'])

df

Unnamed: 0,Ålder,Namn,Email,Telefon,Kön,Civilstånd,Utbildningsnivå,Sysselsättning,Boende,Tillsammans_med,Vardagstillfredsställelse,Hälsa
0,79,Konstantin Wolf,konstantinwolf@telia.se,0798442144,2,5,3,5,3,6,4,3
1,69,Hedda Torres,UxDVj6UqmJ@privaterelay.appleid.com,0767960995,2,1,3,5,3,6,2,2
2,84,Andrei Pinto,andrei-pinto@outlook.com,0796335459,2,4,5,5,3,6,4,3
3,64,Espen Lindberg,_espenlindberg@gmail.com,0703535793,2,4,5,4,3,6,1,2
4,56,Erik Wong,e.wong@outlook.com,0731752483,1,5,4,2,3,2,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...
995,45,Ludvig Alvarez,ludvigalvarez78@outlook.com,0726676254,2,5,3,2,2,3,3,2
996,54,Zara Al-Hariri,zara-al-hariri8@icloud.com,0762901722,1,5,3,2,1,2,3,1
997,59,Livia Lundgren,livia_lundgren@gmail.com,0738743989,2,5,6,2,1,3,2,3
998,43,Lene Andersson,lene-andersson@hotmail.com,0732707294,1,1,3,2,3,6,2,3


In [48]:
# Also possible to make this waay shorter once all mapping values (string names) have been decided upon

# Assign gender
df['Kön'] = df['Kön'].replace({1: 'Kvinna', 2: 'Man', 3: 'Annat',})

# Assign civilstånd
mapping = {1: 'Gift/Sambo',
           2: 'Änka/Änkling ej omgift',
           3: 'Skild, ej omgift',
           4: 'Särbo',
           5: 'Ogift',
           6: 'Annat',}
df['Civilstånd'] = df['Civilstånd'].map(mapping)

# Assign utbildningsnivå
mapping = {1: 'Folkskolenivå eller lägre',
           2: 'Grundskola/Realexamen',
           3: 'Studentexamen',
           4: 'Fölkhögskola/Yrkesutbildning',
           5: 'Ofullständig akademisk utbildning',
           6: 'Universitet/Högskola med examen',}
df['Utbildningsnivå'] = df['Utbildningsnivå'].map(mapping)

# Assign sysselsättning
mapping = {1: 'Arbete', 
           2: 'Studier', 
           3: 'Arbetslös', 
           4: 'Sjukskriven/sjukersättning', 
           5: 'Ålderspensionär', 
           6: 'Icke-förvärvsarbetande av annan orsak',}
df['Sysselsättning'] = df['Sysselsättning'].map(mapping)

# Assign boende
mapping = {1: 'Hyresrätt', 
           2: 'Bostadrätt', 
           3: 'Radhus, villa eller liknande', 
           4: 'Särskilt boende', 
           5: 'Annat',}
df['Boende'] = df['Boende'].map(mapping)


# Assign living situation
mapping = {1: 'Ensamboende', 
           2: 'Make/maka/sambo', 
           3: 'Barn', 
           4: 'Förälder', 
           5: 'Annan',
           6: 'Familj (Partner och barn/ föräldrar och syskon),'}
df['Tillsammans_med'] = df['Tillsammans_med'].map(mapping)

df.head(40)

Unnamed: 0,Ålder,Namn,Email,Telefon,Kön,Civilstånd,Utbildningsnivå,Sysselsättning,Boende,Tillsammans_med,Vardagstillfredsställelse,Hälsa
0,79,Konstantin Wolf,konstantinwolf@telia.se,798442144,Man,Ogift,Studentexamen,Ålderspensionär,"Radhus, villa eller liknande","Familj (Partner och barn/ föräldrar och syskon),",4,3
1,69,Hedda Torres,UxDVj6UqmJ@privaterelay.appleid.com,767960995,Man,Gift/Sambo,Studentexamen,Ålderspensionär,"Radhus, villa eller liknande","Familj (Partner och barn/ föräldrar och syskon),",2,2
2,84,Andrei Pinto,andrei-pinto@outlook.com,796335459,Man,Särbo,Ofullständig akademisk utbildning,Ålderspensionär,"Radhus, villa eller liknande","Familj (Partner och barn/ föräldrar och syskon),",4,3
3,64,Espen Lindberg,_espenlindberg@gmail.com,703535793,Man,Särbo,Ofullständig akademisk utbildning,Sjukskriven/sjukersättning,"Radhus, villa eller liknande","Familj (Partner och barn/ föräldrar och syskon),",1,2
4,56,Erik Wong,e.wong@outlook.com,731752483,Kvinna,Ogift,Fölkhögskola/Yrkesutbildning,Studier,"Radhus, villa eller liknande",Make/maka/sambo,4,3
5,52,Linda Sanchez,linda.sanchez@gmail.com,707438876,Man,"Skild, ej omgift",Studentexamen,Studier,Bostadrätt,"Familj (Partner och barn/ föräldrar och syskon),",2,3
6,81,Even Hofmann,evenhofmann42@gmail.com,768654678,Kvinna,Ogift,Studentexamen,Ålderspensionär,Bostadrätt,"Familj (Partner och barn/ föräldrar och syskon),",5,2
7,36,Christoffer Wong,_chrwong8@live.com,705188895,Kvinna,Ogift,Ofullständig akademisk utbildning,Studier,Bostadrätt,"Familj (Partner och barn/ föräldrar och syskon),",4,5
8,64,Karoline Neumann,karoline-neumann59@bredband.net,724483918,Man,Ogift,Ofullständig akademisk utbildning,Icke-förvärvsarbetande av annan orsak,Hyresrätt,"Familj (Partner och barn/ föräldrar och syskon),",1,3
9,51,Carl Al-Khalifa,carlal-khalifa72@icloud.com,760538735,Man,Ogift,Studentexamen,Arbete,Bostadrätt,"Familj (Partner och barn/ föräldrar och syskon),",4,4


In [None]:
# Ideas for name generation

# Assigning row name string
mapping = m # = TODO: function for randomizing first and lastname. Gender and age shoukd be a modifier.

    # token modifier for name generation:
    modifier = [0,0,0]
    # x: gender scalar: input bool
    # returns a float from -1 to 1

    # input val is bool as the modifier make predictions from categorically male and female names
    # non-binary genders will be unaffected by this scalar and have same probability for any name,
    # while 1 tends to favor labeled female names and -1 labeled male names

    # higher output val = p(fem,bobo) (eg. first name += p(name end character is 'a')) => 'Emma Svensson'
    # lower output val = p(masc,kiki) (eg. increase p(first name end in consonant)) => 'Emmanuel Svensson'

    # y: age scalar: input int
    # returns an int from 0 to 54

    # val reflects name popularity from list item age value
    # (eg. map popular names from different years within a given region (probably sweden))

    # higher output val = more probable to match first and last name with age trends
    # maybe easter egg names for like 500+ year old (vikings or smt) or -200 year (sci-fi names)

    # z: creative scalar: input float
    # returns an int from 0 to 54

    # higher output val = increases probability for more unconventional name functions to trigger
    # (eg. get words from dictionary of places or items not featured in name data) => 'Emballage Krutsnus')
    # (eg. funky shuffle and rotation) => 'Mema Venssons'

# final token for name algo
name_map = [0,0,0]
