# Data Preparation for Surname Classification 

In [2]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [3]:
# Notebook configuration: dataset paths, split fractions and RNG seed.
args = Namespace(
    # Input CSV holding the raw surname/nationality rows
    raw_dataset_csv="data/surnames/surnames.csv",
    # Per-nationality split fractions
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Output CSV: the same rows plus a "split" column
    output_munged_csv="data/surnames/surnames_with_splits.csv",
    # Seed passed to NumPy before shuffling
    seed=1337,
)

In [4]:
# Load the raw CSV; its first line (header=0) supplies the column names
surnames = pd.read_csv(
    filepath_or_buffer=args.raw_dataset_csv,
    header=0,
)

In [5]:
# Preview the first five raw rows
surnames.head(n=5)

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [6]:
# Rows per nationality: 18 surname classes; imbalanced dataset
surnames["nationality"].value_counts()

nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64

In [7]:
# Group every record by nationality so each class can be split on its own.
# Result: nationality -> list of row dicts keyed by column name, with
# nationalities ordered by first appearance in the data.
by_nationality = collections.defaultdict(list)
# to_dict(orient="records") builds all row dicts in one pass; iterrows()
# would construct a temporary Series for every row.
for record in surnames.to_dict(orient="records"):
    by_nationality[record["nationality"]].append(record)

In [8]:
# Inspect the grouping: each nationality maps to its list of row dicts
by_nationality

defaultdict(list,
            {'English': [{'surname': 'Woodford', 'nationality': 'English'},
              {'surname': 'Kore', 'nationality': 'English'},
              {'surname': 'Essop', 'nationality': 'English'},
              {'surname': 'Jefferson', 'nationality': 'English'},
              {'surname': 'Dorrington', 'nationality': 'English'},
              {'surname': 'Jeffries', 'nationality': 'English'},
              {'surname': 'Douthwaite', 'nationality': 'English'},
              {'surname': 'Readle', 'nationality': 'English'},
              {'surname': 'Jones', 'nationality': 'English'},
              {'surname': 'Topham', 'nationality': 'English'},
              {'surname': 'Bellamy', 'nationality': 'English'},
              {'surname': 'Leggett', 'nationality': 'English'},
              {'surname': 'Lilley', 'nationality': 'English'},
              {'surname': 'Ayliffe', 'nationality': 'English'},
              {'surname': 'Twiggs', 'nationality': 'English'},
            

In [9]:
# Create split data
# Each nationality is shuffled and split separately, so the train/val/test
# split is made within every class rather than over the pooled rows.
final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_nationality.items()):
    # Shuffle a copy: by_nationality stays untouched, so re-running this cell
    # starts from the same order and reproduces the same split.
    shuffled = list(item_list)
    np.random.shuffle(shuffled)
    n = len(shuffled)
    n_train = int(args.train_proportion * n)
    n_val = int(args.val_proportion * n)
    # The test split takes every remaining row rather than
    # int(args.test_proportion * n), so rows left over by the int()
    # truncation above are not dropped.
    val_end = n_train + n_val

    # Give each data point a split attribute
    for position, item in enumerate(shuffled):
        if position < n_train:
            split = 'train'
        elif position < val_end:
            split = 'val'
        else:
            split = 'test'
        # Build a new record instead of mutating the dict owned by by_nationality
        final_list.append({**item, 'split': split})

In [10]:
# Inspect the labelled records: each dict now carries a 'split' key
final_list

[{'surname': 'Totah', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Abboud', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Fakhoury', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Srour', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Sayegh', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Cham', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Haik', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Kattan', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Khouri', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Antoun', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Wasem', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Srour', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Seif', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Guirguis', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Sarkis', 'nationality': 'Arabic', 'split': 'train'},
 

In [11]:
# Collect the labelled records into a DataFrame (one row per record)
final_surnames = pd.DataFrame(data=final_list)

In [12]:
# Number of rows assigned to each split
final_surnames["split"].value_counts()

split
train    7680
test     1660
val      1640
Name: count, dtype: int64

In [13]:
# Class counts inside the validation split, e.g. English: int(2972 * 0.15) = 445
final_surnames.loc[final_surnames["split"] == "val", "nationality"].value_counts()

nationality
English       445
Russian       355
Arabic        240
Japanese      116
Italian        90
German         86
Czech          62
Spanish        38
Dutch          35
French         34
Chinese        33
Irish          27
Greek          23
Polish         18
Korean         11
Scottish       11
Portuguese      8
Vietnamese      8
Name: count, dtype: int64

In [14]:
# Preview the first five labelled rows
final_surnames.head(n=5)

Unnamed: 0,surname,nationality,split
0,Totah,Arabic,train
1,Abboud,Arabic,train
2,Fakhoury,Arabic,train
3,Srour,Arabic,train
4,Sayegh,Arabic,train


In [15]:
# Save the labelled rows as CSV, leaving out the DataFrame index
final_surnames.to_csv(
    path_or_buf=args.output_munged_csv,
    index=False,
)