In [194]:
import pandas as pd
import numpy as np
import hvplot.pandas
from collections import Counter

In [195]:
df = pd.read_csv('./data/names_df.csv')
df.head()

Unnamed: 0,Name,Sex,Number,Year
0,--Mary--,F,L007065,Y1880
1,--Anna--,F,L002604,Y1880
2,--Emma--,F,L002003,Y1880
3,--Elizabeth--,F,L001939,Y1880
4,--Minnie--,F,L001746,Y1880


In [196]:
df['Name'] = df.Name.str.strip('--')
df['Number'] = df.Number.str.strip('L00').astype(np.int)
df['Year'] = df.Year.str.strip('Y').astype(np.int)
# 'Female' == 0, 'Male' == 1
#df['Sex'] = df.Sex.apply(lambda x: 0 if x=='F' else 1)

In [197]:
df.groupby(['Year', 'Sex']).count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Number
Year,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
88,F,1,1
1880,F,942,942
1880,M,1058,1058
1881,F,938,938
1881,M,997,997


In [198]:
df[df.Year == 88]

Unnamed: 0,Name,Sex,Number,Year
17502,Mary,F,11754,88


In [199]:
idx = df[df.Year == 88].index[0]
df.at[idx-1, 'Year'], df.at[idx+1, 'Year']

(1887, 1888)

In [200]:
samp = df[df.Year == 1888]
samp[samp.Name == 'Mary']

Unnamed: 0,Name,Sex,Number,Year
19223,Mary,M,5,1888


In [201]:
df.at[idx, 'Year'] = 1888

In [202]:
grouped = df.groupby(['Year', 'Sex']).count()

In [203]:
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Number
Year,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
1880,F,942,942
1880,M,1058,1058
1881,F,938,938
1881,M,997,997
1882,F,1028,1028


In [204]:
grouped = df.groupby(['Name', 'Sex'], as_index = False).sum()
grouped.head()

Unnamed: 0,Name,Sex,Number,Year
0,Aaban,M,107,20124
1,Aabha,F,35,10068
2,Aabid,M,10,4019
3,Aabir,M,5,2016
4,Aabriella,F,32,10070


In [205]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1924665 entries, 0 to 1924664
Data columns (total 4 columns):
Name      object
Sex       object
Number    int64
Year      int64
dtypes: int64(2), object(2)
memory usage: 58.7+ MB


In [206]:
df.head()

Unnamed: 0,Name,Sex,Number,Year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880


### Most common names
1.	What was the most common male name in 1989?

In [207]:
df[(df.Year == 1989) & (df.Sex == 'M')].iloc[0, :].Name

'Michael'

2.	What was the most common female name in 1989?

In [208]:
df[(df.Year == 1989) & (df.Sex == 'F')].iloc[0, :].Name

'Jessica'

3.	Write code to output a list of the most common male names by year.

In [209]:
def most_common_male_names(year, n=10):
    s = df[(df.Year == int(year)) & (df.Sex == 'M')].Name[:n]
    return list(s)

In [210]:
most_common_male_names(1990, 15)

['Michael',
 'Christopher',
 'Matthew',
 'Joshua',
 'Daniel',
 'David',
 'Andrew',
 'James',
 'Justin',
 'Joseph',
 'Ryan',
 'John',
 'Robert',
 'Nicholas',
 'Anthony']

4.	Write code to output a list of the most common female names by year.


In [211]:
def most_common_female_names(year, n=10):
    s = df[(df.Year == int(year)) & (df.Sex == 'F')].Name[:n]
    return list(s)

In [212]:
most_common_female_names(1994)

['Jessica',
 'Ashley',
 'Emily',
 'Samantha',
 'Sarah',
 'Taylor',
 'Brittany',
 'Amanda',
 'Elizabeth',
 'Megan']

### Name diversity
5.	How many unique names are there in the entire dataset?

In [213]:
df.Name.nunique()

97311

6.	How many unique male names were there in 1989?

In [214]:
df[(df.Year == 1989) & (df.Sex == 'M')].Name.nunique()

9227

7.	How many unique female names were there in 1989?

In [215]:
df[(df.Year == 1989) & (df.Sex == 'F')].Name.nunique()

14546

8.	How many unique names (male and female) were there in 1989?

In [216]:
df[(df.Year == 1989)].Name.nunique()

21621

9.	Why is the number of unique names in 1989 (problem #8) smaller than the sum of unique male names (problem #6) and unique female names (problem #7) for that year?

In [217]:
M89 = df[(df.Year == 1989) & (df.Sex == 'M')].Name
F89 = df[(df.Year == 1989) & (df.Sex == 'F')].Name
T89 = df[(df.Year == 1989)].Name

print('Unique Names - M+F: ', M89.nunique() + F89.nunique(), '\n' + 'Unique Names - All: ', T89.nunique())

Unique Names - M+F:  23773 
Unique Names - All:  21621


In [218]:
common_names = (set(M89) & set(F89) & set(T89))
print(len(common_names))
print(list(common_names)[:12])

2152
['Calyn', 'Deanna', 'Davon', 'Keon', 'Jacque', 'Choua', 'Cristen', 'Lashon', 'Shayla', 'Wayne', 'Fabian', 'Alma']


10.	Write code to output a list of names that show up in both the male and female groups in 1989. How many names show up in both groups for that year?

In [219]:
def gender_neutral_names(year=1989):
    M89 = df[(df.Year == year) & (df.Sex == 'M')].Name
    F89 = df[(df.Year == year) & (df.Sex == 'F')].Name
    names = list(set(M89) & set(F89))
    return names

In [220]:
n = gender_neutral_names(1989)
len(n)

2152

11.	Show how the answer to problem #10 helps explain the answer to problem #9.


In [221]:
common_names = (set(M89) & set(F89) & set(T89))
print(len(common_names))
print(list(common_names)[:12])

2152
['Calyn', 'Deanna', 'Davon', 'Keon', 'Jacque', 'Choua', 'Cristen', 'Lashon', 'Shayla', 'Wayne', 'Fabian', 'Alma']


### Unisex names
12.	Provide a list of the most common unisex names through 2017. Describe and justify your criteria for a name to be considered unisex. What other criteria could you use for a name to be unisex?

In [223]:
for year in df.Year.unique():
    for sex in ('F', 'M'):
        yidx = df[(df.Year == year) & (df.Sex == sex)].index
        df.at[yidx, 'ann_pct'] = df.loc[yidx, 'Year'].rank(method='first', pct=True)
        
df.head()

Unnamed: 0,Name,Sex,Number,Year,ann_pct
0,Mary,F,7065,1880,0.001062
1,Anna,F,2604,1880,0.002123
2,Emma,F,2003,1880,0.003185
3,Elizabeth,F,1939,1880,0.004246
4,Minnie,F,1746,1880,0.005308


In [188]:
def unisex_names(year=1989, threshold=1.0):
    M89 = df[(df.Year == year) & (df.Sex == 'M') & (df.ann_pct < threshold)].Name
    F89 = df[(df.Year == year) & (df.Sex == 'F') & (df.ann_pct < threshold)].Name
    names = list(set(M89) & set(F89))
    return names

def most_common_unisex_names(years=[], threshold=1.0):
    countr = Counter()
    if not years:
        years = df.Year.unique()
    for year in years:
        names = unisex_names(year, threshold)
        for name in names:
            countr[name] += 1
    return countr

In [224]:
c95 = most_common_unisex_names(threshold=0.95)

In [229]:
c90 = most_common_unisex_names(threshold=0.90)

In [None]:
c75 = most_common_unisex_names(threshold=0.75)
c50 = most_common_unisex_names(threshold=0.50)

In [230]:
for a, b, c, d in zip(c95.most_common(25), c90.most_common(25), c75.most_common(25), c50.most_common(25)):
    print(a, b, c ,d)

('Jean', 138) ('Jean', 138) ('Jean', 138) ('James', 138)
('William', 138) ('William', 138) ('James', 138) ('Jessie', 138)
('Johnnie', 138) ('Johnnie', 138) ('Jessie', 138) ('Francis', 138)
('Leslie', 138) ('Leslie', 138) ('Francis', 138) ('William', 138)
('Marion', 138) ('Marion', 138) ('William', 138) ('Lee', 138)
('Sidney', 138) ('Sidney', 138) ('Johnnie', 138) ('Marion', 138)
('Joseph', 138) ('Joseph', 138) ('Leslie', 138) ('Johnnie', 137)
('Tommie', 138) ('Tommie', 138) ('Lee', 138) ('Jean', 133)
('Jesse', 138) ('Ollie', 138) ('Marion', 138) ('John', 132)
('Ollie', 138) ('James', 138) ('Sidney', 138) ('Leslie', 131)
('James', 138) ('Jessie', 138) ('Charles', 137) ('Charles', 130)
('Jessie', 138) ('Francis', 138) ('John', 137) ('Tommie', 130)
('Francis', 138) ('Lee', 138) ('Tommie', 137) ('Sidney', 128)
('Lee', 138) ('June', 137) ('Ollie', 137) ('Ollie', 126)
('Henry', 137) ('Charles', 137) ('Joseph', 136) ('Joseph', 126)
('June', 137) ('John', 137) ('Ira', 135) ('Charlie', 126)
('C

In [235]:
df.head()
cdf = df.copy()

In [233]:
f = df[df.Sex == 'F']
m = df[df.Sex == 'M']

In [105]:
ff = Counter(f.Name.apply(lambda x: x[0]))
fl = Counter(f.Name.apply(lambda x: x[-1]))
ffl = Counter(f.Name.apply(lambda x: x[0] + x[-1]))

In [106]:
mf = Counter(m.Name.apply(lambda x: x[0]))
ml = Counter(m.Name.apply(lambda x: x[-1]))
mfl = Counter(m.Name.apply(lambda x: x[0] + x[-1]))

In [107]:
list(zip([i[0] for i in ffl.most_common(15)], [i[0] for i in mfl.most_common(15)]))

[('Aa', 'Dn'),
 ('La', 'Jn'),
 ('Sa', 'Kn'),
 ('Ma', 'An'),
 ('Ta', 'Cn'),
 ('Da', 'Tn'),
 ('Ca', 'Bn'),
 ('Ka', 'Sn'),
 ('Me', 'Ln'),
 ('Je', 'Je'),
 ('Ce', 'De'),
 ('Ja', 'Rn'),
 ('Ae', 'En'),
 ('Ra', 'Mn'),
 ('Ea', 'Ce')]

In [152]:
df[(df.Year == 1994) & (df.Sex == 'M')].head()

Unnamed: 0,Name,Sex,Number,Year,ann_pct
1186573,Michael,M,4447,1994,9.8e-05
1186574,Christopher,M,348,1994,0.000195
1186575,Matthew,M,33646,1994,0.000293
1186576,Joshua,M,31372,1994,0.00039
1186577,Tyler,M,30477,1994,0.000488


In [237]:
df.drop(columns=['Number', 'ann_pct'], inplace=True)

In [262]:
df.head()

Unnamed: 0,Name,Sex,Year
0,Mary,F,1880
1,Anna,F,1880
2,Emma,F,1880
3,Elizabeth,F,1880
4,Minnie,F,1880


## Machine learning exercise

Design and create a machine learning model that classifies a name as either male or female based on the characteristics of the name. Write up a report with screenshots in a text document or Jupyter notebook that describes the following:

•	the input data and its quality

•	any data exploration and any interesting insights

•	any preprocessing or cleaning of the data that was needed, including how you chose to label the data (i.e., how you labeled a name male or female as the ground truth for the model)

•	the features used by your model and why you chose them

•	your selection of machine learning algorithm and why it was chosen

•	the accuracy of your model and any relevant metrics that describe model performance

•	any model parameter tuning used to improve performance

•	any relevant discussion, model interpretation, future steps, or further exploration needed

In addition, prepare a short slideshow presentation (~10 min.) of an overview of the machine learning exercise. You will present this to the analytics team.
There are no restrictions on the classification model that you use; however, be sure to keep in mind your ability to adequately explain and interpret the results.
Good luck!


In [261]:
df.head()

Unnamed: 0,Name,Sex,Year
0,Mary,F,1880
1,Anna,F,1880
2,Emma,F,1880
3,Elizabeth,F,1880
4,Minnie,F,1880


In [263]:
Y = df.loc[:, 'Sex']
X = df.drop(columns=['Sex'])

In [264]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(1, 2))
X_vect = cwb_vectorizer.fit_transform(X.Name)

In [272]:
cwb_vectorizer.vocabulary

In [265]:
cwb_vectorizer.get_feature_names()

[' ',
 ' A',
 ' B',
 ' C',
 ' D',
 ' E',
 ' F',
 ' G',
 ' H',
 ' I',
 ' J',
 ' K',
 ' L',
 ' M',
 ' N',
 ' O',
 ' P',
 ' Q',
 ' R',
 ' S',
 ' T',
 ' U',
 ' V',
 ' W',
 ' X',
 ' Y',
 ' Z',
 'A',
 'Aa',
 'Ab',
 'Ac',
 'Ad',
 'Ae',
 'Af',
 'Ag',
 'Ah',
 'Ai',
 'Aj',
 'Ak',
 'Al',
 'Am',
 'An',
 'Ao',
 'Ap',
 'Aq',
 'Ar',
 'As',
 'At',
 'Au',
 'Av',
 'Aw',
 'Ax',
 'Ay',
 'Az',
 'B',
 'Ba',
 'Bb',
 'Be',
 'Bg',
 'Bh',
 'Bi',
 'Bj',
 'Bl',
 'Bn',
 'Bo',
 'Br',
 'Bt',
 'Bu',
 'Bw',
 'By',
 'C',
 'Ca',
 'Cc',
 'Ce',
 'Ch',
 'Ci',
 'Cj',
 'Cl',
 'Cm',
 'Cn',
 'Co',
 'Cr',
 'Cs',
 'Cu',
 'Cy',
 'Cz',
 'D',
 'Da',
 'Dc',
 'Dd',
 'De',
 'Dh',
 'Di',
 'Dj',
 'Dk',
 'Dl',
 'Dm',
 'Dn',
 'Do',
 'Dq',
 'Dr',
 'Ds',
 'Du',
 'Dv',
 'Dw',
 'Dy',
 'Dz',
 'E',
 'Ea',
 'Eb',
 'Ec',
 'Ed',
 'Ee',
 'Ef',
 'Eg',
 'Eh',
 'Ei',
 'Ej',
 'Ek',
 'El',
 'Em',
 'En',
 'Eo',
 'Ep',
 'Eq',
 'Er',
 'Es',
 'Et',
 'Eu',
 'Ev',
 'Ew',
 'Ex',
 'Ey',
 'Ez',
 'F',
 'Fa',
 'Fe',
 'Fh',
 'Fi',
 'Fj',
 'Fl',
 'Fo',
 'Fr',
 'Fu',