In [1]:
import pandas as pd
import numpy as np
#import hvplot.pandas
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('./data/names_df.csv')
df['Name'] = df.Name.str.strip('--')
df['Number'] = df.Number.str.strip('L00').astype(np.int)
df['Year'] = df.Year.str.strip('Y').astype(np.int)
idx = df[df.Year == 88].index[0]
df.at[idx, 'Year'] = 1888

In [3]:
df['Sex'] = df.Sex.apply(lambda x: 0 if x=='F' else 1)
df.drop(columns=['Year'], inplace=True)
groups = df.groupby(['Sex', 'Name'], as_index=False).sum()
group = df.groupby(['Name']).sum()
group.drop(columns='Sex', inplace=True)

In [4]:
male = groups[groups.Sex==1].drop(columns=['Sex'])
male.set_index('Name', inplace=True)
fem = groups[groups.Sex==0].drop(columns=['Sex'])
fem.set_index('Name', inplace=True)

In [5]:
for name, row in group.iterrows():
    total = row.Number
    try: 
        n_male = male.at[name, 'Number']
        group.loc[name, 'Male'] = int(n_male)
    except:
        group.loc[name, 'Male'] = 0
        
group['Female'] = group.Number - group.Male
group.head()

Unnamed: 0_level_0,Number,Male,Female
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaban,107,107.0,0.0
Aabha,35,0.0,35.0
Aabid,10,10.0,0.0
Aabir,5,5.0,0.0
Aabriella,32,0.0,32.0


In [6]:
male_vect = CountVectorizer(analyzer='char_wb', ngram_range=(2,8))
male_counts = male_vect.fit_transform(male.index)
male_counts = male_counts.sum(axis=0).A1
m_vocab = male_vect.get_feature_names()
mdf = pd.Series(male_counts, index=m_vocab)

fem_vect = CountVectorizer(analyzer='char_wb', ngram_range=(2,8))
fem_counts = fem_vect.fit_transform(fem.index)
fem_counts = fem_counts.sum(axis=0).A1
f_vocab = fem_vect.get_feature_names()
fdf = pd.Series(fem_counts, index=f_vocab)

In [7]:
vocab = set(m_vocab) | set(f_vocab)
vocab = sorted(vocab)

In [8]:
%%time

d = dict()
for v in vocab:
    try:
        m = mdf.loc[v]
    except:
        m = 0
    try:
        f = fdf.loc[v]
    except:
        f = 0
        
    d[v] = [m, f]

CPU times: user 27.9 s, sys: 74.7 ms, total: 27.9 s
Wall time: 28 s


In [9]:
p = pd.DataFrame.from_dict(d, orient='index', columns=['Male', 'Female'])

In [10]:
p['p_male'] = p.Male / (p.Male + p.Female)
p['p_female'] = p.Female / (p.Male + p.Female)

In [11]:
p.head()

Unnamed: 0,Male,Female,p_male,p_female
a,3682,7604,0.326245,0.673755
aa,157,257,0.379227,0.620773
aab,3,2,0.6,0.4
aaba,1,0,1.0,0.0
aaban,1,0,1.0,0.0


In [12]:
T = len(df.Sex)
M = df.Sex.sum()
F = T - M
priors = [F/T, M/T]
priors[::-1]

[0.40857603790789565, 0.5914239620921043]

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline

In [14]:
df.head()

Unnamed: 0,Name,Sex,Number
0,Mary,0,7065
1,Anna,0,2604
2,Emma,0,2003
3,Elizabeth,0,1939
4,Minnie,0,1746


In [15]:
y = df.Sex
X = df.Name

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=33)

In [17]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train)

model_mnb = MultinomialNB(alpha=0.001).fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test)

model_mnb.score(X_test_counts, y_test)

0.8927969282966126

In [19]:
scores = cross_val_score(model_mnb, X_train_counts, y_train, cv=5)
scores

array([0.89310174, 0.89305952, 0.89307251, 0.89211095, 0.89271136])

In [21]:
%%time
mnb_pipe = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))),
    ('clf', MultinomialNB(alpha=0.001)),
])

mnb_pipe.fit(X_train, y_train)

CPU times: user 51.2 s, sys: 1.98 s, total: 53.2 s
Wall time: 53.3 s


In [23]:
cv_scores = cross_val_score(mnb_pipe, X_train, y_train, cv=5)

KeyboardInterrupt: 

In [None]:
gauss_pipe = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))),
    ('clf', MultinomialNB(alpha=0.001)),
])

gauss_pipe.fit(X_train, y_train)

In [51]:
gauss_pipe.score(X_test, y_test)

0.8931840086456604

In [48]:
mnb_pipe2 = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.001)),
])

mnb_pipe2.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 10), preprocessor=None, stop_words=None,
        ...ear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [49]:
mnb_pipe2.score(X_test, y_test)

0.8951427910831236

In [55]:
%%time
logrcv_pipe = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))),
    ('logrCV', LogisticRegressionCV(cv=5, random_state=11, fit_intercept=False, max_iter=5000, n_jobs=-1)),
])

logrcv_pipe.fit(X_test, y_test)

CPU times: user 11min 42s, sys: 1.95 s, total: 11min 44s
Wall time: 12min 1s


In [56]:
logrcv_pipe.score(X_test, y_test)

0.9103636217211827

In [57]:
%%time
logrcv_pipe2 = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))),
    ('tfidf', TfidfTransformer()),
    ('logrCV', LogisticRegressionCV(cv=5, random_state=11, fit_intercept=False, max_iter=5000, n_jobs=-1)),
])

logrcv_pipe2.fit(X_test, y_test)

CPU times: user 2min 25s, sys: 2.19 s, total: 2min 27s
Wall time: 4min 15s


In [58]:
logrcv_pipe2.score(X_test, y_test)

0.9103428388836499

In [59]:
%%time
logrcv_pipe3 = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))),
    ('tfidf', TfidfTransformer()),
    ('logrCV', LogisticRegressionCV(cv=5, random_state=11, fit_intercept=True, max_iter=5000, n_jobs=-1)),
])

logrcv_pipe3.fit(X_test, y_test)

CPU times: user 6min 33s, sys: 2.05 s, total: 6min 35s
Wall time: 9min 37s


In [60]:
logrcv_pipe3.score(X_test, y_test)

0.9103428388836499

In [24]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2,10))
X_train_counts = cwb_vectorizer.fit_transform(X_train)

model_mnb = MultinomialNB().fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test)

model_mnb.score(X_test_counts, y_test)

0.8877882644512162

In [None]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2,10))
X_train_counts = cwb_vectorizer.fit_transform(X_train)

model_gnb = GaussianNB().fit(X_train_counts.toarray(), y_train)

X_test_counts = cwb_vectorizer.transform(X_test)

model_gnb.score(X_test_counts.toarray(), y_test)