In [1]:
import pandas as pd
import numpy as np
#import hvplot.pandas
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the raw name records and normalise the prefixed text columns.
df = pd.read_csv('./data/names_df.csv')
# Remove any '-' characters padding either end of the name.
df['Name'] = df.Name.str.strip('--')
# `str.strip('L00')` treats its argument as a *set* of characters and strips from
# both ends, so it also ate trailing zeros (e.g. 'L00100' -> '1'). Drop only the
# leading 'L' marker and let int() absorb the zero padding.
# NOTE(review): assumes values look like 'L' + zero-padded digits -- confirm against the CSV.
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
df['Number'] = df.Number.str.lstrip('L').astype(int)
df['Year'] = df.Year.str.strip('Y').astype(int)
# Rows carrying the truncated year 88 are restored to 1888. A boolean mask is
# used so the cell no longer raises IndexError when no such row exists.
df.loc[df.Year == 88, 'Year'] = 1888

In [3]:
# Encode sex as an integer flag: 'F' -> 0, every other value -> 1.
df['Sex'] = df.Sex.ne('F').astype('int64')
# Year plays no part in the per-name totals computed below.
df = df.drop(columns=['Year'])

# Totals per (Sex, Name) pair, with both keys kept as ordinary columns.
groups = df.groupby(['Sex', 'Name'], as_index=False).sum()
# Totals per name across both sexes; the summed Sex flag carries no meaning, so drop it.
group = df.groupby(['Name']).sum().drop(columns='Sex')

In [4]:
# Split the per-(Sex, Name) totals into one Name-indexed table per sex flag.
male = groups.loc[groups.Sex == 1].drop(columns=['Sex']).set_index('Name')
fem = groups.loc[groups.Sex == 0].drop(columns=['Sex']).set_index('Name')

In [5]:
%%time
for name, row in group.iterrows():
    total = row.Number
    try: 
        n_male = male.at[name, 'Number']
        group.loc[name, 'Male'] = int(n_male)
    except:
        group.loc[name, 'Male'] = 0
        
group['Female'] = group.Number - group.Male
group.head()

Unnamed: 0_level_0,Number,Male,Female
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaban,107,107.0,0.0
Aabha,35,0.0,35.0
Aabid,10,10.0,0.0
Aabir,5,5.0,0.0
Aabriella,32,0.0,32.0


In [6]:
def _char_ngram_totals(names, ngram_range=(2, 8)):
    """Count word-boundary character n-grams over an iterable of names.

    Parameters
    ----------
    names : iterable of str
        The strings to vectorise.
    ngram_range : tuple of int, default (2, 8)
        Inclusive (min, max) n-gram length passed to CountVectorizer.

    Returns
    -------
    (CountVectorizer, pandas.Series)
        The fitted vectorizer and the total occurrences of each n-gram over
        all `names`, indexed by n-gram in the vectorizer's column order.
    """
    vect = CountVectorizer(analyzer='char_wb', ngram_range=ngram_range)
    matrix = vect.fit_transform(names)
    # Column sums of the document-term matrix, flattened to a 1-D array.
    totals = np.asarray(matrix.sum(axis=0)).ravel()
    # `vocabulary_` maps n-gram -> column index, so ordering the keys by that
    # index yields the feature names without calling `get_feature_names()`,
    # which was removed in scikit-learn 1.2.
    terms = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
    return vect, pd.Series(totals, index=terms)


# Same computation for each sex; every name the original cell defined is kept.
male_vect, mdf = _char_ngram_totals(male.index)
male_counts = mdf.values
m_vocab = list(mdf.index)

fem_vect, fdf = _char_ngram_totals(fem.index)
fem_counts = fdf.values
f_vocab = list(fdf.index)

In [7]:
# Every n-gram seen in either sex's names, as a sorted list.
vocab = sorted(set(m_vocab).union(f_vocab))

In [8]:
%%time

d = dict()
for v in vocab:
    try:
        m = mdf.loc[v]
    except:
        m = 0
    try:
        f = fdf.loc[v]
    except:
        f = 0
        
    d[v] = [m, f]

CPU times: user 18.9 s, sys: 43.8 ms, total: 19 s
Wall time: 19 s


In [9]:
# One row per n-gram (the keys of `d`), one column per sex.
p = pd.DataFrame(list(d.values()), index=list(d.keys()), columns=['Male', 'Female'])

In [10]:
# Share of each n-gram's occurrences that comes from male and from female names.
ngram_total = p.Male + p.Female
p['p_male'] = p.Male.div(ngram_total)
p['p_female'] = p.Female.div(ngram_total)

In [11]:
# Preview the first rows of the per-n-gram table.
p.head()

Unnamed: 0,Male,Female,p_male,p_female
a,3682,7604,0.326245,0.673755
aa,157,257,0.379227,0.620773
aab,3,2,0.6,0.4
aaba,1,0,1.0,0.0
aaban,1,0,1.0,0.0


In [12]:
# Class priors taken from raw row counts of `df` (Sex flag: 0 = 'F', 1 = otherwise).
T = df.shape[0]
M = df['Sex'].sum()
F = T - M
priors = [count / T for count in (F, M)]
# Display them with the flag-1 (male) share first.
priors[::-1]

[0.40857603790789565, 0.5914239620921043]

In [15]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline

In [16]:
# Preview the frame that supplies the features and labels below.
df.head()

Unnamed: 0,Name,Sex,Number
0,Mary,0,7065
1,Anna,0,2604
2,Emma,0,2003
3,Elizabeth,0,1939
4,Minnie,0,1746


In [18]:
# Target is the 0/1 sex flag; the only feature is the raw name string.
# Disabled alternative that also keeps the count column:
#X = df.loc[:, ['Name', 'Number']]
X, y = df.Name, df.Sex

In [19]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
# NOTE(review): `df` looks like it can hold several rows per name, so the same
# name may land in both train and test -- confirm before trusting the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)

In [29]:
%%time
# Character n-grams of length 2-8 (word-boundary aware, case preserved) as count features.
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 8), lowercase=False)
X_train_counts = cwb_vectorizer.fit_transform(X_train)
# Test names are only transformed, using the vocabulary fitted on the training names.
X_test_counts = cwb_vectorizer.transform(X_test)

# Multinomial naive Bayes with alpha=0.001 smoothing.
model_mnb = MultinomialNB(alpha=0.001)
model_mnb.fit(X_train_counts, y_train)

# 10-fold cross-validation over the training matrix.
cv_scores = cross_val_score(model_mnb, X_train_counts, y_train, cv=10)

CPU times: user 1min 46s, sys: 8.83 s, total: 1min 54s
Wall time: 1min 8s


In [30]:
# Held-out accuracy, shown next to the mean and median of the CV fold scores.
score = model_mnb.score(X_test_counts, y_test)

(score, cv_scores.mean(), np.median(cv_scores))

(0.8927969282966126, 0.8930034567635445, 0.8929422690991278)

In [32]:
%%time
# Same experiment with the n-gram window widened to 2-10 characters.
# The vectorizer, matrices and model reuse (and overwrite) the earlier names.
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 10), lowercase=False)
X_train_counts = cwb_vectorizer.fit_transform(X_train)
# Test names are only transformed, using the vocabulary fitted on the training names.
X_test_counts = cwb_vectorizer.transform(X_test)

model_mnb = MultinomialNB(alpha=0.001)
model_mnb.fit(X_train_counts, y_train)

# 10-fold cross-validation over the training matrix.
cv_scores2 = cross_val_score(model_mnb, X_train_counts, y_train, cv=10)

CPU times: user 1min 48s, sys: 9.09 s, total: 1min 57s
Wall time: 1min 11s


In [33]:
# Held-out accuracy, shown next to the mean and median of the CV fold scores.
score2 = model_mnb.score(X_test_counts, y_test)

(score2, cv_scores2.mean(), np.median(cv_scores2))

(0.8931840086456604, 0.8932658399811044, 0.8932312808089731)

In [34]:
%%time
# Vectorizer and classifier bundled into one estimator that takes raw name strings.
mnb_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 10), lowercase=False)),
    ('clf', MultinomialNB(alpha=0.001)),
])

mnb_pipe.fit(X_train, y_train)

CPU times: user 43.3 s, sys: 1.58 s, total: 44.8 s
Wall time: 41.1 s


In [37]:
# Evaluate the pipeline fitted in the previous cell on the raw held-out names.
# The cell used to re-score the stand-alone `model_mnb`, so `mnb_pipe` itself
# was never evaluated.
score = mnb_pipe.score(X_test, y_test)
score

0.8931840086456604

In [48]:
# Same naive Bayes pipeline with a TF-IDF reweighting step between counts and classifier.
mnb_pipe2 = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 10), lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.001)),
])

mnb_pipe2.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 10), preprocessor=None, stop_words=None,
        ...ear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [49]:
# Accuracy of the TF-IDF + MultinomialNB pipeline on the held-out names.
mnb_pipe2.score(X_test, y_test)

0.8951427910831236

In [40]:
%%time
# Raw n-gram counts (2-10 chars) fed to a 5-fold cross-validated logistic
# regression without an intercept term.
logrcv_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 10), lowercase=False)),
    ('logrCV', LogisticRegressionCV(
        cv=5,
        fit_intercept=False,
        max_iter=5000,
        n_jobs=-1,
        random_state=11,
    )),
])

logrcv_pipe.fit(X_train, y_train)

CPU times: user 49min 41s, sys: 4.19 s, total: 49min 45s
Wall time: 47min 37s


In [41]:
# Accuracy of the count-based LogisticRegressionCV pipeline on the held-out names.
logrcv_pipe.score(X_test, y_test)

0.9006164709183156

In [42]:
%%time
# As `logrcv_pipe`, with a TF-IDF reweighting step ahead of the classifier.
logrcv_pipe2 = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 10), lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('logrCV', LogisticRegressionCV(
        cv=5,
        fit_intercept=False,
        max_iter=5000,
        n_jobs=-1,
        random_state=11,
    )),
])

logrcv_pipe2.fit(X_train, y_train)

CPU times: user 12min 16s, sys: 6.79 s, total: 12min 23s
Wall time: 17min 35s


In [43]:
# Accuracy of the TF-IDF + LogisticRegressionCV pipeline on the held-out names.
logrcv_pipe2.score(X_test, y_test)

0.9006970044137551

In [44]:
%%time
# TF-IDF + logistic regression variant that does fit an intercept term.
logrcv_pipe3 = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 10), lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('logrCV', LogisticRegressionCV(
        cv=5,
        fit_intercept=True,
        max_iter=5000,
        n_jobs=-1,
        random_state=11,
    )),
])

logrcv_pipe3.fit(X_train, y_train)

CPU times: user 31min 57s, sys: 4.65 s, total: 32min 1s
Wall time: 45min 13s


In [45]:
# Accuracy of the intercept-fitting variant on the held-out names.
logrcv_pipe3.score(X_test, y_test)

0.9006736237215308

In [49]:
%%time
# TF-IDF + logistic regression variant using 10 internal CV folds instead of 5.
logrcv_pipe4 = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 10), lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('logrCV', LogisticRegressionCV(
        cv=10,
        fit_intercept=False,
        max_iter=5000,
        n_jobs=-1,
        random_state=11,
    )),
])

logrcv_pipe4.fit(X_train, y_train)

CPU times: user 10min 59s, sys: 6.77 s, total: 11min 6s
Wall time: 31min 10s


In [50]:
# Accuracy of the 10-fold variant on the held-out names.
logrcv_pipe4.score(X_test, y_test)

0.9006866129949888

In [51]:
%%time
# TF-IDF + logistic regression variant with the n-gram window widened to 2-12 characters.
logrcv_pipe5 = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 12), lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('logrCV', LogisticRegressionCV(
        cv=5,
        fit_intercept=False,
        max_iter=5000,
        n_jobs=-1,
        random_state=11,
    )),
])

logrcv_pipe5.fit(X_train, y_train)

CPU times: user 13min 9s, sys: 6.76 s, total: 13min 16s
Wall time: 19min 37s


In [52]:
# Accuracy of the 2-12 character n-gram variant on the held-out names.
logrcv_pipe5.score(X_test, y_test)

0.9006840151402972

In [53]:
%%time
# TF-IDF + logistic regression variant with the n-gram window widened to 2-15 characters.
logrcv_pipe6 = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 15), lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('logrCV', LogisticRegressionCV(
        cv=5,
        fit_intercept=False,
        max_iter=5000,
        n_jobs=-1,
        random_state=11,
    )),
])

logrcv_pipe6.fit(X_train, y_train)

CPU times: user 11min 47s, sys: 5.89 s, total: 11min 53s
Wall time: 19min 34s


In [54]:
# Accuracy of the 2-15 character n-gram variant on the held-out names.
logrcv_pipe6.score(X_test, y_test)

0.9006918087043719

In [24]:
# Baseline: 2-11 character n-grams with the vectorizer's default lowercasing and
# MultinomialNB at its default smoothing. Overwrites the shared vectorizer,
# matrix and model names used by other cells.
cwb_vectorizer = CountVectorizer(ngram_range=(2, 11), analyzer='char_wb')
X_train_counts = cwb_vectorizer.fit_transform(X_train)
X_test_counts = cwb_vectorizer.transform(X_test)

model_mnb = MultinomialNB()
model_mnb.fit(X_train_counts, y_train)

model_mnb.score(X_test_counts, y_test)

0.8877882644512162

In [46]:
# Baseline: 2-10 character n-grams with the vectorizer's default lowercasing.
# NOTE(review): despite the `gnb` name, this model is a MultinomialNB with
# default settings, not a Gaussian naive Bayes.
cwb_vectorizer = CountVectorizer(ngram_range=(2, 10), analyzer='char_wb')
X_train_counts = cwb_vectorizer.fit_transform(X_train)
X_test_counts = cwb_vectorizer.transform(X_test)

model_gnb = MultinomialNB()
model_gnb.fit(X_train_counts, y_train)

model_gnb.score(X_test_counts, y_test)

0.8877882644512162