In [5]:
import pandas as pd
import numpy as np
import hvplot.pandas
from collections import Counter

In [6]:
df = pd.read_csv('./data/names_df.csv')
df.head()

Unnamed: 0,Name,Sex,Number,Year
0,--Mary--,F,L007065,Y1880
1,--Anna--,F,L002604,Y1880
2,--Emma--,F,L002003,Y1880
3,--Elizabeth--,F,L001939,Y1880
4,--Minnie--,F,L001746,Y1880


In [7]:
df['Name'] = df.Name.str.strip('--')
df['Number'] = df.Number.str.strip('L00').astype(np.int)
df['Year'] = df.Year.str.strip('Y').astype(np.int)
# 'Female' == 0, 'Male' == 1
#df['Sex'] = df.Sex.apply(lambda x: 0 if x=='F' else 1)
df.head()

Unnamed: 0,Name,Sex,Number,Year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880


In [8]:
df.groupby(['Year', 'Sex']).count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Number
Year,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
88,F,1,1
1880,F,942,942
1880,M,1058,1058
1881,F,938,938
1881,M,997,997


In [9]:
df[df.Year == 88]

Unnamed: 0,Name,Sex,Number,Year
17502,Mary,F,11754,88


In [10]:
idx = df[df.Year == 88].index[0]
df.at[idx-1, 'Year'], df.at[idx+1, 'Year']

(1887, 1888)

In [11]:
samp = df[df.Year == 1888]
samp[samp.Name == 'Mary']

Unnamed: 0,Name,Sex,Number,Year
19223,Mary,M,5,1888


In [12]:
df.at[idx, 'Year'] = 1888

In [13]:
grouped = df.groupby(['Year', 'Sex']).count()

In [14]:
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Number
Year,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
1880,F,942,942
1880,M,1058,1058
1881,F,938,938
1881,M,997,997
1882,F,1028,1028


In [15]:
grouped = df.groupby(['Name', 'Sex'], as_index = False).sum()
grouped.head()

Unnamed: 0,Name,Sex,Number,Year
0,Aaban,M,107,20124
1,Aabha,F,35,10068
2,Aabid,M,10,4019
3,Aabir,M,5,2016
4,Aabriella,F,32,10070


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1924665 entries, 0 to 1924664
Data columns (total 4 columns):
Name      object
Sex       object
Number    int64
Year      int64
dtypes: int64(2), object(2)
memory usage: 58.7+ MB


In [17]:
df.head()

Unnamed: 0,Name,Sex,Number,Year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880


### Most common names
1.	What was the most common male name in 1989?

In [18]:
df[(df.Year == 1989) & (df.Sex == 'M')].iloc[0, :].Name

'Michael'

2.	What was the most common female name in 1989?

In [19]:
df[(df.Year == 1989) & (df.Sex == 'F')].iloc[0, :].Name

'Jessica'

3.	Write code to output a list of the most common male names by year.

In [20]:
def most_common_male_names(year, n=10):
    s = df[(df.Year == int(year)) & (df.Sex == 'M')].Name[:n]
    return list(s)

In [21]:
most_common_male_names(1990, 15)

['Michael',
 'Christopher',
 'Matthew',
 'Joshua',
 'Daniel',
 'David',
 'Andrew',
 'James',
 'Justin',
 'Joseph',
 'Ryan',
 'John',
 'Robert',
 'Nicholas',
 'Anthony']

4.	Write code to output a list of the most common female names by year.


In [22]:
def most_common_female_names(year, n=10):
    s = df[(df.Year == int(year)) & (df.Sex == 'F')].Name[:n]
    return list(s)

In [23]:
most_common_female_names(1994)

['Jessica',
 'Ashley',
 'Emily',
 'Samantha',
 'Sarah',
 'Taylor',
 'Brittany',
 'Amanda',
 'Elizabeth',
 'Megan']

### Name diversity
5.	How many unique names are there in the entire dataset?

In [24]:
df.Name.nunique()

97311

6.	How many unique male names were there in 1989?

In [25]:
df[(df.Year == 1989) & (df.Sex == 'M')].Name.nunique()

9227

7.	How many unique female names were there in 1989?

In [26]:
df[(df.Year == 1989) & (df.Sex == 'F')].Name.nunique()

14546

8.	How many unique names (male and female) were there in 1989?

In [27]:
df[(df.Year == 1989)].Name.nunique()

21621

9.	Why is the number of unique names in 1989 (problem #8) smaller than the sum of unique male names (problem #6) and unique female names (problem #7) for that year?

In [28]:
M89 = df[(df.Year == 1989) & (df.Sex == 'M')].Name
F89 = df[(df.Year == 1989) & (df.Sex == 'F')].Name
T89 = df[(df.Year == 1989)].Name

print('Unique Names - M+F: ', M89.nunique()+F89.nunique(), '\n' + 'Unique Names - All: ', T89.nunique(),
      '\n' + 'Diff: ' + str((M89.nunique()+F89.nunique()) - T89.nunique()))

Unique Names - M+F:  23773 
Unique Names - All:  21621 
Diff: 2152


In [29]:
common_names = (set(M89) & set(F89) & set(T89))
print(len(common_names))
print(list(common_names)[:12])

2152
['Koree', 'Niko', 'Shelby', 'Kellin', 'Daryl', 'Keaton', 'Thai', 'Rafael', 'Sasha', 'Domingue', 'Briana', 'Alejandro']


10.	Write code to output a list of names that show up in both the male and female groups in 1989. How many names show up in both groups for that year?

In [30]:
def gender_neutral_names(year=1989):
    M89 = df[(df.Year == year) & (df.Sex == 'M')].Name
    F89 = df[(df.Year == year) & (df.Sex == 'F')].Name
    names = list(set(M89) & set(F89))
    return names

In [31]:
n = gender_neutral_names(1989)
len(n)

2152

11.	Show how the answer to problem #10 helps explain the answer to problem #9.


In [32]:
common_names = (set(M89) & set(F89) & set(T89))
print(len(common_names))
print(list(common_names)[:12])

2152
['Koree', 'Niko', 'Shelby', 'Kellin', 'Daryl', 'Keaton', 'Thai', 'Rafael', 'Sasha', 'Domingue', 'Briana', 'Alejandro']


### Unisex names
12.	Provide a list of the most common unisex names through 2017. Describe and justify your criteria for a name to be considered unisex. What other criteria could you use for a name to be unisex?

In [33]:
for year in df.Year.unique():
    for sex in ('F', 'M'):
        yidx = df[(df.Year == year) & (df.Sex == sex)].index
        df.at[yidx, 'ann_pctl'] = df.loc[yidx, 'Year'].rank(method='first', pct=True)
        
df.head()

Unnamed: 0,Name,Sex,Number,Year,ann_pctl
0,Mary,F,7065,1880,0.001062
1,Anna,F,2604,1880,0.002123
2,Emma,F,2003,1880,0.003185
3,Elizabeth,F,1939,1880,0.004246
4,Minnie,F,1746,1880,0.005308


In [34]:
def unisex_names(year=1989, threshold=1.0):
    M89 = df[(df.Year == year) & (df.Sex == 'M') & (df.ann_pctl < threshold)].Name
    F89 = df[(df.Year == year) & (df.Sex == 'F') & (df.ann_pctl < threshold)].Name
    names = list(set(M89) & set(F89))
    return names

def most_common_unisex_names(years=[], threshold=1.0):
    countr = Counter()
    if not years:
        years = df.Year.unique()
    for year in years:
        names = unisex_names(year, threshold)
        for name in names:
            countr[name] += 1
    return countr

In [31]:
c95 = most_common_unisex_names(threshold=0.95)
c90 = most_common_unisex_names(threshold=0.90)
c75 = most_common_unisex_names(threshold=0.75)
c50 = most_common_unisex_names(threshold=0.50)

In [32]:
for a, b, c, d in zip(c95.most_common(25), c90.most_common(25), c75.most_common(25), c50.most_common(25)):
    print(a, b, c, d)

('Sidney', 138) ('Sidney', 138) ('Sidney', 138) ('William', 138)
('William', 138) ('William', 138) ('William', 138) ('Jessie', 138)
('Ollie', 138) ('Ollie', 138) ('Jessie', 138) ('Francis', 138)
('Marion', 138) ('Marion', 138) ('Francis', 138) ('Marion', 138)
('Lee', 138) ('Lee', 138) ('Marion', 138) ('James', 138)
('Johnnie', 138) ('Johnnie', 138) ('James', 138) ('Lee', 138)
('Jesse', 138) ('Tommie', 138) ('Jean', 138) ('Johnnie', 137)
('Tommie', 138) ('Leslie', 138) ('Lee', 138) ('Jean', 133)
('Leslie', 138) ('Joseph', 138) ('Leslie', 138) ('John', 132)
('Joseph', 138) ('Jessie', 138) ('Johnnie', 138) ('Leslie', 131)
('Jessie', 138) ('Francis', 138) ('Ollie', 137) ('Charles', 130)
('Francis', 138) ('James', 138) ('Tommie', 137) ('Tommie', 130)
('James', 138) ('Jean', 138) ('Charles', 137) ('Sidney', 128)
('Jean', 138) ('Charles', 137) ('John', 137) ('Ollie', 126)
('Henry', 137) ('Elizabeth', 137) ('Joseph', 136) ('Joseph', 126)
('Charles', 137) ('John', 137) ('Ira', 135) ('Charlie', 

In [33]:
df.head()

Unnamed: 0,Name,Sex,Number,Year,ann_pctl
0,Mary,F,7065,1880,0.001062
1,Anna,F,2604,1880,0.002123
2,Emma,F,2003,1880,0.003185
3,Elizabeth,F,1939,1880,0.004246
4,Minnie,F,1746,1880,0.005308


In [34]:
f = df[df.Sex == 'F']
m = df[df.Sex == 'M']

In [None]:
df[df.Name == 'Mary']

In [35]:
ff = Counter(f.Name.apply(lambda x: x[0]))
fl = Counter(f.Name.apply(lambda x: x[-1]))
ffl = Counter(f.Name.apply(lambda x: x[0] + x[-1]))

mf = Counter(m.Name.apply(lambda x: x[0]))
ml = Counter(m.Name.apply(lambda x: x[-1]))
mfl = Counter(m.Name.apply(lambda x: x[0] + x[-1]))

In [36]:
list(zip([i[0] for i in ffl.most_common(15)], [i[0] for i in mfl.most_common(15)]))

[('Aa', 'Dn'),
 ('La', 'Jn'),
 ('Sa', 'Kn'),
 ('Ma', 'An'),
 ('Ta', 'Cn'),
 ('Da', 'Tn'),
 ('Ca', 'Bn'),
 ('Ka', 'Sn'),
 ('Me', 'Ln'),
 ('Je', 'Je'),
 ('Ce', 'De'),
 ('Ja', 'Rn'),
 ('Ae', 'En'),
 ('Ra', 'Mn'),
 ('Ea', 'Ce')]

In [37]:
df[(df.Year == 1994) & (df.Sex == 'M')].head()

Unnamed: 0,Name,Sex,Number,Year,ann_pctl
1186573,Michael,M,4447,1994,9.8e-05
1186574,Christopher,M,348,1994,0.000195
1186575,Matthew,M,33646,1994,0.000293
1186576,Joshua,M,31372,1994,0.00039
1186577,Tyler,M,30477,1994,0.000488


In [38]:
#df.drop(columns=['Number', 'ann_pct'], inplace=True)

In [39]:
df.head()

Unnamed: 0,Name,Sex,Number,Year,ann_pctl
0,Mary,F,7065,1880,0.001062
1,Anna,F,2604,1880,0.002123
2,Emma,F,2003,1880,0.003185
3,Elizabeth,F,1939,1880,0.004246
4,Minnie,F,1746,1880,0.005308


## Machine learning exercise

Design and create a machine learning model that classifies a name as either male or female based on the characteristics of the name. Write up a report with screenshots in a text document or Jupyter notebook that describes the following:

•	the input data and its quality

•	any data exploration and any interesting insights

•	any preprocessing or cleaning of the data that was needed, including how you chose to label the data (i.e., how you labeled a name male or female as the ground truth for the model)

•	the features used by your model and why you chose them

•	your selection of machine learning algorithm and why it was chosen

•	the accuracy of your model and any relevant metrics that describe model performance

•	any model parameter tuning used to improve performance

•	any relevant discussion, model interpretation, future steps, or further exploration needed

In addition, prepare a short slideshow presentation (~10 min.) of an overview of the machine learning exercise. You will present this to the analytics team.
There are no restrictions on the classification model that you use; however, be sure to keep in mind your ability to adequately explain and interpret the results.
Good luck!


In [35]:
df.Name.str.len().min(), df.Name.str.len().max()

(2, 15)

In [36]:
df.head()

Unnamed: 0,Name,Sex,Number,Year,ann_pctl
0,Mary,F,7065,1880,0.001062
1,Anna,F,2604,1880,0.002123
2,Emma,F,2003,1880,0.003185
3,Elizabeth,F,1939,1880,0.004246
4,Minnie,F,1746,1880,0.005308


In [37]:
y = df.Sex.apply(lambda x: 0 if x=='F' else 1)
#y = df.loc[:, 'Sex']
X = df.loc[:, ['Name', 'Year']]

In [38]:
def cap_last(name):
    nam = name[:-1]
    e = name[-1:].upper()
    return nam + e

#X['Name'] = X.Name.apply(lambda x: cap_last(x) )

In [39]:
X.head()

Unnamed: 0,Name,Year
0,Mary,1880
1,Anna,1880
2,Emma,1880
3,Elizabeth,1880
4,Minnie,1880


In [40]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Sex, dtype: int64

## Simple Model - Character Frequency

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=33)

In [50]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1,1))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

model_mnb = MultinomialNB().fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)

model_mnb.score(X_test_counts, y_test)

0.6690333122907103

In [65]:
X_train_counts

<1539732x763325 sparse matrix of type '<class 'numpy.int64'>'
	with 46601657 stored elements in Compressed Sparse Row format>

In [53]:
print(cwb_vectorizer.get_feature_names())

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [54]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2,2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

model_mnb = MultinomialNB().fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)

model_mnb.score(X_test_counts, y_test)

0.7864589422055267

In [59]:
print(cwb_vectorizer.get_feature_names()[:100])

[' a', ' b', ' c', ' d', ' e', ' f', ' g', ' h', ' i', ' j', ' k', ' l', ' m', ' n', ' o', ' p', ' q', ' r', ' s', ' t', ' u', ' v', ' w', ' x', ' y', ' z', 'a ', 'aa', 'ab', 'ac', 'ad', 'ae', 'af', 'ag', 'ah', 'ai', 'aj', 'ak', 'al', 'am', 'an', 'ao', 'ap', 'aq', 'ar', 'as', 'at', 'au', 'av', 'aw', 'ax', 'ay', 'az', 'b ', 'ba', 'bb', 'bc', 'bd', 'be', 'bg', 'bh', 'bi', 'bj', 'bl', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bu', 'bw', 'by', 'c ', 'ca', 'cb', 'cc', 'cd', 'ce', 'cg', 'ch', 'ci', 'cj', 'ck', 'cl', 'cm', 'cn', 'co', 'cp', 'cq', 'cr', 'cs', 'ct', 'cu', 'cx', 'cy', 'cz', 'd ', 'da', 'db']


In [60]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

model_mnb = MultinomialNB().fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)

model_mnb.score(X_test_counts, y_test)

0.8870738544110274

In [61]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

model_mnb = MultinomialNB().fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)

model_mnb.score(X_test_counts, y_test)

0.8887624599605646

In [62]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

model_mnb = MultinomialNB(alpha=0.001).fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)

model_mnb.score(X_test_counts, y_test)

0.8927969282966126

In [63]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

model_mnb = MultinomialNB(alpha=0.001).fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)

model_mnb.score(X_test_counts, y_test)

0.8931840086456604

<1539732x763325 sparse matrix of type '<class 'numpy.int64'>'
	with 46601657 stored elements in Compressed Sparse Row format>

In [51]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1,1))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.6323931697204449

In [195]:
X_train_counts

<1539732x27 sparse matrix of type '<class 'numpy.int64'>'
	with 9767778 stored elements in Compressed Sparse Row format>

In [186]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1,2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.7537207773820379

In [184]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1,3))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8053063779930534

In [197]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(1,4))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)
#0.8391668160433114

0.8663975289206174

In [215]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,3))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8400007273993136

In [199]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,4))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)
#0.843866335180408

0.8671223303795725

In [219]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,5))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8816027724305269

In [220]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,6))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8885884036962276

In [227]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,7))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8915525558993384

In [234]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)
#0.8925449363915279

0.8925449363915279

In [235]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB(alpha=0.01).fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)
#0.8925449363915279  alpha = 1.0
#0.8963300106771828  alpha = 0.01

0.8963300106771828

In [236]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB(alpha=0.001).fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)
#0.8925449363915279  alpha = 1.0
#0.896091008045556   alpha = 0.1
#0.8963300106771828  alpha = 0.01
#0.896348195660024   alpha = 0.001
#0.8962494771817433   alpha = 0.00001

0.896348195660024

In [257]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,14))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

#tfidf_transformer = TfidfTransformer()
#X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB(alpha=0.001).fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
#X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_counts, y_test)
#0.8925449363915279  alpha = 1.0
#0.896091008045556   alpha = 0.1
#0.8963300106771828  alpha = 0.01
#0.896348195660024   alpha = 0.001
#0.8965768068728843  ngram_range=(2,14)

0.89362304608854

In [274]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB(alpha=0.001).fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)
#0.8925449363915279  alpha = 1.0
#0.896091008045556   alpha = 0.1
#0.8963300106771828  alpha = 0.01
#0.896348195660024   alpha = 0.001
#0.8965768068728843  ngram_range=(2,14)
#0.893591871832241   No tfidf
###0.8973197933146807 ngram_range=(2,10)
#0.8965794047275759 ngram_range=(2,12), smooth_idf=False
#0.8965222519243609 ngram_range=(2,10), smooth_idf=False

0.8963559892240988

In [277]:
model_mnb.class_count_

array([910303., 629429.])

In [278]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB(alpha=0.001).fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)
#0.8925449363915279  alpha = 1.0
#0.896091008045556   alpha = 0.1
#0.8963300106771828  alpha = 0.01
#0.896348195660024   alpha = 0.001
#0.896296238566192   alpha = 0.0001
#0.8962494771817433  alpha = 0.00001
#0.8964936755227533  ngram_range=(2,9)

0.8965170562149777

In [237]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB(alpha=0.1).fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)
#0.8925449363915279  alpha = 1.0
#0.8963300106771828  alpha = 0.01

0.896091008045556

In [233]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,9))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8928696682279773

In [230]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,10))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8930151480907067

In [232]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,15))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8930567137657722

In [None]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

In [279]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

model_logr = LogisticRegressionCV(cv=5, random_state=11, max_iter=200, n_jobs=-1)
model_logr.fit(X_train_counts, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)

model_logr.score(X_test_counts, y_test)
# 0.89368799245583 << cv=3
# 0.8934411962601284 << cv=5
# 0.9005385352775678 << counts, ngram_range=(2,8)



0.9005385352775678

In [212]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(1,1))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_logr2 = LogisticRegressionCV(cv=5, random_state=11, max_iter=200, n_jobs=-1)
model_logr2.fit(X_train_tfidf, y_train)
X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_logr2.score(X_test_tfidf, y_test)

0.7502084778390006

In [214]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(1,2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_logr2 = LogisticRegressionCV(cv=5, random_state=11, max_iter=200, n_jobs=-1)
model_logr2.fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_logr2.score(X_test_tfidf, y_test)

0.8241330309430472

In [None]:
# parameters = {
#     'vect__max_df': (0.5, 0.75, 1.0),
#     # 'vect__max_features': (None, 5000, 10000, 50000),
#     'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
#     # 'tfidf__use_idf': (True, False),
#     # 'tfidf__norm': ('l1', 'l2'),
#     'clf__max_iter': (5,),
#     'clf__alpha': (0.00001, 0.000001),
#     'clf__penalty': ('l2', 'elasticnet'),
#     # 'clf__max_iter': (10, 50, 80),
# }

In [54]:
%%time
mnb_pipe = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))),
    ('clf', MultinomialNB(alpha=0.001)),
])

mnb_pipe.fit(X_train.Name, y_train)

CPU times: user 37.4 s, sys: 1.41 s, total: 38.8 s
Wall time: 37.2 s


In [55]:
mnb_pipe.score(X_test.Name, y_test)

0.8927969282966126

In [None]:
%%time
mnb_pipe2 = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.001)),
])

mnb_pipe2.fit(X_train.Name, y_train)

In [None]:
mnb_pipe2.score(X_test.Name, y_test)

In [56]:
%%time
cnb_pipe = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))),
    ('clf', ComplementNB(alpha=0.001)),
])

cnb_pipe.fit(X_train.Name, y_train)

CPU times: user 38.7 s, sys: 1.45 s, total: 40.2 s
Wall time: 38 s


In [57]:
cnb_pipe.score(X_test.Name, y_test)

0.8926384591604253

In [58]:
%%time
sgd_pipe = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))),
    ('clf', SGDClassifier(loss='log', penalty='l2',
                          alpha=0.001, random_state=11,
                          max_iter=100, tol=1e-3)),
])

sgd_pipe.fit(X_train.Name, y_train)

CPU times: user 46.2 s, sys: 1.52 s, total: 47.7 s
Wall time: 45.1 s


In [59]:
sgd_pipe.score(X_test.Name, y_test)

0.8472513398435572

In [60]:
%%time
logrcv_pipe = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))),
    ('logrCV', LogisticRegressionCV(cv=5, random_state=11, max_iter=1000, n_jobs=-1)),
])

logrcv_pipe.fit(X_train.Name, y_train)

CPU times: user 17min 49s, sys: 3.12 s, total: 17min 52s
Wall time: 52min 20s


In [61]:
logrcv_pipe.score(X_test.Name, y_test)

0.9006684280121475

In [None]:
%%time
logrcv_pipe2 = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2,8))),
    ('logrCV', LogisticRegressionCV(cv=5, solver='sag', random_state=11, max_iter=1000, n_jobs=-1)),
])

logrcv_pipe2.fit(X_train.Name, y_train)

In [None]:
logrcv_pipe2.score(X_test.Name, y_test)

In [226]:
import numpy as np
from sklearn.model_selection import KFold

#X = ["a", "b", "c", "d"]
kf = KFold(2)
for train, test in kf.split(X):
    print("%s %s" % (train, test))


[ 962333  962334  962335 ... 1924662 1924663 1924664] [     0      1      2 ... 962330 962331 962332]
[     0      1      2 ... 962330 962331 962332] [ 962333  962334  962335 ... 1924662 1924663 1924664]


In [211]:
X_test_counts

<384933x83398 sparse matrix of type '<class 'numpy.int64'>'
	with 7124730 stored elements in Compressed Sparse Row format>

In [188]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(3,4))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.848550267189355

In [175]:
model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

{'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': False, 'use_idf': True}

In [164]:
X_train_counts

<1539732x27 sparse matrix of type '<class 'numpy.int64'>'
	with 9767778 stored elements in Compressed Sparse Row format>

In [149]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 1))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

predicted = model_mnb.predict(X_test_tfidf)

In [150]:
model_mnb.score(X_test_tfidf, y_test)

0.6323931697204449

In [132]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.7593269478065013

In [151]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(1, 2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8024929013620552

In [152]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', lowercase=False, ngram_range=(2, 2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.8040516141770127

In [153]:
cwb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.7668970963778112

In [154]:
cwb_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 1))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.6359184585369402

In [155]:
cwb_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.704418691044935

In [156]:
cwb_vectorizer = CountVectorizer(analyzer='char', lowercase=False, ngram_range=(1, 2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.7977596101139679

In [136]:
cwb_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.7042212540883738

In [157]:
cwb_vectorizer = CountVectorizer(analyzer='char', lowercase=False, ngram_range=(2, 2))
X_train_counts = cwb_vectorizer.fit_transform(X_train.Name)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

model_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_counts = cwb_vectorizer.transform(X_test.Name)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

model_mnb.score(X_test_tfidf, y_test)

0.7984922051369978

In [65]:
# c_vectorizer = CountVectorizer(analyzer='char', lowercase=False, ngram_range=(2, 2))
# X_vect_c = c_vectorizer.fit_transform(X.Name)

0.6323931697204449