### Import Required Modules

In [None]:
# ../CBSEData/cbse_gender_names_list.csv

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle

In [2]:
import sys  
sys.path.insert(0, '../PreProcessing/')

from utils import *
from parse_df import *
from split_name import *

### Train Model on Data

In [3]:
cbse_df = getCBSEData(False)
cbse_df = cleanDf(cbse_df, 'Name')

Gender
Boy     114088
Girl     91456
Name: Name, dtype: int64


In [13]:
train, val, test = splitTrainTestVal(cbse_df, 0.7 ,0, 'Name')

(train:(143977, 5), val:(0, 0), test:(61568, 5))


In [14]:
total_points = cbse_df.shape[0]
train_points = train.shape[0]
test_points = test.shape[0]
print(train_points/total_points)

0.70046461845338


In [15]:
sgd_pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 6), lowercase = True, stop_words = "english")),
    ('tfidf', TfidfTransformer(sublinear_tf = True)),
    ('clf', SGDClassifier(class_weight="balanced",loss="squared_loss",max_iter=10000, random_state=None, validation_fraction = 0.3, average=1000))
])

In [16]:
sgd_pipe.fit(train['Name'].values.astype('str'), train['Gender'].values.astype('str'))

Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 6), stop_words='english')),
                ('tfidf', TfidfTransformer(sublinear_tf=True)),
                ('clf',
                 SGDClassifier(average=1000, class_weight='balanced',
                               loss='squared_loss', max_iter=10000,
                               validation_fraction=0.3))])

In [17]:
y_pred=sgd_pipe.predict(test['Name'].values.astype('str'))
print(classification_report(test['Gender'], y_pred, target_names = ['Male', 'Female']))

              precision    recall  f1-score   support

        Male       0.71      0.76      0.73     33720
      Female       0.68      0.62      0.65     27848

    accuracy                           0.69     61568
   macro avg       0.69      0.69      0.69     61568
weighted avg       0.69      0.69      0.69     61568



### Save Model

In [18]:
filename = "SavedModels/FinalSet/SGDClassifier/CBSEModel.pkl"
pickle.dump(sgd_pipe, open(filename, 'wb'))

### Load Model

In [4]:
filename = "SavedModels/FinalSet/SGDClassifier/CBSEModel.pkl"
pipe = pickle.load(open(filename, 'rb'))

In [5]:
df, first_name = split_name_df(cbse_df, 'Name')

In [6]:
y_pred=pipe.predict(df['first_name'].values.astype('str'))
# print(classification_report(test['Gender'], y_pred, target_names = ['Male', 'Female']))

In [7]:
fn = pd.DataFrame()

In [8]:
fn['first_name'] = df['first_name']

In [9]:
df['predict'] = y_pred

In [10]:
fn['gender'] = y_pred
fn['true_gender'] = df['Gender']

In [28]:
print(fn.groupby(['first_name','gender']).size())

first_name    gender
(late)        Boy         1
(late)robert  Boy         1
`rameshwar    Boy         1
a             Girl      293
aabda         Girl        1
                       ... 
zulfiya       Girl        1
zulikha       Girl        1
zumardan      Girl        1
zunjhar       Girl        1
zuveda        Girl        1
Length: 23597, dtype: int64


In [52]:
f1 = fn.drop_duplicates()
f1 = f1.reset_index(drop=True)
f1

Unnamed: 0,first_name,gender,true_gender
0,sarda,Girl,Girl
1,rakhi,Girl,Girl
2,surekha,Girl,Girl
3,shivani,Girl,Girl
4,ramdulari,Girl,Girl
...,...,...,...
25729,mannulal,Girl,Boy
25730,anada,Girl,Boy
25731,priyadarshan,Girl,Boy
25732,bindhyeshwari,Girl,Boy
