### Import Required Modules

In [None]:
# ../CBSEData/cbse_gender_names_list.csv

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle

In [4]:
import sys  
sys.path.insert(0, '../PreProcessing/')

from utils import *
from parse_df import *
from split_name import *

### Train Model on Data

In [39]:
# Load the CBSE name/gender frame and clean its 'Name' column in a single
# expression, so `cbse_df` is only ever bound to the cleaned data.
cbse_df = cleanDf(getCBSEData(False), 'Name')

Gender
Boy     114088
Girl     91456
Name: Name, dtype: int64


In [None]:
# Partition the cleaned CBSE frame into train / validation / test subsets.
# NOTE(review): presumably 0.8 is the training share and 0 the validation
# share -- confirm against splitTrainTestVal in ../PreProcessing.
train, val, test = splitTrainTestVal(
    cbse_df,
    0.8,
    0,
    'Name',
)

In [None]:
# Name -> gender pipeline: n-gram counts, TF-IDF re-weighting, then a linear
# model fitted with stochastic gradient descent.
sgd_pipe = Pipeline([
    # CountVectorizer's default analyzer is word-level, so these are word
    # n-grams of length 1..6 with English stop words removed.
    # NOTE(review): for personal names, character n-grams may have been the
    # intent -- confirm before changing, since it alters the model.
    ('vect', CountVectorizer(ngram_range=(1, 6), lowercase=True, stop_words="english")),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('clf', SGDClassifier(
        class_weight="balanced",
        # NOTE(review): scikit-learn 1.0 renamed this loss to "squared_error"
        # and later releases dropped the old spelling; update it when the
        # environment is upgraded.
        loss="squared_loss",
        max_iter=10000,
        # Fixed seed so that re-running the notebook reproduces the same
        # fitted model (the original random_state=None reshuffled every run).
        random_state=42,
        # Only consulted when early_stopping=True, which is not enabled here.
        validation_fraction=0.3,
        average=1000,
    )),
])

In [None]:
# Names and labels are both cast to plain strings before fitting.
train_names = train['Name'].values.astype('str')
train_labels = train['Gender'].values.astype('str')
sgd_pipe.fit(train_names, train_labels)

In [None]:
# Score the fitted pipeline on the held-out test split.
# The labels are cast to str exactly as they were for fitting, and the class
# order is pinned explicitly so that 'Male'/'Female' are attached to
# 'Boy'/'Girl' by construction rather than by alphabetical coincidence.
y_true = test['Gender'].values.astype('str')
y_pred = sgd_pipe.predict(test['Name'].values.astype('str'))
print(classification_report(
    y_true,
    y_pred,
    labels=['Boy', 'Girl'],
    target_names=['Male', 'Female'],
))

### Save Model

In [None]:
filename = "SavedModels/SGDClassifier/CBSEModel.pkl"
# Write through a context manager so the handle is flushed and closed even if
# pickling fails part-way (the bare open() previously leaked the handle).
with open(filename, 'wb') as model_file:
    pickle.dump(sgd_pipe, model_file)

### Testing the above Model on ElectoralRolls Dataset

In [38]:
filename = "SavedModels/SGDClassifier/CBSEModel.pkl"
# NOTE: unpickling can execute arbitrary code -- only load model files that
# were produced by this project.
# The context manager closes the handle once the pipeline has been restored.
with open(filename, 'rb') as model_file:
    pipe = pickle.load(model_file)

In [41]:
# Sanity check of the reloaded pipeline on the complete CBSE frame.
# NOTE(review): cbse_df includes the rows the model was trained on, so these
# scores are in-sample and optimistic; they are not a held-out estimate.
y_pred = pipe.predict(cbse_df['Name'].values.astype('str'))
print(classification_report(
    cbse_df['Gender'].values.astype('str'),
    y_pred,
    # Pin the class order so the display names cannot be attached to the
    # wrong class if the set of observed labels ever changes.
    labels=['Boy', 'Girl'],
    target_names=['Male', 'Female'],
))

              precision    recall  f1-score   support

        Male       0.91      0.94      0.92    114089
      Female       0.92      0.88      0.90     91456

    accuracy                           0.91    205545
   macro avg       0.91      0.91      0.91    205545
weighted avg       0.91      0.91      0.91    205545



In [34]:
def testData(df, model):
    """Print dataset statistics and a classification report for ``model`` on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed through ``preprocessData``; the result must expose
        ``Name`` and ``Gender`` columns, with gender spelled
        ``'FEMALE'`` / ``'MALE'``.
    model : object
        Fitted estimator with a ``predict`` method that labels names as
        ``'Boy'`` / ``'Girl'``.

    Returns
    -------
    None
        All results are printed.
    """
    df = preprocessData(df)
    print("-------------------STATS OF DATA-----------------------")
    print("SHAPE: ", df.shape)
    print(df.groupby('Gender')['Name'].count())
    print("-------------------------------------------------------")
    # Translate the gender spelling into the training vocabulary on a NEW
    # frame: assigning into `df` directly could write through to the caller's
    # data (or a slice of it), depending on what preprocessData returns.
    df = df.assign(Gender=df['Gender'].replace({'FEMALE':'Girl', 'MALE':'Boy'}))
    print("------------------RESULTS-------------------------------")
    y_pred = model.predict(df['Name'].values.astype('str'))
    print(classification_report(
        df['Gender'].values.astype('str'),
        y_pred,
        # Explicit class order: ties 'Male'/'Female' to 'Boy'/'Girl' and keeps
        # the report from raising if an unexpected label appears in the data.
        labels=['Boy', 'Girl'],
        target_names=['Male', 'Female'],
    ))
    print("--------------------------------------------------------")

In [35]:
def stateResults(path, pipe):
    """Read one state's CSV from ``path`` and print its evaluation via ``testData``.

    Parameters
    ----------
    path : str
        Location of the CSV file to evaluate.
    pipe : object
        Fitted pipeline forwarded unchanged to ``testData``.
    """
    # Force the name column to be parsed as text. Downstream code reads the
    # column as 'Name', while the original override was keyed on 'name', so
    # both spellings are listed.
    # NOTE(review): confirm the actual header spelling in the CSVs.
    df = pd.read_csv(path, dtype={'name': str, 'Name': str})
    testData(df, pipe)

In [37]:
# Score the CBSE-trained pipeline on the Daman CSV.
# DAMAN_CSV is not defined in this notebook; presumably it comes from the star imports -- verify.
stateResults(DAMAN_CSV, pipe)

-------------------STATS OF DATA-----------------------
SHAPE:  (164362, 2)
Gender
FEMALE     54188
MALE      110174
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.79      0.74      0.76    110174
      Female       0.53      0.61      0.57     54188

    accuracy                           0.69    164362
   macro avg       0.66      0.67      0.67    164362
weighted avg       0.71      0.69      0.70    164362

--------------------------------------------------------


In [11]:
# Score the CBSE-trained pipeline on the Manipur CSV.
# MANIPUR_CSV is not defined in this notebook; presumably it comes from the star imports -- verify.
stateResults(MANIPUR_CSV, pipe)

-------------------STATS OF DATA-----------------------
SHAPE:  (1689592, 4)
Gender
FEMALE     617321
MALE      1072271
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.91      0.35      0.50   1072271
      Female       0.45      0.94      0.61    617321

    accuracy                           0.56   1689592
   macro avg       0.68      0.64      0.56   1689592
weighted avg       0.75      0.56      0.54   1689592

--------------------------------------------------------


In [24]:
# Score the CBSE-trained pipeline on the Meghalaya CSV.
# MEGHALAYA_CSV is not defined in this notebook; presumably it comes from the star imports -- verify.
stateResults(MEGHALAYA_CSV, pipe)

  stateResults(MEGHALAYA_CSV, pipe)


-------------------STATS OF DATA-----------------------
SHAPE:  (2113858, 4)
Gender
FEMALE     844463
MALE      1269395
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.83      0.18      0.29   1269395
      Female       0.43      0.95      0.60    844463

    accuracy                           0.49   2113858
   macro avg       0.63      0.56      0.44   2113858
weighted avg       0.67      0.49      0.41   2113858

--------------------------------------------------------


In [13]:
# Score the CBSE-trained pipeline on the Nagaland CSV.
# NAGALAND_CSV is not defined in this notebook; presumably it comes from the star imports -- verify.
stateResults(NAGALAND_CSV, pipe)

-------------------STATS OF DATA-----------------------
SHAPE:  (816488, 4)
Gender
FEMALE    265382
MALE      551106
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.92      0.18      0.30    551106
      Female       0.36      0.97      0.53    265382

    accuracy                           0.43    816488
   macro avg       0.64      0.57      0.41    816488
weighted avg       0.74      0.43      0.37    816488

--------------------------------------------------------


In [14]:
# Score the CBSE-trained pipeline on the Arunachal CSV.
# ARUNACHAL_CSV is not defined in this notebook; presumably it comes from the star imports -- verify.
stateResults(ARUNACHAL_CSV, pipe)

-------------------STATS OF DATA-----------------------
SHAPE:  (815744, 4)
Gender
FEMALE    300756
MALE      514987
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.84      0.24      0.37    514988
      Female       0.41      0.92      0.57    300756

    accuracy                           0.49    815744
   macro avg       0.63      0.58      0.47    815744
weighted avg       0.68      0.49      0.45    815744

--------------------------------------------------------


In [15]:
# Score the CBSE-trained pipeline on the Sikkim CSV.
# SIKKIM_CSV is not defined in this notebook; presumably it comes from the star imports -- verify.
stateResults(SIKKIM_CSV, pipe)

-------------------STATS OF DATA-----------------------
SHAPE:  (239927, 4)
Gender
FEMALE     79105
MALE      160822
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.88      0.62      0.73    160822
      Female       0.51      0.82      0.63     79105

    accuracy                           0.69    239927
   macro avg       0.70      0.72      0.68    239927
weighted avg       0.76      0.69      0.69    239927

--------------------------------------------------------


In [17]:
# Assemble the Goa frame from every entry listed under GOA, then evaluate the
# pipeline on it directly (no intermediate CSV needed for this step).
goa_files = listdir(GOA)
goa_df = getStateData(goa_files, GOA)
testData(goa_df, pipe)

-------------------STATS OF DATA-----------------------
SHAPE:  (517013, 4)
Gender
FEMALE    189232
MALE      327781
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.83      0.54      0.65    327781
      Female       0.50      0.81      0.62    189232

    accuracy                           0.64    517013
   macro avg       0.67      0.67      0.64    517013
weighted avg       0.71      0.64      0.64    517013

--------------------------------------------------------


In [18]:
# Persist the assembled Goa frame to GOA_CSV.
# NOTE(review): to_csv writes the row index as an extra leading column by
# default; confirm that is wanted (pass index=False otherwise).
goa_df.to_csv(GOA_CSV)

In [20]:
# Score the CBSE-trained pipeline on the Mizoram CSV.
# MIZORAM_CSV is not defined in this notebook; presumably it comes from the star imports -- verify.
stateResults(MIZORAM_CSV, pipe)

-------------------STATS OF DATA-----------------------
SHAPE:  (429776, 4)
Gender
FEMALE    139569
MALE      290207
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.89      0.15      0.26    290207
      Female       0.35      0.96      0.52    139569

    accuracy                           0.42    429776
   macro avg       0.62      0.56      0.39    429776
weighted avg       0.72      0.42      0.35    429776

--------------------------------------------------------


In [21]:
# Score the CBSE-trained pipeline on the Delhi CSV.
# DELHI_CSV is not defined in this notebook; presumably it comes from the star imports -- verify.
stateResults(DELHI_CSV, pipe)

-------------------STATS OF DATA-----------------------
SHAPE:  (3801952, 4)
Gender
FEMALE    1041925
MALE      2760027
Name: Name, dtype: int64
-------------------------------------------------------
------------------RESULTS-------------------------------
              precision    recall  f1-score   support

        Male       0.91      0.79      0.84   2760027
      Female       0.58      0.78      0.67   1041925

    accuracy                           0.79   3801952
   macro avg       0.74      0.79      0.76   3801952
weighted avg       0.82      0.79      0.80   3801952

--------------------------------------------------------
