### Import Required Modules

In [1]:
import pickle
import re

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# List the GPUs on this machine with their current memory use and utilisation,
# so a device can be chosen before TensorFlow is imported.
!nvidia-smi

Fri Mar 18 07:58:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.86       Driver Version: 470.86       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   43C    P0    92W / 250W |  16048MiB / 16280MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:D8:00.0 Off |                    0 |
| N/A   52C    P0   100W / 250W |  10994MiB / 16280MiB |    100%      Default |
|       

In [4]:
# Restrict this process to the GPU with index 1. The environment variable is
# set BEFORE tensorflow is imported so it is already in place when TensorFlow
# initialises its devices -- keep this statement order.
# NOTE(review): `tf` is not referenced by any other cell visible in this
# notebook; confirm whether the TensorFlow import is still needed.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"    
import tensorflow as tf

In [5]:
# Project folder on Google Drive (Colab layout). In the visible cells it is
# only referenced by the commented-out `%cd $PATH_TO_FOLDER`, so it has no
# effect when the notebook is run locally.
PATH_TO_FOLDER = "drive/MyDrive/Name2Demographics/"

In [6]:
# %cd $PATH_TO_FOLDER
# Move three levels up from the notebook's directory; per the recorded output
# this lands in the Name2Demographics project root, which the relative paths
# used later in the notebook depend on.
# NOTE: the path is relative, so re-running this cell moves up another three
# levels. Run it exactly once per kernel session.
%cd ../../../

/home/souvic/mounted/btp/vahini/Name2Demographics


In [7]:
# Make the project's helper modules importable. Both paths are relative, so
# this cell relies on the working directory set by the %cd cell above.
import sys  
sys.path.insert(0, 'Models/PreProcessing/')

# NOTE(review): wildcard import -- helpers used later (e.g. splitTrainTestVal)
# presumably come from here; TODO replace with explicit imports once confirmed.
from utils import *

# Also inserted at position 0, so this directory is searched before the one
# added above for any subsequent import.
sys.path.insert(0, 'Models/AIEEEDataState/Preprocessing/')
# NOTE(review): AIEEECasteStateFinalData presumably comes from this module -- confirm.
from aieee_caste import *

### Train Model on Data

In [8]:
# Load the name/caste dataset through the project helper (star-imported above).
# The SettingWithCopyWarning recorded below quotes a line
# (df['category'] = df['category'].replace('', np.NaN)) that is not in this
# notebook, so it appears to originate inside the helper.
# Presumably returns a DataFrame with 'Name' and 'Caste' columns, as the
# caste_df.head() output suggests -- confirm against the helper.
caste_df = AIEEECasteStateFinalData()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = df['category'].replace('', np.NaN)


In [9]:
# Preview the first rows to check the loaded columns and the label encoding.
caste_df.head()

Unnamed: 0,Name,Caste
0,gauri shankar pandey 35,0.0
1,nelluri teja bharath 14,0.0
2,jerin t jerome 04,1.0
3,bhuri gurjar 16,1.0
4,abhishek deopa 27,0.0


In [10]:
# Split the dataset with the project helper.
# Presumably 0.7 is the train fraction, 0 the validation fraction (the printed
# shapes show an empty `val`) and 'Name' the column the split is keyed on --
# TODO confirm against the definition of splitTrainTestVal.
# The printed shapes show four columns in train/test: the helper apparently
# adds first_name/last_name columns (visible in train.head()).
train, val, test = splitTrainTestVal(caste_df, 0.7 ,0, 'Name')

(train:(187544, 4), val:(0, 0), test:(83509, 4))


In [11]:
# Sanity check: report the share of rows that ended up in the training split,
# to compare with the 0.7 passed to splitTrainTestVal.
total_points = len(caste_df)
train_points = len(train)
test_points = len(test)
print(train_points / total_points)

0.6919089624538375


In [12]:
# scikit-learn 1.0 renamed SGDClassifier's "squared_loss" to "squared_error"
# and 1.2 removed the old spelling. Resolve the name accepted by the installed
# version so this cell runs on old and new releases with identical behaviour.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
_needs_legacy_loss_name = (
    _sklearn_version is not None
    and tuple(map(int, _sklearn_version.groups())) < (1, 0)
)
SQUARED_LOSS_NAME = "squared_loss" if _needs_legacy_loss_name else "squared_error"

# Word n-gram counts -> TF-IDF weighting -> linear model trained with SGD.
sgd_pipe = Pipeline([
    # Word-level 1- to 6-grams over lower-cased names.
    # NOTE(review): stop_words="english" drops English stop words from the
    # names -- confirm this is intended for personal-name input.
    ('vect', CountVectorizer(ngram_range=(1, 6), lowercase=True, stop_words="english")),
    # Sub-linear term-frequency scaling on top of the raw counts.
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('clf', SGDClassifier(
        class_weight="balanced",
        loss=SQUARED_LOSS_NAME,
        max_iter=10000,
        random_state=42,  # fixed seed for reproducible training
        # Only consulted when early_stopping=True, which is left at its
        # default here, so this value currently has no effect.
        validation_fraction=0.3,
        average=1000,
        learning_rate='adaptive',
        eta0=0.003,
    )),
])

In [13]:
# Fit vectorizer, TF-IDF weights and classifier in one go on the raw name
# strings. `.astype('str')` coerces every entry to text so that non-string
# values do not break the vectorizer.
sgd_pipe.fit(train['Name'].values.astype('str'), train['Caste'])

Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 6), stop_words='english')),
                ('tfidf', TfidfTransformer(sublinear_tf=True)),
                ('clf',
                 SGDClassifier(average=1000, class_weight='balanced',
                               eta0=0.003, learning_rate='adaptive',
                               loss='squared_loss', max_iter=10000,
                               random_state=42, validation_fraction=0.3))])

In [14]:
# Score the held-out split and print per-class metrics to four decimals.
# classification_report applies target_names to the labels in sorted order,
# so 'Reserved' is printed for Caste == 0.0 and 'General' for Caste == 1.0.
# NOTE(review): confirm this order matches the encoding produced by
# AIEEECasteStateFinalData (compare with the sample rows in caste_df.head()).
test_names = test['Name'].values.astype('str')
y_pred = sgd_pipe.predict(test_names)
report = classification_report(
    test['Caste'],
    y_pred,
    target_names=['Reserved', 'General'],
    digits=4,
)
print(report)

              precision    recall  f1-score   support

    Reserved     0.8073    0.6279    0.7064     48599
     General     0.6044    0.7914    0.6853     34910

    accuracy                         0.6962     83509
   macro avg     0.7058    0.7096    0.6958     83509
weighted avg     0.7225    0.6962    0.6976     83509



In [17]:
# Inspect the training split; the output shows the first_name/last_name
# columns that are present after the split.
# NOTE(review): this cell's execution count (17) is higher than that of the
# cells below it, i.e. it was run out of order -- re-run the notebook top to
# bottom before sharing.
train.head()

Unnamed: 0,Name,Caste,first_name,last_name
1,nelluri teja bharath 14,0.0,nelluri,teja bharath 14
2,jerin t jerome 04,1.0,jerin,t jerome 04
3,bhuri gurjar 16,1.0,bhuri,gurjar 16
4,abhishek deopa 27,0.0,abhishek,deopa 27
6,shamseer m 04,1.0,shamseer,m 04


### Save Model

In [15]:
# Confirm the working directory before saving: the model path used in the
# next cell is relative and is resolved against it.
%pwd

'/home/souvic/mounted/btp/vahini/Name2Demographics'

In [16]:
# Persist the fitted pipeline (vectorizer, TF-IDF transformer and classifier)
# as a single pickle. The path is relative and is resolved against the current
# working directory (see the %cd / %pwd cells).
filename = "Models/AIEEEDataState/LR/SavedModel/AIEEEModel.pkl"
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open() left the handle open for the kernel's lifetime.
with open(filename, 'wb') as model_file:
    pickle.dump(sgd_pipe, model_file)

### Load Model

In [None]:
# filename = "SavedModels/FinalSet/SGDClassifier/CBSEModel.pkl"
# pipe = pickle.load(open(filename, 'rb'))

In [None]:
# df, first_name = split_name_df(cbse_df, 'Name')

In [None]:
# y_pred=pipe.predict(df['first_name'].values.astype('str'))
# print(classification_report(test['Gender'], y_pred, target_names = ['Male', 'Female']))

In [None]:
# fn = pd.DataFrame()

In [None]:
# fn['first_name'] = df['first_name']

In [None]:
# df['predict'] = y_pred

In [None]:
# fn['gender'] = y_pred
# fn['true_gender'] = df['Gender']

In [None]:
# print(fn.groupby(['first_name','gender']).size())

In [None]:
# f1 = fn.drop_duplicates()
# f1 = f1.reset_index(drop=True)
# f1

Unnamed: 0,first_name,gender,true_gender
0,sarda,Girl,Girl
1,rakhi,Girl,Girl
2,surekha,Girl,Girl
3,shivani,Girl,Girl
4,ramdulari,Girl,Girl
...,...,...,...
25729,mannulal,Girl,Boy
25730,anada,Girl,Boy
25731,priyadarshan,Girl,Boy
25732,bindhyeshwari,Girl,Boy
