## Gender Classification Of Names
### Using Machine Learning To Detect/Predict Gender of Individuals 
+ Sklearn
+ Pandas
+ Text Extraction

In [1]:
# packages
import pandas as pd
import numpy as np

In [2]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer


In [91]:
# Load our data
df = pd.read_csv('data/names_dataset.csv')

In [97]:
df.tail()

Unnamed: 0,index,name,sex
95020,1858664,Zecharya,M
95021,1858676,Ziheng,M
95022,1858679,Ziyu,M
95023,1858686,Zykir,M
95024,1858688,Zyus,M


In [94]:
df.size

285075

In [6]:
# Data Cleaning
# Checking for column name consistency
df.columns

Index(['index', 'name', 'sex'], dtype='object')

In [7]:
# Data Types
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [8]:
# Checking for Missing Values
df.isnull().isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [9]:
# Number of Female Names
df[df.sex == 'F'].size

181800

In [10]:
# Number of Male Names
df[df.sex == 'M'].size

103275

In [11]:
df_names = df

In [12]:
# Replacing All F and M with 0 and 1 respectively
df_names.sex.replace({'F':0,'M':1},inplace=True)

In [13]:
df_names.sex.unique()

array([0, 1], dtype=int64)

In [14]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [15]:
Xfeatures =df_names['name']

In [16]:
# Feature Extraction 
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [17]:
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Features 
X
# Labels
y = df_names.sex

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)


0.6398163206734908

In [22]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 63.98163206734908 %


In [23]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_train,y_train)*100,"%")

Accuracy of Model 100.0 %


### Sample Prediction

In [42]:
sample = pd.read_excel("FinalProjectDataSample.xlsx")

In [41]:
sample

Unnamed: 0,ID,Name,Class,FirstVisit,Visit,LengthOfStay,FileViews,FileViewRate,VideosViews,VideoViewRate,HomeworkSubmit,HomeworkSubmitRate,OnlineQuizCompletions,OnlineQuizCompletionRate,Forum(number of articles / replies),ForumParticipationRate,RollCall (attendance / absence / leave)
0,A01,LST,A,2018.09.14 19:06,376,18:16:50,48,0.875,36,0.4167,11,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0
1,A02,LTY,A,2018.09.15 14:18,366,15:46:09,21,0.5833,32,0.2083,10,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0
2,A03,LCT,A,2018.09.15 14:50,67,03:21:40,4,0.3333,1,0.0417,10,1.0,0,0,1 ( 1 / 0 ),0.3333,1 / 3 / 0
3,A04,CCX,A,2018.09.15 14:50,63,03:39:04,2,0.0833,2,0.0417,10,1.0,0,0,2 ( 1 / 1 ),0.3333,1 / 3 / 0
4,A05,TGC,A,2018.09.15 14:22,111,04:43:06,11,0.375,7,0.0833,10,1.0,0,0,1 ( 1 / 0 ),0.3333,1 / 3 / 0
5,A06,LSR,A,2018.09.14 18:57,588,1900-01-01 00:42:20,42,1.0,62,0.7083,13,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0
6,A07,LGR,A,2018.09.15 14:16,352,13:00:19,32,0.625,28,0.2083,10,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0
7,A08,LYN,A,2018.09.15 14:19,482,22:16:56,34,0.9167,44,0.4583,10,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0
8,A09,CCW,A,2018.09.15 14:22,274,11:35:55,12,0.375,29,0.375,10,1.0,1,1,2 ( 1 / 1 ),0.3333,3 / 1 / 0
9,A10,CLCC,A,2018.09.15 14:23,170,09:43:45,11,0.375,16,0.2083,10,1.0,1,1,2 ( 1 / 1 ),0.3333,4 / 0 / 0


In [43]:
vect = cv.transform(sample["Name"]).toarray()

In [61]:
# Female is 0, Male is 1
a=clf.predict(vect)
a

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [84]:
ddf = pd.DataFrame({"Gender":a})


In [85]:
ddf.replace({0:'Female',1:'Male'},inplace=True)

In [86]:
ddf

Unnamed: 0,Gender
0,Female
1,Female
2,Female
3,Female
4,Female
5,Female
6,Female
7,Male
8,Female
9,Female


In [87]:
df_new = pd.concat([sample, ddf], axis=1)

In [88]:
df_new

Unnamed: 0,ID,Name,Class,FirstVisit,Visit,LengthOfStay,FileViews,FileViewRate,VideosViews,VideoViewRate,HomeworkSubmit,HomeworkSubmitRate,OnlineQuizCompletions,OnlineQuizCompletionRate,Forum(number of articles / replies),ForumParticipationRate,RollCall (attendance / absence / leave),Gender
0,A01,LST,A,2018.09.14 19:06,376,18:16:50,48,0.875,36,0.4167,11,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0,Female
1,A02,LTY,A,2018.09.15 14:18,366,15:46:09,21,0.5833,32,0.2083,10,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0,Female
2,A03,LCT,A,2018.09.15 14:50,67,03:21:40,4,0.3333,1,0.0417,10,1.0,0,0,1 ( 1 / 0 ),0.3333,1 / 3 / 0,Female
3,A04,CCX,A,2018.09.15 14:50,63,03:39:04,2,0.0833,2,0.0417,10,1.0,0,0,2 ( 1 / 1 ),0.3333,1 / 3 / 0,Female
4,A05,TGC,A,2018.09.15 14:22,111,04:43:06,11,0.375,7,0.0833,10,1.0,0,0,1 ( 1 / 0 ),0.3333,1 / 3 / 0,Female
5,A06,LSR,A,2018.09.14 18:57,588,1900-01-01 00:42:20,42,1.0,62,0.7083,13,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0,Female
6,A07,LGR,A,2018.09.15 14:16,352,13:00:19,32,0.625,28,0.2083,10,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0,Female
7,A08,LYN,A,2018.09.15 14:19,482,22:16:56,34,0.9167,44,0.4583,10,1.0,1,1,1 ( 1 / 0 ),0.3333,4 / 0 / 0,Male
8,A09,CCW,A,2018.09.15 14:22,274,11:35:55,12,0.375,29,0.375,10,1.0,1,1,2 ( 1 / 1 ),0.3333,3 / 1 / 0,Female
9,A10,CLCC,A,2018.09.15 14:23,170,09:43:45,11,0.375,16,0.2083,10,1.0,1,1,2 ( 1 / 1 ),0.3333,4 / 0 / 0,Female


In [90]:
df_new.to_excel("writer.xlsx", sheet_name="sheet1")

##### Save Multinomial NB Model

In [61]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [62]:
joblib.dump(clf,NaiveBayesModel)

In [63]:
NaiveBayesModel.close()