In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import torch
import numpy as np

In [0]:
df = pd.read_csv("/content/drive/My Drive/names/newfile.csv", encoding="utf-8")

In [16]:
df

Unnamed: 0,Name,Gender
0,Aamir,Male
1,Aaron,Male
2,Abbey,Male
3,Abbie,Male
4,Abbot,Male
...,...,...
7939,Zorine,Female
7940,Zsa Zsa,Female
7941,Zsazsa,Female
7942,Zulema,Female


In [8]:
df

Unnamed: 0,Aamir,Abagael
0,,
1,,
2,,
3,,
4,,
...,...,...
4995,,
4996,,
4997,,
4998,,


In [17]:
len(df)

7944

In [0]:
import random

df = df.drop_duplicates(subset=['Name'],
                        keep=random.choice(['first', 'last']))

In [19]:
df

Unnamed: 0,Name,Gender
0,Aamir,Male
1,Aaron,Male
2,Abbey,Male
3,Abbie,Male
4,Abbot,Male
...,...,...
7939,Zorine,Female
7940,Zsa Zsa,Female
7941,Zsazsa,Female
7942,Zulema,Female


In [0]:
from sklearn.preprocessing import LabelEncoder

In [21]:
le = LabelEncoder()
df["Gender"] = le.fit_transform(df["Gender"])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Name,Gender
0,Aamir,1
1,Aaron,1
2,Abbey,1
3,Abbie,1
4,Abbot,1


In [0]:
genders = ["Female", "Male"]

In [0]:
import string
all_letters = string.ascii_letters + " ./;'"

In [24]:
n_letters = len(all_letters)
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ./;'"

In [0]:
import torch

def name_to_tensor(name):
  name_in_tensor = torch.zeros(len(name), 1, n_letters)
  for i, letter in enumerate(name):
    name_in_tensor[i][0][all_letters.find(letter)] = 1
  return name_in_tensor

In [28]:
name_to_tensor('a')

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]]])

In [29]:
name_to_tensor('aA')

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]]])

In [0]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(RNN, self).__init__()

    self.hidden_size = hidden_size

    self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
    self.i2o = nn.Linear(input_size + hidden_size, output_size)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden):
    combined = torch.cat((input, hidden), 1)
    hidden = self.i2h(combined)
    output = self.i2o(combined)
    output = self.softmax(output)
    return output, hidden
  def initHidden(self):
    return torch.zeros(1, self.hidden_size)

In [0]:
n_hidden = 128
n_genders = len(genders)

rnn = RNN(n_letters, n_hidden, output_size=n_genders)

In [0]:
iterations = 100000

criterion = nn.NLLLoss()

learning_rate = 0.005

In [0]:
def output_to_gender(output):
  top_n, top_index = output.topk(1)
  pred_i = top_index[0].item()
  pred = genders[pred_i]
  
  return pred

In [0]:
for iteration in range(1, iterations+1):
  i = random.randint(0, len(df) - 1)
  name = df.iloc[i][0]
  name_in_tensor = name_to_tensor(name)


  gender = df.iloc[i][1]
  gender_in_tensor = torch.LongTensor([gender])

  hidden = rnn.initHidden()
  rnn.zero_grad()

  for i in range(name_in_tensor.size()[0]):
    output, hidden = rnn(name_in_tensor[i], hidden)
  loss = criterion(output, gender_in_tensor)
  loss.backward()

  for p in rnn.parameters():
    p.data.add_(-learning_rate, p.grad.data)

  if iteration % 5000 == 0:
    pred = output_to_gender(output)
    correct = 'yes' if pred == genders[gender] else 'no (%s)' % genders[gender]

In [0]:
n_confusion = 10000

prediction = []
actual = []

for _ in range(n_confusion):
  i = random.randint(0, len(df)-1)
  name = df.iloc[i][0]
  name_in_tensor = name_to_tensor(name)

  gender_idx = df.iloc[i][1]
  gender_in_tensor = torch.LongTensor([gender_idx])

  hidden = rnn.initHidden()

  for j in range(name_in_tensor.size()[0]):
    output, hidden = rnn(name_in_tensor[j], hidden)
  pred = output_to_gender(output)

In [0]:
prediction.append(pred)
actual.append(genders[gender_idx])

In [45]:
!pip install pandas_ml

Collecting pandas_ml
[?25l  Downloading https://files.pythonhosted.org/packages/ae/72/6d90debfcb9ea74ec00927fa7ed0204dcc560b1f9ffcd8b239daa7fd106d/pandas_ml-0.6.1-py3-none-any.whl (100kB)
[K     |████████████████████████████████| 102kB 1.3MB/s 
[?25hCollecting enum34
  Downloading https://files.pythonhosted.org/packages/63/f6/ccb1c83687756aeabbf3ca0f213508fcfb03883ff200d201b3a4c60cedcc/enum34-1.1.10-py3-none-any.whl
Installing collected packages: enum34, pandas-ml
Successfully installed enum34-1.1.10 pandas-ml-0.6.1


In [0]:
import numpy as np
from sklearn.metrics import classification_report

In [0]:
np_pred = np.array(prediction)
np_actual = np.array(actual)

In [50]:
classification_report(np_pred, np_actual)

'              precision    recall  f1-score   support\n\n      Female       1.00      1.00      1.00         1\n\n    accuracy                           1.00         1\n   macro avg       1.00      1.00      1.00         1\nweighted avg       1.00      1.00      1.00         1\n'