<a href="https://colab.research.google.com/github/thiemcun203/testgithub/blob/main/gender_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

KeyboardInterrupt: ignored

In [None]:
import pandas as pd
import numpy as np

import re
import sys

In [None]:
parent_path = "/content/drive/MyDrive/Gender prediction by Vietnamese name/dataset"
training_path = parent_path + "/name_train.csv"
dev_path = parent_path + "/name_dev.csv"
test_path = parent_path + "/name_test.csv"
data_path = parent_path + "/name_full.csv"
df = pd.read_csv(data_path)

In [None]:
df.head()

Unnamed: 0,Full_Name,Gender
0,Ngô Xuân Tùng,1
1,Bùi Dương Thảo Vy,0
2,Lưu Thế Huy,1
3,Nguyễn Thị Vân,0
4,Dương Minh Long,1


In [None]:
def convert(text):
  patterns = {
  '[àáảãạăắằẵặẳâầấậẫẩ]': 'a',
  '[đ]': 'd',
  '[èéẻẽẹêềếểễệ]': 'e',
  '[ìíỉĩị]': 'i',
  '[òóỏõọôồốổỗộơờớởỡợ]': 'o',
  '[ùúủũụưừứửữự]': 'u',
  '[ỳýỷỹỵ]': 'y'
  }
  output = text
  for regex, replace in patterns.items():
    output = re.sub(regex, replace, output)
    # deal with upper case
    output = re.sub(regex.upper(), replace.upper(), output)
  return output.lower()

In [None]:
full_data = df["Full_Name"]

In [None]:
class Compute_TF_IDF():
  def __init__(self, list_document, dictionary=None, max_count=None, min_count=10, normalize_tf=False, smooth=True, normalize_tfidf=None):
    self.list_document = list_document
    self.max_count = max_count
    self.min_count = min_count
    self.normalize_tf = normalize_tf
    self.smooth = smooth
    self.normalize_tfidf = normalize_tfidf

    self.dictionary = dictionary if dictionary != None else self.create_dictionary()
    self.word_to_index = self.mapping_word_to_index()
    self.num_word = len(self.dictionary)
    self.num_document = len(self.list_document)
    self.matrix_word_count = self.create_count_matrix()
    self.idf_score = self.compute_idf()
  
  # Query word given index based on dictionary
  def retrieve_word(self, index)i:
    if 0 <= index <= self.num_word - 1:
      return self.dictionary[index]
    else:
      return -1
  
  # Query index given word based on dictionary
  def retrieve_index(self, word):
    return self.word_to_index.get(word.lower(), -1)

  # Split document into list of words
  def word_extraction(self, document):
    split_word = document.split()
    return split_word

  def map_word_to_count(self):
    dict_word_count = dict()
    for i in range(len(self.list_document)):
      list_word = self.word_extraction(self.list_document[i].lower())
      for j in range(len(list_word)):
        dict_word_count[list_word[j]] = dict_word_count.get(list_word[j], 0) + 1
    return dict_word_count

  def create_dictionary(self):
    if self.max_count == None and self.min_count == None:
      set_word = set()
      for document in self.list_document:
        set_word = set_word.union(set(self.word_extraction(document.lower())))
    else:
      set_word = set()
      mapping_word_count = self.map_word_to_count()
      for document in self.list_document:
        list_word = self.word_extraction(document.lower())
        for word in list_word:
          if self.min_count != None:
            if mapping_word_count[word] < self.min_count:
              continue
          if self.max_count != None:
            if mapping_word_count[word] > self.max_count:
              continue
          set_word.add(word)
    return sorted(list(set_word))

  def mapping_word_to_index(self):
    dict_encode = dict()
    for i in range(len(self.dictionary)):
      dict_encode[self.dictionary[i]] = i
    return dict_encode

  def create_count_matrix(self):
    mat = np.zeros((self.num_document, self.num_word))
    for i in range(len(self.list_document)):
      document = self.list_document[i].lower()
      list_word = self.word_extraction(document)
      for j in range(len(list_word)):
        ind = self.retrieve_index(list_word[j])
        mat[i, ind] += 1
    return mat

  def compute_tf(self):
    length_name = np.sum(self.matrix_word_count, axis=1)
    if self.normalize_tf == True:
      return self.matrix_word_count / np.reshape(length_name, (-1,1))
    else:
      return self.matrix_word_count

  def compute_idf(self):
    tmp = np.copy(self.matrix_word_count)
    tmp[tmp!=0] = 1
    num_doc_having_word = np.sum(tmp, axis=0)
    if self.smooth == True:
      # smoothen and avoid 0 in idf
      num_doc_having_word = np.log((self.num_document + 1) / (num_doc_having_word + 1)) + 1
    else:
      # avoid 0 in idf
      num_doc_having_word = np.log(self.num_document / num_doc_having_word) + 1
    return np.reshape(num_doc_having_word, (1, self.num_word))

  def compute_tf_idf(self):
    tf = self.compute_tf()
    idf = self.compute_idf()
    tfidf = tf * idf
    if self.normalize_tfidf == None:
      return tfidf
    elif self.normalize_tfidf == "l2":
      sum_squares = np.reshape(np.diag(tfidf.dot(tfidf)), (1,-1))
      return tfidf / sum_squares
    elif self.normalize_tfidf == "l1":
      sum_row = np.reshape(np.sum(tfidf, axis=1), (1,-1))
      return tfidf / sum_row


### Compute tf idf for test set
  def create_count_matrix_for_test(self, list_doc):
    mat = np.zeros((len(list_doc), self.num_word))
    for i in range(len(list_doc)):
      document = list_doc[i].lower()
      list_word = self.word_extraction(document)
      for j in range(len(list_word)):
        ind = self.retrieve_index(list_word[j])
        if ind != -1:
          mat[i, ind] += 1
    return mat
  
  def compute_tf_for_test(self, matrix_count_document):
    length_name = np.sum(matrix_count_document, axis=1)
    if self.normalize_tf == True:
      return matrix_count_document / np.reshape(length_name, (-1,1))
    else:
      return matrix_count_document

  def compute_tf_idf_for_test(self, document):
    matrix = self.create_count_matrix_for_test(document)
    tf = self.compute_tf_for_test(matrix)
    idf = self.idf_score
    tfidf = tf * idf
    if self.normalize_tfidf == None:
      return tfidf
    elif self.normalize_tfidf == "l2":
      sum_squares = np.reshape(np.diag(tfidf.dot(tfidf)), (1,-1))
      return tfidf / sum_squares
    elif self.normalize_tfidf == "l1":
      sum_row = np.reshape(np.sum(tfidf, axis=1), (1,-1))
      return tfidf / sum_row

In [None]:
sample_data = ["Nguyen THi Minh KHaI", "Le Dai ThANH", "Hello"]

In [None]:
TF_IDF = Compute_TF_IDF(full_data)

tf_idf_matrix = TF_IDF.compute_tf_idf()
print(TF_IDF.dictionary)
print(TF_IDF.compute_tf_idf_for_test(["Phạm Quang Tùng"]).shape)

['an', 'anh', 'bá', 'bách', 'bé', 'bình', 'bích', 'bùi', 'băng', 'bạch', 'bảo', 'bắc', 'bằng', 'bửu', 'cao', 'chau', 'chi', 'chinh', 'chiến', 'chu', 'chung', 'chánh', 'châu', 'chí', 'chính', 'chương', 'chấn', 'chế', 'cáp', 'cát', 'công', 'cúc', 'cương', 'cường', 'cảnh', 'cẩm', 'danh', 'di', 'diểm', 'diễm', 'diệp', 'diệu', 'doãn', 'du', 'dung', 'duy', 'duyên', 'dân', 'dũng', 'dư', 'dương', 'gia', 'giang', 'giao', 'giàu', 'h', 'hiếu', 'hiền', 'hiển', 'hiệp', 'hiệu', 'hoa', 'hoan', 'hoà', 'hoài', 'hoàn', 'hoàng', 'huy', 'huynh', 'huyền', 'huân', 'huấn', 'huế', 'huệ', 'huỳnh', 'hy', 'hà', 'hào', 'hân', 'hòa', 'hùng', 'hưng', 'hương', 'hướng', 'hường', 'hạ', 'hạnh', 'hải', 'hảo', 'hậu', 'hằng', 'học', 'hồ', 'hồng', 'hội', 'hợp', 'hứa', 'hửu', 'hữu', 'k', 'ka', 'kha', 'khang', 'khanh', 'khiêm', 'khoa', 'khuê', 'khánh', 'khôi', 'khương', 'khả', 'khải', 'khắc', 'kim', 'kiên', 'kiều', 'kiệt', 'kỳ', 'la', 'lai', 'lam', 'lan', 'linh', 'liêm', 'liên', 'liễu', 'loan', 'long', 'luân', 'luận', 'luật'

In [None]:
from sklearn.decomposition import TruncatedSVD
# Need to understand
svd = TruncatedSVD(n_components=100)
svd.fit(tf_idf_matrix)
print(svd.transform(tf_idf_matrix))

[[ 4.81338714e-01  1.01212602e-01 -9.33066191e-02 ...  1.27155550e-01
   3.11833666e-01 -2.04977176e-03]
 [ 9.49502118e-01 -3.41938201e-01  2.72760219e-01 ... -6.17816185e-01
  -1.17647373e-01  2.87285071e-01]
 [ 3.75913401e-01  3.50340334e-01 -3.27379166e-01 ... -1.43920609e+00
  -8.95230242e-01  4.09109223e-01]
 ...
 [ 2.94594688e+00 -1.20606723e+00 -2.27589826e-01 ... -9.65750760e-02
   1.81647499e-02  3.66520003e-02]
 [ 2.82959177e+00 -1.05672395e+00 -4.74989131e-01 ...  4.38205219e-02
  -1.92313732e-01 -9.35709821e-02]
 [ 7.44438324e-01 -3.97130970e-01  4.09974456e-01 ...  5.29780516e-01
  -4.65016057e-01  8.53292770e-01]]


In [None]:
from sklearn.linear_model import LogisticRegression
# Can code manually
X = svd.transform(tf_idf_matrix)
y = df["Gender"]
clf = LogisticRegression().fit(X, y)

In [None]:
clf.score(X, y)

0.9355703698186287

In [None]:
y_pred = clf.predict(X)

In [None]:
name = ["Nguyễn Hương Ly"]
ifidf_test = TF_IDF.compute_tf_idf_for_test(name)
test_mat = svd.transform(ifidf_test)
clf.predict_proba(test_mat)

array([[0.65758317, 0.34241683]])