In [1]:
import spacy
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import re
import jovian
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
from torchtext import data
from torchtext import datasets
from collections import Counter




<IPython.core.display.Javascript object>

In [2]:
#This new file removes gender rows other than 'male'
#and 'female'
tweets = pd.read_csv(r'Desktop/tweets_gender_new.csv', encoding = "ISO-8859-1")

In [3]:
#I removed unnecessary columns and also create a new
#column which combines the description and text
#columns

tweets.pop('ï»¿unit_id')
tweets.pop('_golden')
tweets.pop('_unit_state')
tweets.pop('_trusted_judgments')
tweets.pop('_last_judgment_at')
tweets.pop('profile_yn')
tweets.pop('profile_yn:confidence')
tweets.pop('created')
tweets.pop('fav_number')
tweets.pop('gender_gold')
tweets.pop('link_color')
tweets.pop('name')
tweets.pop('profile_yn_gold')
tweets.pop('profileimage')
tweets.pop('retweet_count')
tweets.pop('sidebar_color')
tweets.pop('tweet_coord')
tweets.pop('tweet_count')
tweets.pop('tweet_created')
tweets.pop('tweet_id')
tweets.pop('tweet_location')
tweets.pop('user_timezone')

tweets['Profile'] = tweets['description'] + ' ' + tweets['text']

In [34]:
#I use 1 and -1 to represent the two genders
tweets['gender'] = tweets['gender'].replace(['male'], 1)
tweets['gender'] = tweets['gender'].replace(['female'], 0)


tweets.head()

Unnamed: 0,gender,gender:confidence,description,text,Profile,encoded
0,1,1.0,i sing my own rhythm.,Robbie E Responds To Critics After Win Against...,i sing my own rhythm. Robbie E Responds To Cri...,"[[1, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1,..."
1,1,1.0,I'm the author of novels filled with family dr...,Ã¢â¬ÅIt felt like they were my friends and I...,I'm the author of novels filled with family dr...,"[[12, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, ..."
2,1,0.6625,louis whining and squealing and all,i absolutely adore when louis starts the songs...,louis whining and squealing and all i absolute...,"[[25, 33, 34, 35, 36, 37, 11, 38, 17, 39, 40, ..."
3,1,1.0,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...",Hi @JordanSpieth - Looking at the url - do you...,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...","[[45, 12, 1, 14, 46, 47, 11, 48, 14, 49, 50, 5..."
4,0,1.0,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,Watching Neighbours on Sky+ catching up with t...,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,"[[63, 64, 58, 65, 12, 66, 67, 29, 11, 1, 14, 6..."


In [35]:
nlp = spacy.load('en_core_web_sm')

#tokenizing function
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in nlp.tokenizer(nopunct)]


In [36]:
#count number of occurences of each word
#for some reason, I wasn't able to do this on the 
#'Profile' column
counts = Counter()
for index, row in tweets.iterrows():
    counts.update(tokenize(row['text']))


In [37]:
#delete words that occur rarely, or in other words
#less than 2 times
for word in list(counts):
    if counts[word] < 2:
        del counts[word]

In [38]:
#we create a vocabulary from the text
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [39]:
#encode each text as tensors/vectors
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [40]:
tweets['encoded'] = tweets['text'].apply(lambda x: np.array(encode_sentence(x, vocab2index)))
tweets.head()

Unnamed: 0,gender,gender:confidence,description,text,Profile,encoded
0,1,1.0,i sing my own rhythm.,Robbie E Responds To Critics After Win Against...,i sing my own rhythm. Robbie E Responds To Cri...,"[[1, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1,..."
1,1,1.0,I'm the author of novels filled with family dr...,Ã¢â¬ÅIt felt like they were my friends and I...,I'm the author of novels filled with family dr...,"[[12, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, ..."
2,1,0.6625,louis whining and squealing and all,i absolutely adore when louis starts the songs...,louis whining and squealing and all i absolute...,"[[25, 33, 34, 35, 36, 37, 11, 38, 17, 39, 40, ..."
3,1,1.0,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...",Hi @JordanSpieth - Looking at the url - do you...,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...","[[45, 12, 1, 14, 46, 47, 11, 48, 14, 49, 50, 5..."
4,0,1.0,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,Watching Neighbours on Sky+ catching up with t...,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,"[[63, 64, 58, 65, 12, 66, 67, 29, 11, 1, 14, 6..."


In [41]:
from sklearn.model_selection import train_test_split

In [42]:
#set the label and input data
X = list(tweets['encoded'])
y = list(tweets['gender'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)


In [43]:
#create a dataset class
class ProfileDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [44]:
train_ds = ProfileDataset(X_train, y_train)
valid_ds = ProfileDataset(X_valid, y_valid)


In [45]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

In [46]:
def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [47]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
val_dl = DataLoader(valid_ds, batch_size = batch_size)

In [48]:
#this model is based on the glove word embedding space
#download the necessary files from https://jovian.ai/outlink?url=https%3A%2F%2Fnlp.stanford.edu%2Fprojects%2Fglove%2F

def load_glove_vectors(glove_file=r"Downloads/glove/glove.twitter.27B.100d.txt"):
    """Load the glove word vectors"""
    word_vectors = {}
    with open(glove_file) as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors


In [49]:
def get_emb_matrix(pretrained, word_counts, emb_size = 100):
    """ Creates embedding matrix from word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
    vocab_to_idx["UNK"] = 1
    i = 2
    for word in word_counts:
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1   
    return W, np.array(vocab), vocab_to_idx

In [50]:
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [51]:
class LSTM_glove_vecs(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [52]:
model = LSTM_glove_vecs(vocab_size, 100, 100, pretrained_weights)

In [53]:
train_model(model, epochs=30, lr=0.1)



train loss 1.213, val loss 1.054, val accuracy 0.508, and val rmse 0.702
train loss 2.521, val loss 3.895, val accuracy 0.406, and val rmse 0.949
train loss 0.872, val loss 0.902, val accuracy 0.508, and val rmse 0.702
train loss 1.557, val loss 0.774, val accuracy 0.508, and val rmse 0.702
train loss 0.765, val loss 0.735, val accuracy 0.499, and val rmse 0.708
train loss 0.709, val loss 0.701, val accuracy 0.482, and val rmse 0.720


In [88]:
train_model(model, epochs=30, lr=0.01)

train loss 0.702, val loss 0.702, val accuracy 0.489, and val rmse 0.715
train loss 0.695, val loss 0.697, val accuracy 0.507, and val rmse 0.702
train loss 0.696, val loss 0.703, val accuracy 0.511, and val rmse 0.699
train loss 0.696, val loss 0.696, val accuracy 0.500, and val rmse 0.707
train loss 0.694, val loss 0.693, val accuracy 0.520, and val rmse 0.693
train loss 0.695, val loss 0.695, val accuracy 0.503, and val rmse 0.705
