In [None]:
import pickle
import pandas
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.naive_bayes import GaussianNB
from torch.functional import norm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from gensim.parsing.preprocessing import preprocess_string
import gensim.downloader as api
from gensim.models import Word2Vec
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the pickled train / validation / test splits and the `data` object from
# the mounted Drive folder.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# these files should only come from a trusted source.
with open('drive/MyDrive/CS247/train_data.pkl', 'rb') as fid:
     train_data= pickle.load(fid)

with open('drive/MyDrive/CS247/valid_data.pkl', 'rb') as fid:
     valid_data= pickle.load(fid)

with open('drive/MyDrive/CS247/test_data.pkl', 'rb') as fid:
     test_data= pickle.load(fid)

with open('drive/MyDrive/CS247/data.pkl', 'rb') as fid:
     data= pickle.load(fid)

In [None]:
## FUNCTIONS

## =================== DATA PROCESSING ==================== ##
def data_preprocess(comments, MIN_COUNT=1):
  """Build a vocabulary and id-encoded documents from tokenized comments.

  Args:
    comments: re-iterable collection of documents, each an iterable of
      word tokens (it is traversed twice).
    MIN_COUNT: words whose total frequency is below this value are dropped
      from the encoded documents and never enter the vocabulary.

  Returns:
    A tuple ``(sent_ids, sent_wds, word_count, word2id, id2word)``:
    per-document id lists, per-document kept-word lists, the frequency of
    every word (including dropped ones), and the two vocabulary mappings.
    Ids are assigned in order of first appearance among the kept words.
  """
  # First pass: frequency of every token across the whole corpus.
  word_count = {}
  for tokens in comments:
      for token in tokens:
          word_count[token] = word_count.get(token, 0) + 1

  # Second pass: filter rare tokens and assign ids on first sight.
  word2id, id2word = {}, {}
  sent_ids, sent_wds = [], []
  for tokens in comments:
      kept = [token for token in tokens if word_count[token] >= MIN_COUNT]
      for token in kept:
          if token not in word2id:
              new_id = len(id2word)
              word2id[token] = new_id
              id2word[new_id] = token
      sent_ids.append([word2id[token] for token in kept])
      sent_wds.append(kept)

  return sent_ids, sent_wds, word_count, word2id, id2word

def remove_empty_tokenized_values(data):
  """Return the rows of ``data`` whose "comment_tokenize" entry is not empty."""
  token_lengths = data["comment_tokenize"].str.len()
  has_tokens = token_lengths != 0
  return data.loc[has_tokens]

def index_of_empty_tokenized_values(data):
  """Return the positional indices of rows whose "comment_tokenize" is empty."""
  empty_positions = []
  for position, tokens in enumerate(data["comment_tokenize"].tolist()):
    if len(tokens) == 0:
      empty_positions.append(position)
  return empty_positions

## =========== GET THE EMBEDDINGS FOR THE WORDS =========== ##
def get_emb_avg(tokenized_data, word_embeddings, dim):
  """Average the word vectors of each document.

  Args:
    tokenized_data: iterable of documents, each a sequence of word tokens.
    word_embeddings: mapping-like object indexed by word (``obj[word]``)
      that returns a vector of length ``dim``.
    dim: dimensionality of the word vectors.

  Returns:
    A list with one numpy array of shape ``(dim,)`` per document. An empty
    document yields a zero vector instead of dividing by zero (which
    previously produced a NaN vector).

  Raises:
    KeyError: propagated from ``word_embeddings`` for an unknown word.
  """
  avg_emb = []
  for doc in tokenized_data:
    sum_emb = np.zeros(dim)
    for wrd in doc:
      sum_emb += word_embeddings[wrd]
    n_words = len(doc)
    # Guard the empty document: keep the zero vector rather than 0/0 -> NaN.
    avg_emb.append(sum_emb / n_words if n_words else sum_emb)
  return avg_emb

def weighted_emb(tokenized_data, word_embeddings, avg_emb, dim):
  """Similarity-weighted document embeddings.

  Each word vector is weighted by its cosine similarity to the document's
  average embedding; the weighted vectors are summed and divided by the
  number of words in the document.

  Args:
    tokenized_data: iterable of documents, each a sequence of word tokens.
    word_embeddings: object exposing ``get_vector(word)``.
    avg_emb: per-document average embeddings, indexed by document position.
    dim: dimensionality of the word vectors.

  Returns:
    A list with one numpy array of shape ``(dim,)`` per document. Empty
    documents yield a zero vector.

  Notes:
    * The weight is applied to the word's own vector. The previous code
      multiplied ``avg_emb[i]`` by the weight, so every result was just a
      rescaled copy of the average embedding.
    * The cosine is computed with numpy; a zero-norm vector gets
      similarity 0.
    * Progress output is unchanged: the number of 1000-document chunks,
      then one "|" every 1000 documents.
  """
  weighted_result = []
  print(len(tokenized_data)//1000)
  for i, doc in enumerate(tokenized_data):
    doc_avg = np.asarray(avg_emb[i], dtype=float)
    avg_norm = np.linalg.norm(doc_avg)  # loop-invariant for this document
    sum_emb = np.zeros(dim)
    for wrd in doc:
      curr_emb = np.asarray(word_embeddings.get_vector(wrd), dtype=float)
      denom = avg_norm * np.linalg.norm(curr_emb)
      weight = float(np.dot(doc_avg, curr_emb) / denom) if denom > 0 else 0.0
      sum_emb += curr_emb * weight
    n_words = len(doc)
    # Guard the empty document: keep the zero vector rather than 0/0 -> NaN.
    weighted_result.append(sum_emb / n_words if n_words else sum_emb)
    if i%1000 == 0:
      print("|", end="", flush=True)

  return weighted_result

## =================== SAVE DATA INTO FILE ==================== ##
def save_into_drive(data, filename):
  """Pickle ``data`` into the file at ``filename`` (opened in binary write mode)."""
  with open(filename, mode='wb') as handle:
    pickle.dump(data, handle)

def read_from_drive(filename):
  """Unpickle and return the object stored in the file at ``filename``."""
  # NOTE: pickle.load can execute arbitrary code; only read trusted files.
  with open(filename, mode='rb') as handle:
    loaded = pickle.load(handle)
  return loaded

In [None]:
## PREPROCESS DATA
# For each split, build the id-encoded sentences, the kept-word sentences, the
# word counts and the word<->id mappings (default MIN_COUNT=1).
# Each split gets its own independent vocabulary mapping.
# NOTE: this runs before the empty-comment filtering done in the next cell, so
# any empty documents are still present in the sentence lists produced here.
train_sent_ids, train_sent_wds, train_word_count, train_word2id, train_id2word = data_preprocess(train_data['comment_tokenize'])
val_sent_ids, val_sent_wds, val_word_count, val_word2id, val_id2word = data_preprocess(valid_data['comment_tokenize'])
test_sent_ids, test_sent_wds, test_word_count, test_word2id, test_id2word = data_preprocess(test_data['comment_tokenize'])

In [None]:
## CHECK FOR EMPTY VALUES
# Each line prints True when the split contains NO empty tokenized comments,
# and False when at least one empty comment is present.
print("Train: ", len(index_of_empty_tokenized_values(train_data)) == 0)
print("Valid: ", len(index_of_empty_tokenized_values(valid_data)) == 0)
print("Test: ", len(index_of_empty_tokenized_values(test_data)) == 0)

## REMOVE EMPTY VALUES FROM DATASETS
# Rebinds each split name to the filtered frame (rows with no tokens dropped).
train_data = remove_empty_tokenized_values(train_data) 
valid_data = remove_empty_tokenized_values(valid_data) 
test_data = remove_empty_tokenized_values(test_data) 

Train:  False
Valid:  False
Test:  False


In [None]:
## WORD2VEC GENSIM
# Train one Word2Vec model per split on its tokenized sentences; min_count=1
# keeps every word in the vocabulary.
# NOTE(review): the three models are trained independently, so their vector
# spaces are presumably not aligned with each other. The classifier further
# down is fit on train-model embeddings but scored on embeddings produced by
# val_model / test_model -- confirm this is intended (embedding every split
# with train_model would keep the features comparable).
train_model = Word2Vec(train_sent_wds, min_count=1)
val_model = Word2Vec(val_sent_wds, min_count=1)
test_model = Word2Vec(test_sent_wds, min_count=1)

In [None]:
## COMPUTE EMBEDDINGS
# Training split: per-comment average embedding, then the similarity-weighted
# embedding, both using the vectors of the model trained on the training split.
train_word_embeddings = train_model.wv
train_avg_emb = get_emb_avg(train_data['comment_tokenize'], train_word_embeddings, train_model.vector_size)
train_weighted_result = weighted_emb(train_data['comment_tokenize'], train_word_embeddings, train_avg_emb, train_model.vector_size)

103
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [None]:
# Validation split: same two-step embedding computation, using the vectors of
# the model trained on the validation split.
val_word_embeddings = val_model.wv
val_avg_emb = get_emb_avg(valid_data['comment_tokenize'], val_word_embeddings, val_model.vector_size)
val_weighted_result = weighted_emb(valid_data['comment_tokenize'], val_word_embeddings, val_avg_emb, val_model.vector_size)

55
||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [None]:
# Test split: same two-step embedding computation, using the vectors of the
# model trained on the test split.
test_word_embeddings = test_model.wv
test_avg_emb = get_emb_avg(test_data['comment_tokenize'], test_word_embeddings, test_model.vector_size)
test_weighted_result = weighted_emb(test_data['comment_tokenize'], test_word_embeddings, test_avg_emb, test_model.vector_size)

63
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [None]:
## TRAIN MODEL
# Fit a Gaussian Naive Bayes classifier on the weighted training embeddings,
# using the "toxic" column as the target.
nb = GaussianNB()
nb.fit(train_weighted_result, train_data["toxic"])

GaussianNB()

In [None]:
## SCORE ACCURACY (just to check that the datasets don't raise errors)
# NOTE: despite the `_prob` suffix, nb.predict returns predicted class labels,
# not probabilities.
val_prob = nb.predict(val_weighted_result)
test_prob = nb.predict(test_weighted_result)
# Accuracy of the classifier against the "toxic" labels of each split.
print(nb.score(val_weighted_result, valid_data["toxic"]))
print(nb.score(test_weighted_result, test_data["toxic"]))

0.6770131824043559
0.37076264517956625


In [None]:
## GENERATE DATAFRAMES TO BE SAVED
# One record per comment: (weighted embedding, toxic, severe_toxic, insult,
# obscene, threat, identity_hate).
# The records are materialised with list(...) because a bare zip object is a
# lazy, single-use iterator: it would be exhausted after one pass, and
# pickling it stores iterator state instead of a plain reusable sequence.
train_emb = list(zip(train_weighted_result, train_data["toxic"], train_data["severe_toxic"], train_data["insult"], train_data["obscene"], train_data["threat"], train_data["identity_hate"]))
valid_emb = list(zip(val_weighted_result, valid_data["toxic"], valid_data["severe_toxic"], valid_data["insult"], valid_data["obscene"], valid_data["threat"], valid_data["identity_hate"]))
test_emb = list(zip(test_weighted_result, test_data["toxic"], test_data["severe_toxic"], test_data["insult"], test_data["obscene"], test_data["threat"], test_data["identity_hate"]))

In [None]:
## SAVE EMBEDDED DATA INTO FILES
# NOTE: these are relative paths, so the files are written to the current
# working directory rather than the 'drive/MyDrive/CS247/' folder the inputs
# were read from.
save_into_drive(train_emb, 'train_emb.pkl')
save_into_drive(valid_emb, 'val_emb.pkl')
save_into_drive(test_emb, 'test_emb.pkl')

## Appendix
### Compare Gensim model with our Skipgram implementation

In [None]:
#GENERATE DATA FOR SKIPGRAM
# Builds (context, target) training pairs from the id-encoded training
# sentences.

WINDOW_SIZE = 2 #smaller window size than Gensim

data_skipgram = []

for sent in train_sent_ids:
    # Only positions with a full window on both sides are used, so sentences
    # shorter than 2*WINDOW_SIZE + 1 tokens contribute no pairs.
    for i in range(WINDOW_SIZE, len(sent) - WINDOW_SIZE):
        # The context ids are wrapped in an extra outer list, i.e. context is
        # [[left ids..., right ids...]]; SkipGram.loss reads it via [0][0].
        context = [sent[i - WINDOW_SIZE: i] + sent[i+1: i + WINDOW_SIZE + 1]]
        target  = sent[i]
        data_skipgram.append((context, target))

print("Data_length:",len(data_skipgram))

Data_length: 3198009


In [None]:
#OUR SKIPGRAM IMPLEMENTATION

class SkipGram(nn.Module):
    """Skip-gram style embedding model trained with negative sampling.

    Holds two embedding tables of shape ``(vocab_size, hidden_size)``:
    ``u_emb`` is looked up for context and negative words (and is what
    ``forward`` returns), ``v_emb`` is looked up for target words.
    """

    def __init__(self, vocab_size, hidden_size):
        super(SkipGram, self).__init__()
        self.u_emb = nn.Embedding(vocab_size, hidden_size)
        self.v_emb = nn.Embedding(vocab_size, hidden_size)

    def forward(self, idx):
        """Return the ``u_emb`` vectors for the word ids in ``idx``."""
        return self.u_emb(idx)

    def loss(self, pos_data, neg_data):
        """Negative-sampling loss for one batch.

        Args:
            pos_data: sequence of ``(context, target)`` pairs where
                ``context[0]`` is the list of context word ids and ``target``
                is the target word id. All contexts must have the same length.
            neg_data: flat array-like of negative word ids; its size must be
                a multiple of ``len(pos_data)`` (the same number of negatives
                for every target).

        Returns:
            0-dim tensor: the mean over targets of
            ``-(sum log sigmoid(u_pos . v) + sum log sigmoid(-u_neg . v))``.

        Raises:
            ValueError: if ``pos_data`` is empty or contexts differ in length.

        The context width, the number of negatives and the device are derived
        from the inputs and from the module's own parameters, so the method
        no longer depends on the notebook globals ``WINDOW_SIZE`` / ``device``
        or on a hard-coded 10 negatives per context word.
        """
        batch = len(pos_data)
        if batch == 0:
            raise ValueError("pos_data must contain at least one example")

        # Use the device the embedding tables live on.
        dev = self.u_emb.weight.device

        #GENERATING TARGET AND POSITIVES
        targets = [sample[1] for sample in pos_data]
        contexts = [list(sample[0][0]) for sample in pos_data]
        if len({len(ctx) for ctx in contexts}) != 1:
            raise ValueError("all contexts in pos_data must have the same length")
        all_pos_words = [word for ctx in contexts for word in ctx]

        target_idx = torch.as_tensor(np.asarray(targets), dtype=torch.long, device=dev)
        pos_idx = torch.as_tensor(np.asarray(all_pos_words), dtype=torch.long, device=dev)
        neg_idx = torch.as_tensor(np.asarray(neg_data), dtype=torch.long, device=dev)

        #GENERATING 3D TENSORS
        v_w = self.v_emb(target_idx)
        u_w = self.u_emb(pos_idx)
        u_w_prime = self.u_emb(neg_idx)

        dim = v_w.shape[1]
        v_w_pos = v_w.view(batch, dim, 1)
        u_w_pos = u_w.view(batch, -1, dim)        # (batch, context words, dim)
        u_w_neg = u_w_prime.view(batch, -1, dim)  # (batch, negatives, dim)

        #LOSS POS + NEG
        # logsigmoid is the numerically stable form of sigmoid().log(), which
        # returns -inf once the sigmoid underflows to 0.
        pos = F.logsigmoid(torch.bmm(u_w_pos, v_w_pos)).sum(1)        # sum over context words
        neg = F.logsigmoid(torch.bmm(u_w_neg.neg(), v_w_pos)).sum(1)  # sum over negatives

        return -(pos + neg).mean()
  

In [None]:
#DEFINE DEVICE
# Use the GPU when CUDA is available, otherwise fall back to the CPU.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"running on {device}")

running on cuda


In [None]:
#TRAINING OUR MODEL
MIN_COUNT = 1

skipgram = SkipGram(len(train_word2id),100).to(device)
optimizer = optim.Adam(skipgram.parameters())

# Unigram distribution over the retained words, used to draw negative samples.
vocabulary = {key: value for key, value in train_word_count.items() if value >= MIN_COUNT}
N = sum(vocabulary.values())
word_prob = {key: value/N for (key, value) in vocabulary.items()}
word_ID = list(word_prob.keys())
word_ID = [train_word2id[word] for word in word_ID]

neg_sample_count = 10
itr_num = 30
batch_size = 2000

# Loop-invariant sampling arrays: built once instead of once per batch.
word_prob_array = np.array(list(word_prob.values()))
word_ID_array = np.array(word_ID)

# Only full batches are used; a trailing partial batch is dropped.
num_batches = len(data_skipgram) // batch_size
if num_batches == 0:
    raise ValueError("data_skipgram holds fewer examples than one batch")

l = []  # average loss per iteration
for i in range(itr_num):
    print("iteration: ",i)
    s = 0.0
    for bid in range(num_batches):
        optimizer.zero_grad()
        positive_data = data_skipgram[bid * batch_size : (bid + 1) * batch_size]

        neg_data = np.random.choice(word_ID_array,
                                    size=2*WINDOW_SIZE*neg_sample_count*batch_size,
                                    p=word_prob_array)

        loss = skipgram.loss(positive_data, neg_data)
        loss.backward()
        # Accumulate a Python float: summing the loss tensors themselves keeps
        # every batch's autograd history alive for the whole iteration.
        s += loss.item()
        optimizer.step()
    # Average over the batches actually processed. The previous code divided
    # by len(data) // batch_size, i.e. by the size of an unrelated object.
    l.append(s / num_batches)
    print("Average Loss for the current iteration: ", l[i])
    print("-----------------------------------")

In [None]:
#RESULTS FOR OUR IMPLEMENTATION
# Print ten nearest neighbours of 'stupid' by cosine similarity in the learned
# u_emb table.

word_embs  = skipgram.u_emb.weight.data
target_emb = skipgram.forward(torch.LongTensor([train_word2id['stupid']]).to(device))
# cosine similarity
similarity = (word_embs * target_emb).sum(dim=1) / (torch.norm(word_embs, dim=1) * torch.norm(target_emb))
# [1:11] skips the top-ranked entry, presumably on the assumption that it is
# the query word itself -- TODO confirm against the printed output.
for idx in similarity.argsort(descending=True).cpu().numpy()[1:11]:
    print(train_id2word[idx], similarity[idx].item())

dumb 0.6962757110595703
stupid 0.6913631558418274
lick 0.6846228837966919
fat 0.6817508339881897
suck 0.6773995757102966
mom 0.6771062016487122
hell 0.6728000640869141
ass 0.6692366600036621
faggot 0.6688790917396545
banned 0.6661044359207153


In [None]:
#RESULTS FOR GENSIM WORD2VEC MODEL
# Train a fresh gensim model on the training sentences and print the words it
# ranks as most similar to "stupid", for comparison with our implementation.

#Min count set to 1 (we keep all the words)
model = Word2Vec(train_sent_wds, min_count=1) 

similar = model.wv.most_similar("stupid")
for word in similar:
  print(word)

In [None]:
#WHICH MODEL TO USE?

#We opted for Gensim --> more optimized

### Data analysis

In [None]:
#PERFORM SOME ANALYSIS ON SOME TOXIC WORDS 

#note: to be generalized to multiple words

def word_count_in_comments(target, comment, data=None):
  """Print how often ``target`` occurs and how those comments are labelled.

  Args:
    target: the word to look for.
    comment: iterable of tokenized comments (one token sequence per row),
      in the same row order as ``data``.
    data: DataFrame holding the label columns ("toxic", "severe_toxic",
      "obscene", "threat", "insult", "identity_hate"). Defaults to the
      notebook-level ``train_data`` for backward compatibility.

  Raises:
    ValueError: if ``comment`` and ``data`` do not have the same length.

  The label counts are taken over exactly the comments counted on the first
  line, i.e. those whose token list contains ``target``. Previously the labels
  were selected with a regex substring match on the raw "comment" column of
  the global frame, which ignored ``comment`` and also matched longer words
  (e.g. "stupid" inside "stupidity"), so the numbers could disagree.
  """
  if data is None:
    data = train_data

  # One boolean per comment: does its token list contain the target word?
  contains = [target in doc for doc in comment]
  if len(contains) != len(data):
    raise ValueError("comment and data must have the same number of rows")
  count = sum(contains)

  contains_target = data.loc[contains]
  count_toxic = contains_target['toxic'].sum()
  count_severe_toxic = contains_target['severe_toxic'].sum()
  count_obscene = contains_target['obscene'].sum()
  count_threat = contains_target['threat'].sum()
  count_insult = contains_target['insult'].sum()
  count_identity_hate = contains_target['identity_hate'].sum()

  print("Comments containing the target word: {}" .format(count))
  print("Comments labeled as toxic: {}" .format(count_toxic))
  print("Comments labeled as severe toxic: {}" .format(count_severe_toxic))
  print("Comments labeled as obscene: {}" .format(count_obscene))
  print("Comments labeled as threat: {}" .format(count_threat))
  print("Comments labeled as insult: {}" .format(count_insult))
  print("Comments labeled as identity hate: {}" .format(count_identity_hate))

In [None]:
# Compare the label statistics of two example words over the tokenized
# training comments.
w1 = 'stupid'
w2 = 'flower'
print(w1)
word_count_in_comments(w1, train_data['comment_tokenize'])
print(w2)
word_count_in_comments(w2, train_data['comment_tokenize'])