<a href="https://colab.research.google.com/github/tushitgarg/Hate-Speech-Content-Moderation/blob/master/Co_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import pymongo
from pymongo import MongoClient
import pprint as pp
! pip install normalise
import nltk
nltk.download('names')
nltk.download('brown')

import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from normalise import normalise
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,GRU, SimpleRNN
from keras.layers.embeddings import Embedding
from keras import layers
from keras.initializers import Constant

Collecting normalise
[?25l  Downloading https://files.pythonhosted.org/packages/28/2d/f06cf3d3714502dec10e19238a5da201b71ce198165beda9c1adaf5063da/normalise-0.1.8-py3-none-any.whl (15.7MB)
[K     |████████████████████████████████| 15.7MB 1.5MB/s 
Collecting roman
  Downloading https://files.pythonhosted.org/packages/8d/f2/29d1d069555855ed49c74b627e6af73cec7a5f4de27c200ea0d760939da4/roman-3.2-py2.py3-none-any.whl
Installing collected packages: roman, normalise
Successfully installed normalise-0.1.8 roman-3.2
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Using TensorFlow backend.


# Text Preprocessing Pipeline

In [0]:
nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans and lemmatizes raw text.

    Every document goes through these steps, in order:
        1. Regex cleaning (URLs, selected punctuation, ``<br />`` pairs,
           runs of whitespace)
        2. Text normalization with the ``normalise`` package (best effort)
        3. Parsing with the module-level spaCy pipeline ``nlp``
        4. Punctuation removal
        5. Stop words removal
        6. Lemmatization

    Parameters
    ----------
    variety : str, default "BrE"
        Format of date (AmE - american type, BrE - british format);
        forwarded to ``normalise``.
    user_abbrevs : dict or None, default None
        Dict of user abbreviation mappings (from the normalise package).
        ``None`` is treated as an empty mapping.
    n_jobs : int, default 1
        Parallel jobs to run. NOTE: stored but not used by the current
        implementation; ``transform`` always runs sequentially.
    """

    # Patterns are compiled once for the class instead of once per document.
    _URL_RE = re.compile(r"http\S+")
    # Characters that are deleted outright: . ; : ! ' ? , " ( ) [ ]
    _STRIP_RE = re.compile(r"""[.;:!'?,"()\[\]]""")
    # Spans that are replaced by a single space: doubled <br/> tags, '-' and '/'
    _TO_SPACE_RE = re.compile(r"<br\s*/><br\s*/>|[-/]")
    _WHITESPACE_RE = re.compile(r"\s+")
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self,
                 variety="BrE",
                 user_abbrevs=None,
                 n_jobs=1):
        # Parameters are stored unmodified so BaseEstimator.get_params/clone
        # keep working; the None default avoids a dict shared by all instances.
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """No-op fit; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, *_):
        """Return a pandas Series with every document of ``X`` preprocessed.

        ``X`` may be a pandas Series (its index is preserved) or any other
        iterable of documents, which is wrapped in a Series first.
        """
        documents = X if isinstance(X, pd.Series) else pd.Series(X, dtype=object)
        # ``apply`` builds a new Series, so the caller's data is never mutated.
        return documents.apply(self._preprocess_text)

    def _preprocess_part(self, part):
        """Preprocess one partition (a Series) of the input."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Run the full cleaning pipeline on one document and return a string."""
        cleaned = self._clean_text(text)
        normalized_text = self._normalize(cleaned)
        doc = nlp(normalized_text)
        without_punct = self._remove_punct(doc)
        without_stop_words = self._remove_stop_words(without_punct)
        return self._lemmatize(without_stop_words)

    def _normalize(self, text):
        """Normalize ``text`` with ``normalise``; fall back to the input on failure."""
        # some issues in normalise package: keep the step best-effort, but do
        # not swallow KeyboardInterrupt/SystemExit like a bare ``except`` would.
        try:
            tokens = normalise(text,
                               variety=self.variety,
                               user_abbrevs=self.user_abbrevs or {},
                               verbose=False)
            return ' '.join(tokens)
        except Exception:
            return text

    def _remove_punct(self, doc):
        """Drop tokens whose text consists solely of ASCII punctuation."""
        # A per-character test is used because ``token.text in string.punctuation``
        # is a substring check and lets tokens such as "..." or "***" through.
        return [t for t in doc
                if not all(ch in self._PUNCT_CHARS for ch in t.text)]

    def _remove_stop_words(self, doc):
        """Drop tokens that spaCy flags as stop words."""
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        """Join the lemmas of the remaining tokens with single spaces."""
        return ' '.join([t.lemma_ for t in doc])

    def _clean_text(self, text):
        """Strip URLs and selected punctuation and collapse whitespace.

        Missing values (``None`` / NaN) become an empty string and any other
        non-string value is converted with ``str`` so a stray cell value does
        not abort the whole run.
        """
        if text is None or (isinstance(text, float) and text != text):  # None / NaN
            return ""
        if not isinstance(text, str):
            text = str(text)
        text = self._URL_RE.sub("", text)
        text = self._STRIP_RE.sub("", text)
        text = self._TO_SPACE_RE.sub(" ", text)
        return self._WHITESPACE_RE.sub(" ", text)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd '/content/drive/My Drive/minor2/Hate-Speech-Content-Moderation/'

/content/drive/My Drive/minor2/Hate-Speech-Content-Moderation


# Loading datasets

In [69]:
quora=pd.read_excel('quora.xlsx')
quora.head(5)

Unnamed: 0,qid,text,label
0,8eeb2fa6a60d93c5ce3c,"What is the difference between real, true, act...",0
1,420689e2da77a9254362,Can an applicant with JEST score in PhD merit ...,0
2,b4d8ce47a727326a8916,How can I get the syllabus of JEE Advanced for...,0
3,197be9aa5fbef88659ee,What is fee structure of KLE Sheshagiri Colleg...,0
4,a05cc194d3952fce5856,How do I pick few wines that will go with food...,0


In [5]:
# Read the labelled tweets and discard the leftover index columns in one
# chained expression (no in-place mutation of the frame).
twitter = pd.read_excel('twitter2_data.xlsx').drop(
    columns=['level_0', 'index', 'Unnamed: 0']
)
twitter.head()

Unnamed: 0,_id,text,label
0,5e4ce1ac5ca387d4c86d31d9,RT @fairbairn77: I'm not sexist or anything bu...,1
1,5e4cebeb5ca387d4c86d4ec1,RT @colonelkickhead: Apparently walking a catw...,0
2,5e4d048f5ca387d4c86d8084,RT @athenahollow: @freebsdgirl He REALLY picke...,0
3,5e4ce2125ca387d4c86d32a6,@JohnJohnisKilla Call me sexist or whatever it...,1
4,5e4ceb135ca387d4c86d4cd1,I don't understand mean girls. And certainly n...,0


In [10]:
wiki=pd.read_excel('wiki.xlsx')
wiki.head(5)

Unnamed: 0,id,text,label
0,4873f24af8928f39,Oh BedWetter... please stop your furious back-...,1
1,aa343515b55c73b8,"There's no conflict, a court martial can end w...",0
2,c7888738107f928a,what????how are the ones from willking1979's t...,0
3,48a42a867206a816,"""\n\nabuse or consentual??\n\n""""Homolka would ...",0
4,2c103909d79f9876,"""==Talk:""""Lane Splitting""""==\n\nDennis, why do...",1


In [7]:
wiki.shape

(10204, 3)

In [8]:
twitter.shape

(10841, 3)

In [9]:
quora.shape

(13842, 3)

In [0]:
np.mean(twitter['label'])

In [0]:
np.mean(quora['label'])

In [0]:
# Share of positive labels in the wiki dataset (twitter and quora are
# covered by the two cells above).
np.mean(wiki['label'])

# Extracting text and labels

In [0]:
twitter_text=twitter['text']
twitter_labels=twitter['label']

In [0]:
quora_text=quora['text']
quora_labels = quora['label']

In [0]:
wiki_text=wiki['text']
wiki_labels=wiki['label']

# Cleaning and preprocessing text

In [13]:
%%time
twitter_text = TextPreprocessor(n_jobs=-1).transform(twitter['text'])

CPU times: user 1min 53s, sys: 885 ms, total: 1min 54s
Wall time: 1min 54s


In [0]:
%%time
quora_text = TextPreprocessor(n_jobs=-1).transform(quora['text'])

In [14]:
%%time
wiki_text = TextPreprocessor(n_jobs=-1).transform(wiki['text'])

CPU times: user 2min 58s, sys: 1.23 s, total: 2min 59s
Wall time: 2min 59s


# Splitting the text into tokens

In [0]:
# Whitespace-tokenize every preprocessed tweet into a list of tokens.
twitter_text2 = [document.split() for document in twitter_text]

In [0]:
# Whitespace-tokenize every preprocessed Quora question into a list of tokens.
quora_text2 = [document.split() for document in quora_text]

In [0]:
# Whitespace-tokenize every preprocessed wiki comment into a list of tokens.
wiki_text2 = [document.split() for document in wiki_text]

# Loading GloVe embeddings and building the embedding dictionary

In [33]:
print('Loading word vectors')
embeddings_index = {} # We create a dictionary of word -> embedding
# In the dataset, each line represents a new word embedding
# The line starts with the word and the embedding values follow
# The context manager closes the file even if parsing a line fails, and the
# explicit encoding keeps the read independent of the platform default.
with open('/content/drive/My Drive/minor2/twitter2/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values: # Skip blank lines instead of failing on values[0]
            continue
        word = values[0] # The first value is the word, the rest are the values of the embedding
        values = np.asarray(values[1:], dtype='float32') # Load embedding
        embeddings_index[word] = values # Add embedding to our embedding dictionary

print('Found %s word vectors.' % len(embeddings_index))

Loading word vectors


400001it [00:16, 24905.67it/s]

Found 400001 word vectors.





In [34]:
# np.stack expects a sequence of arrays; a dict view is only an iterable and
# triggers a deprecation warning, so materialize it as a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean = all_embs.mean() # Calculate mean
emb_std = all_embs.std() # Calculate standard deviation
emb_mean,emb_std

  if self.run_code(code, result):


(0.0044520576, 0.40815717)

# Tokenizing and padding the texts

In [21]:
# Fit a vocabulary on the tokenized tweets and map every tweet to integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(twitter_text2)
sequences = tokenizer.texts_to_sequences(twitter_text2)
word_index = tokenizer.word_index
# Use %-formatting so the count is interpolated into the message rather than
# printed after a literal "%s".
print('Found %s unique tokens.' % len(word_index))
# Pad every sequence to the token count of the longest preprocessed tweet.
twitter_maxlen = max(len(s.split()) for s in twitter_text)
tweets_pad = pad_sequences(sequences, maxlen=twitter_maxlen)
print(tweets_pad.shape)


Found %s unique tokens. 14884
(10841, 27)


In [23]:
# Fit a vocabulary on the tokenized Quora questions and map each to integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(quora_text2)
sequences = tokenizer.texts_to_sequences(quora_text2)
word_index = tokenizer.word_index
# Use %-formatting so the count is interpolated into the message rather than
# printed after a literal "%s".
print('Found %s unique tokens.' % len(word_index))
# Pad every sequence to the token count of the longest preprocessed question.
quora_maxlen = max(len(s.split()) for s in quora_text)
quora_pad = pad_sequences(sequences, maxlen=quora_maxlen)
print(quora_pad.shape)


Found %s unique tokens. 28110
(13842, 58)


In [0]:
# Fit a vocabulary on the tokenized wiki comments and map each to integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(wiki_text2)
sequences = tokenizer.texts_to_sequences(wiki_text2)
word_index = tokenizer.word_index
# Use %-formatting so the count is interpolated into the message rather than
# printed after a literal "%s".
print('Found %s unique tokens.' % len(word_index))
# Pad every sequence to the token count of the longest preprocessed comment.
wiki_maxlen = max(len(s.split()) for s in wiki_text)
wiki_pad = pad_sequences(sequences, maxlen=wiki_maxlen)
print(wiki_pad.shape)

# Making dataframes

In [0]:
twitter_df=pd.concat([pd.DataFrame(twitter_text),pd.DataFrame(tweets_pad)],axis=1)

In [0]:
quora_df=pd.concat([pd.DataFrame(quora_text),pd.DataFrame(quora_pad)],axis=1)

In [0]:
wiki_df=pd.concat([pd.DataFrame(wiki_text),pd.DataFrame(wiki_pad)],axis=1)

# Embedding matrix

TypeError: ignored

# Co-training iterations

In [0]:
quora=pd.read_excel('quora.xlsx')

In [0]:
# Reload the raw tweets and discard the leftover index columns in one
# chained expression (no in-place mutation of the frame).
twitter = pd.read_excel('twitter2_data.xlsx').drop(
    columns=['level_0', 'index', 'Unnamed: 0']
)

In [0]:
wiki=pd.read_excel('wiki.xlsx')

In [0]:
quora.columns = twitter.columns
wiki.columns = twitter.columns

In [0]:
def LoadGlove(path='/content/drive/My Drive/minor2/twitter2/glove.6B.100d.txt'):
  """Read a GloVe text file into a ``{word: vector}`` dictionary.

  Each non-blank line is expected to hold a word followed by its embedding
  values, separated by whitespace.

  Args:
    path: Location of the GloVe text file. Defaults to the 100-dimensional
      file on the mounted Drive, so existing ``LoadGlove()`` calls are
      unaffected.

  Returns:
    dict mapping each word to a float32 numpy array of its embedding values.
  """
  print('Loading word vectors')
  embeddings_index = {}
  # The context manager closes the file even if a malformed line raises, and
  # the explicit encoding keeps the read independent of the platform default.
  with open(path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
      values = line.split()
      if not values:  # skip blank lines instead of failing on values[0]
        continue
      word = values[0]
      embeddings_index[word] = np.asarray(values[1:], dtype='float32')
  print('Found %s word vectors.' % len(embeddings_index))
  return embeddings_index

In [0]:
def make_embedding_matrix(tokenizer,emb_mean,emb_std,embeddings_index,embedding_dim=100):
  """Build the initial weight matrix for an embedding layer.

  Rows are first drawn from a normal distribution N(emb_mean, emb_std); every
  word that has a pretrained vector in ``embeddings_index`` then gets its row
  overwritten with that vector.

  Args:
    tokenizer: Object exposing ``word_index``, a dict of word -> integer index.
    emb_mean: Mean of the normal distribution used for random initialisation.
    emb_std: Standard deviation of that distribution.
    embeddings_index: dict of word -> pretrained vector of length
      ``embedding_dim``.
    embedding_dim: Number of columns of the matrix (default 100).

  Returns:
    numpy array of shape ``(len(tokenizer.word_index) + 1, embedding_dim)``.
  """
  word_index = tokenizer.word_index
  # One extra row: presumably the tokenizer's indices start at 1, which leaves
  # row 0 for the padding value (TODO confirm against the tokenizer in use).
  num_words = len(word_index)+1
  embedding_matrix = np.random.normal(emb_mean, emb_std, (num_words, embedding_dim))
  for word, i in word_index.items():
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
          # Known word: replace the random row with the pretrained vector.
          embedding_matrix[i] = embedding_vector
  # Hand the matrix back to the caller; without this the function yields None.
  return embedding_matrix

In [0]:
def CoTrain(Train,Test):
  """Set up the inputs for one co-training round (unfinished stub).

  Loads the GloVe vectors and pulls the ``'text'`` and ``'label'`` columns out
  of ``Train`` and ``Test``. Nothing is trained or returned yet.

  Args:
    Train: Table with ``'text'`` and ``'label'`` columns used for training.
    Test: Table with ``'text'`` and ``'label'`` columns used for evaluation.
  """
  embeddings_index = LoadGlove()
  # Split both tables into their text and label columns.
  train_text, train_labels = Train['text'], Train['label']
  test_text, test_labels = Test['text'], Test['label']



In [85]:
from sklearn.utils import shuffle

RANDOM_STATE = 42  # fixed seed so the sampled split and the shuffles are reproducible

x = twitter
# Treat 20% of quora as the pool fed into the training set chunk by chunk (yl);
# the remaining 80% (yu) is the held-out part.
yl = quora.sample(frac=0.2, random_state=RANDOM_STATE)
yu = quora.drop(list(yl.index))
K = 5
r = len(yl)//K  # rows moved from yl into x per iteration
# K+1 iterations: the extra pass picks up the remainder left by the integer
# division above.
for k in range(K+1):
  # TODO: run CoTrain(x, yu) here once CoTrain is implemented.
  z = yl.iloc[:r,:]
  yl = yl.drop(list(z.index))
  # shuffle() returns a shuffled copy rather than working in place, so its
  # result has to be assigned back.
  x = shuffle(pd.concat([x,z],axis=0), random_state=RANDOM_STATE)
  x = x.reset_index(drop=True)

(10841, 3)
(2215, 3)
(11394, 3)
(1662, 3)
(11947, 3)
(1109, 3)
(12500, 3)
(556, 3)
(13053, 3)
(3, 3)
(13606, 3)
(0, 3)
