In [None]:
# Installing the transformers library and additional libraries if looking process 

!pip install -q transformers

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

[K     |████████████████████████████████| 1.3MB 5.9MB/s 
[K     |████████████████████████████████| 1.1MB 29.5MB/s 
[K     |████████████████████████████████| 890kB 52.9MB/s 
[K     |████████████████████████████████| 2.9MB 54.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/My\ Drive/data
!ls

In [None]:
# Importing stock ml libraries
import pickle

import numpy as np
import pandas as pd
import torch
import transformers
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the training CSV and collapse every column from the third one onward
# into a single per-row Python list stored under the 'list' column.
df = pd.read_csv("train.csv")
df['list'] = df.iloc[:, 2:].to_numpy().tolist()

# Keep only the text and the combined label list for the rest of the notebook.
new_df = df.loc[:, ['comment_text', 'list']].copy()
new_df.head()

Unnamed: 0,comment_text,list
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200            # max_length handed to the tokenizer via CustomDataset
TRAIN_BATCH_SIZE = 8     # batch size of the training DataLoader
VALID_BATCH_SIZE = 4     # batch size of the testing DataLoader
EPOCHS = 1               # number of passes made by the training loop
LEARNING_RATE = 1e-05    # learning rate passed to the Adam optimizer
# Pretrained WordPiece tokenizer matching the 'bert-base-uncased' checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes comments on the fly.

    Args:
        dataframe: pandas DataFrame with a ``comment_text`` column and a
            ``list`` column holding the per-row target values.
        tokenizer: object exposing ``encode_plus`` (e.g. ``BertTokenizer``).
        max_len: length every encoded sequence is truncated/padded to.

    Each item is a dict with ``ids``, ``mask`` and ``token_type_ids`` (long
    tensors built from the tokenizer output) and ``targets`` (float tensor).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe['comment_text']
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        # Positional access: the DataLoader hands out positions 0..len-1, which
        # only coincide with index labels when the frame has a default index.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse all runs of whitespace (including newlines) to single spaces.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit truncation/padding replaces the deprecated
            # `pad_to_max_length=True` and silences the truncation warning.
            truncation=True,
            padding='max_length',
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Split the frame into train/test partitions and wrap each one in a CustomDataset

train_size = 0.8
# Sample the training rows reproducibly; whatever is left becomes the test set.
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (159571, 2)
TRAIN Dataset: (127657, 2)
TEST Dataset: (31914, 2)


In [None]:
# Keyword arguments for the two DataLoaders; both shuffle and load in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model by adding a dropout and a dense layer on top of BERT to get the final output for the model.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a 768 -> 6 linear classification head.

    ``forward`` returns raw logits of shape (batch, 6); no sigmoid is applied
    here, so pair it with a logits-based loss or apply ``torch.sigmoid``.
    """

    def __init__(self):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        bert_outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Integer indexing works whether the backbone returns a plain tuple or
        # a ModelOutput; tuple-unpacking a ModelOutput would yield its *keys*.
        # Position 1 is the pooled output.
        pooled_output = bert_outputs[1]
        dropped = self.l2(pooled_output)
        return self.l3(dropped)

# Instantiate the classifier and move its parameters to the selected device.
model = BERTClass()
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between raw logits and 0/1 targets."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Adam over every parameter of the model (BERT encoder and classification head).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``. The loss of every 5000th batch (starting with the first) is
    printed for progress.

    Args:
        epoch: value shown in the progress message; it does not otherwise
            affect training.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear gradients left over from the previous step exactly once,
        # right before back-propagating the new loss.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Train for the configured number of epochs; train() prints the loss periodically.
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.6978594660758972
Epoch: 0, Loss:  0.0018328627338632941
Epoch: 0, Loss:  0.003809504210948944
Epoch: 0, Loss:  0.0637228935956955


In [None]:
# Replace the in-memory model with a previously pickled one from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load "model.pickle" if it was produced by a trusted run of this notebook.
# NOTE(review): no cell visible in this notebook writes "model.pickle" - confirm
# where it comes from.
with open("model.pickle", "rb") as pickle_in:  # context manager closes the file handle
    model = pickle.load(pickle_in)

__main__.BERTClass

In [None]:
def validation(epoch):
    """Score every batch of ``testing_loader`` with the notebook-level ``model``.

    The model is switched to eval mode and gradients are disabled.

    Args:
        epoch: accepted for symmetry with ``train``; not used.

    Returns:
        Tuple ``(fin_outputs, fin_targets)``: two lists with one entry per
        sample, holding the sigmoid probabilities and the target values.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            # The model emits logits; convert them to probabilities here.
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
# Score the test split and report exact-match accuracy plus micro/macro F1
# after thresholding the probabilities at 0.5.
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.asarray(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro, f1_score_macro = (
        metrics.f1_score(targets, outputs, average=averaging)
        for averaging in ('micro', 'macro')
    )
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
# NOTE(review): each statement below ends with a trailing comma, so the name is
# bound to a 1-element tuple containing the tensor, not to the tensor itself.
# NOTE(review): in the cells visible above, `ids`, `mask` and `token_type_ids`
# are only assigned inside functions, so on a fresh kernel this cell appears to
# run before they exist at notebook level - confirm whether it can be deleted
# (the same conversion is done, without the commas, in the inference cell below).
ids=torch.tensor(ids, dtype=torch.long),
mask=torch.tensor(mask, dtype=torch.long),
token_type_ids=torch.tensor(token_type_ids, dtype=torch.long),
           

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Encode one sample comment the same way CustomDataset encodes training rows.
comment_text='COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'
comment_text = " ".join(comment_text.split())

inputs = tokenizer.encode_plus(
    comment_text,
    None,
    add_special_tokens=True,
    max_length=200,
    # Explicit truncation/padding replaces the deprecated `pad_to_max_length=True`
    # and removes the "Truncation was not explicitly activated" warning.
    truncation=True,
    padding='max_length',
    return_token_type_ids=True
)
ids = inputs['input_ids']
mask = inputs['attention_mask']
token_type_ids = inputs["token_type_ids"]

# 1-D long tensors; the batch dimension is added in the next cell.
ids=torch.tensor(ids, dtype=torch.long)
mask=torch.tensor(mask, dtype=torch.long)
token_type_ids=torch.tensor(token_type_ids, dtype=torch.long)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Give each encoded sequence a leading batch dimension of 1.
# reshape(1, -1) keeps exactly the existing elements and is safe to re-run,
# whereas the low-level in-place resize_ can expose uninitialised memory if
# the element count ever differs from the hard-coded size.
ids = ids.reshape(1, -1)
mask = mask.reshape(1, -1)
token_type_ids = token_type_ids.reshape(1, -1)

In [None]:
# Inference must run in eval mode: the model contains a Dropout layer, which in
# training mode makes the predicted probabilities vary between runs.
model.eval()
with torch.no_grad():  # no gradients are needed for prediction
    outputs = model(
        ids.to(device, dtype=torch.long),
        mask.to(device, dtype=torch.long),
        token_type_ids.to(device, dtype=torch.long),
    )
# Convert logits to per-label probabilities as a nested Python list.
x = torch.sigmoid(outputs).cpu().detach().numpy().tolist()

In [None]:
x

[[0.4231421947479248,
  0.40590953826904297,
  0.5346567034721375,
  0.3510940670967102,
  0.4764299988746643,
  0.5347267389297485]]

In [None]:
# Turn the probabilities into boolean decisions using a 0.5 cut-off.
x = np.asarray(x) >= 0.5

In [None]:
# Keep only the first row (the single sample in the batch).
# NOTE(review): this rebinds `x` to its own first element, so running the cell
# a second time indexes again and no longer yields the per-label row.
x=x[0]

In [None]:
# Label names in the order of the model's six outputs ('severe_toxic' was
# previously misspelled 'severe_tocix').
l = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Print the name of every label whose decision in `x` is true.
for label, flagged in zip(l, x):
    if flagged:
        print(label)
    

obscene
identity_hate


In [None]:
x

array([ True, False,  True, False,  True, False])

In [None]:
def query(model, text, tokenizer=None, max_len=200, threshold=0.5):
    """Classify a single comment and report probabilities and predicted labels.

    Args:
        model: ``torch.nn.Module`` called as ``model(ids, mask, token_type_ids)``
            that returns one row of six logits, in the label order used below.
        text: raw comment string; runs of whitespace are collapsed before encoding.
        tokenizer: optional object exposing ``encode_plus``. When omitted, the
            'bert-base-uncased' tokenizer is loaded (slow - pass one in to reuse it).
        max_len: sequence length the input is truncated/padded to.
        threshold: minimum probability for a label to count as predicted.

    Returns:
        dict with ``'probablity'`` (nested list of sigmoid probabilities, one
        row per input) and ``'predicted'`` (list of label names at or above
        ``threshold``). The key spelling is kept for existing callers.
    """
    if tokenizer is None:
        from transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Defined locally so the function does not depend on a notebook-level list.
    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    comment_text = " ".join(text.split())
    inputs = tokenizer.encode_plus(
        comment_text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_token_type_ids=True,
    )

    # Run on whatever device the model already lives on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    def _as_batch(values):
        # Long tensor with a leading batch dimension of 1.
        return torch.tensor(values, dtype=torch.long, device=target_device).unsqueeze(0)

    ids = _as_batch(inputs['input_ids'])
    mask = _as_batch(inputs['attention_mask'])
    token_type_ids = _as_batch(inputs['token_type_ids'])

    # Disable dropout for the prediction, then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(ids, mask, token_type_ids)
    finally:
        model.train(was_training)

    x = torch.sigmoid(outputs).cpu().tolist()
    ans = [label for label, prob in zip(labels, x[0]) if prob >= threshold]

    store = dict()
    store['probablity'] = x
    store['predicted'] = ans
    return store
    


In [None]:
# Example call: classify one sample comment with the current notebook-level model.
query(model=model,text='COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK')

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'predicted': ['obscene', 'insult', 'identity_hate'],
 'probablity': [[0.4176866114139557,
   0.43764564394950867,
   0.5079596042633057,
   0.4370192885398865,
   0.509920060634613,
   0.5513231754302979]]}

In [None]:
# Ask the Colab front-end for the proxy URL of port 5000 and print it.
# NOTE(review): presumably used to reach the Flask app started below, which the
# recorded output shows listening on port 5000 - confirm.
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(5000)"))

https://gsbkiawaq8e-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 4.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 19.2MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 29.1MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[

In [None]:
# Imports for the Flask serving cell, grouped standard library first, then third party.
import pickle

import numpy as np
import pandas as pd
import torch
import transformers
from flask import Flask, request
from sklearn import metrics
from torch import cuda
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# WSGI application object that the route decorators below attach to.
app = Flask(__name__)

# Prefer the GPU when CUDA is available.
device = 'cuda' if cuda.is_available() else 'cpu'

def query(model, text, tokenizer=None, max_len=200, threshold=0.5):
    """Classify a single comment and report probabilities and predicted labels.

    Args:
        model: ``torch.nn.Module`` called as ``model(ids, mask, token_type_ids)``
            that returns one row of six logits, in the label order used below.
        text: raw comment string; runs of whitespace are collapsed before encoding.
        tokenizer: optional object exposing ``encode_plus``. When omitted, the
            'bert-base-uncased' tokenizer is loaded (slow - pass one in to reuse it).
        max_len: sequence length the input is truncated/padded to.
        threshold: minimum probability for a label to count as predicted.

    Returns:
        dict with ``'probablity'`` (nested list of sigmoid probabilities, one
        row per input) and ``'predicted'`` (list of label names at or above
        ``threshold``). The key spelling is kept for existing callers.
    """
    if tokenizer is None:
        from transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # 'severe_toxic' was previously misspelled 'severe_tocix'.
    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    comment_text = " ".join(text.split())
    inputs = tokenizer.encode_plus(
        comment_text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_token_type_ids=True,
    )

    # Run on whatever device the model already lives on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    def _as_batch(values):
        # Long tensor with a leading batch dimension of 1.
        return torch.tensor(values, dtype=torch.long, device=target_device).unsqueeze(0)

    ids = _as_batch(inputs['input_ids'])
    mask = _as_batch(inputs['attention_mask'])
    token_type_ids = _as_batch(inputs['token_type_ids'])

    # Disable dropout for the prediction, then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(ids, mask, token_type_ids)
    finally:
        model.train(was_training)

    x = torch.sigmoid(outputs).cpu().tolist()
    ans = [label for label, prob in zip(labels, x[0]) if prob >= threshold]

    store = dict()
    store['probablity'] = x
    store['predicted'] = ans
    return store
    

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a 768 -> 6 linear classification head."""

    def __init__(self):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        # Index 1 is the pooled output for both tuple and ModelOutput returns.
        pooled_output = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)[1]
        return self.l3(self.l2(pooled_output))


# Text classified when the request carries no `text` query parameter.
_DEFAULT_CHECK_TEXT = 'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'

# Built on first use and then shared by all requests.
_check_model = None


def _get_check_model():
    """Return the shared model instance, building it on the first call."""
    global _check_model
    if _check_model is None:
        # NOTE(review): only the pretrained BERT encoder weights are loaded
        # here; the linear head is freshly initialised, so predictions are not
        # meaningful until fine-tuned weights are loaded. TODO: load them.
        built = BERTClass()
        built.to(device)
        built.eval()  # disable dropout for serving
        _check_model = built
    return _check_model


@app.route('/check',methods=['GET'])
def hello():
    """Classify the `text` query parameter and return query()'s result dict.

    Falls back to the built-in sample text when no `text` parameter is sent,
    so a bare ``GET /check`` behaves as before.
    """
    text = request.args.get('text', _DEFAULT_CHECK_TEXT)
    ans = query(model=_get_check_model(), text=text)
    return ans
        

    

@app.route('/')
def test():
    """Liveness check: respond to ``GET /`` with a fixed greeting."""
    return "hello world"
# Start Flask's built-in server when this cell/script is executed directly.
# The call blocks until the server is stopped; the recorded output shows it
# listening on http://127.0.0.1:5000/.
if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [03/Nov/2020 17:15:14] "[37mGET / HTTP/1.1[0m" 200 -


gi


127.0.0.1 - - [03/Nov/2020 17:15:16] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


gi


Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
127.0.0.1 - - [03/Nov/2020 17:15:23] "[37mGET /check HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Nov/2020 17:15:24] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


gi


Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
127.0.0.1 - - [03/Nov/2020 17:36:29] "[37mGET /check HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Nov/2020 17:36:31] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


In [None]:
# Label names in the order of the model's six outputs ('severe_toxic' was
# previously misspelled 'severe_tocix').
l = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']