In [1]:
import torch
from torch import nn

In [2]:
import sys
sys.argv=['']
del sys
cols = ['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE', 'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION', 'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK', 'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT', 'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS', 'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS', 'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'TRAVEL', 'WEDDINGS', 'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']

In [3]:
from pytorch_pretrained_bert.tokenization import BertTokenizer, WordpieceTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining,BertPreTrainedModel, BertModel, BertConfig, BertForMaskedLM, BertForSequenceClassification
from pathlib import Path
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
import re
from torch import Tensor
import torch.nn.functional as F
#from fastai.text import Tokenizer, Vocab
import pandas as pd
import collections
import os
import pdb
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import csv
from sklearn.model_selection import train_test_split
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer, label_binarize
import argparse
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.optimization import BertAdam
from tqdm.auto import tqdm
import logging

In [4]:
parser = argparse.ArgumentParser(description='BERT Fine-Training')
parser.add_argument(
    '--fine_tune', '-ft', action='store_true', help='Fine-Tune on dataset')
parser.add_argument(
    '--resume', '-r', action='store_true', help='resume from checkpoint')
parser.add_argument(
    '--eval', '-e', action='store_true', help='Run eval on checkpoint')
parser.add_argument(
    '--predict', '-p', action='store_true', help='Run predictions on checkpoint')
arg = parser.parse_args()

if not os.path.exists('Data'):
    os.makedirs('Data')
if not os.path.exists('Data/output'):
    os.makedirs('Data/output')

#Read dataset
df3 = pd.read_csv('Data/cleaned.csv')

#Splitting data into train, test and val
X=df3.sample(frac=0.5,random_state=200)
test=df3.drop(X.index)
train=X.sample(frac=0.6,random_state=200)
val=X.drop(train.index)

print("Train Shape", train.shape)
print("Val Shape", val.shape)
print("Test Shape", test.shape)
train.to_csv('Data/train.csv',encoding='utf-8', index=False)
val.to_csv('Data/val.csv',encoding='utf-8', index=False)
test.to_csv('Data/test.csv',encoding='utf-8', index=False)

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

Train Shape (59974, 3)
Val Shape (39983, 3)
Test Shape (99957, 3)


In [5]:
DATA_PATH=Path('Data/')
DATA_PATH.mkdir(exist_ok=True)

CLAS_DATA_PATH=DATA_PATH/'output'
CLAS_DATA_PATH.mkdir(exist_ok=True)

model_state_dict = None
#Set this according to your dataset
input_col = 2 # 0 based index column input to the model
output_col = 1 # 0 based index column label to the model

BERT_PRETRAINED_PATH = Path('uncased_L-12_H-768_A-12')

In [6]:
args = {
    "train_size": -1,
    "val_size": -1,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "news_cat_label",
    "no_cuda": False,
    "bert_model": "bert-base-uncased",
    "output_dir": CLAS_DATA_PATH,
    "max_seq_length": 50,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 15,
    "eval_batch_size": 8,
    "learning_rate": 3e-5,
    "num_train_epochs": 4.0,
    "warmup_proportion": 0.1,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "loss_scale": 128
}
output_model_file = os.path.join(args['output_dir'], "ckpt.t7")

In [7]:
class BertForSequenceClassification(BertPreTrainedModel):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `num_labels`: the number of classes for the classifier. Default = 2.
    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
            with indices selected in [0, ..., num_labels].
    Outputs:
        if `labels` is not `None`:
            Outputs the CrossEntropy classification loss of the output with the labels.
        if `labels` is `None`:
            Outputs the classification logits of shape [batch_size, num_labels].
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    num_labels = 2
    model = BertForSequenceClassification(config, num_labels)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config, num_labels):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits
        
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            print(param)
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True


In [8]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, labels=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            labels: (Optional) [string]. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.labels = labels


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids

class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()
    
    def get_test_examples(self, data_dir, data_file_name, size=-1):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError() 

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

class LabelTextProcessor(DataProcessor):
    
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.labels = None
    
    
    def get_train_examples(self, data_dir, size=-1):
        filename = 'train.csv'
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, filename)))
        if size == -1:
            data_df = pd.read_csv(os.path.join(data_dir, filename),engine=None)
            return self._create_examples(data_df, "train")
        else:
            data_df = pd.read_csv(os.path.join(data_dir, filename))
            return self._create_examples(data_df.sample(size), "train")
        
    def get_dev_examples(self, data_dir, size=-1):
        """See base class."""
        filename = 'val.csv'
        if size == -1:
            data_df = pd.read_csv(os.path.join(data_dir, filename))
            return self._create_examples(data_df, "dev")
        else:
            data_df = pd.read_csv(os.path.join(data_dir, filename))
            return self._create_examples(data_df.sample(size), "dev")
    
    def get_test_examples(self,msg, data_dir, data_file_name, size=-1):
        
        data = [['1', 'ENTERTAINMENT',msg]]
        #text2='7 get Covaxin booster shot as study begins on lifelong immunity'
        df1 = pd.DataFrame(data, columns = ['Unnamed', 'category','text'])
        if size == -1:
            return self._create_examples(df1, "test")
        else:
            return self._create_examples(df1.sample(size), "test")

    def get_labels(self):
        """See base class."""
        return ['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE', 'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION', 'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK', 'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT', 'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS', 'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS', 'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'TRAVEL', 'WEDDINGS', 'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']
        #return ["0", "1"]

    def _create_examples(self, df, set_type, labels_available=True):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, row) in enumerate(df.values):
            guid = "%s-%s" % (set_type, i)
            text_a = row[input_col]
            if labels_available:
                labels = row[output_col]
                #print('hiiiii',labels)
            else:
                labels = []
                print("No Label Found")
            examples.append(
                InputExample(guid=guid, text_a=text_a, labels=labels))
        return examples

def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}
    
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        labels_ids = label_map[example.labels]
        if ex_index < 0:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %s)" % (example.labels, labels_ids))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_ids=labels_ids))
    return features

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


In [9]:
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

def accuracy_thresh(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, sigmoid:bool=True):
    "Compute accuracy when `y_pred` and `y_true` are the same size."
    if sigmoid: y_pred = y_pred.sigmoid()
#     return ((y_pred>thresh)==y_true.byte()).float().mean().item()
    return np.mean(((y_pred>thresh)==y_true.byte()).float().cpu().numpy(), axis=1).sum()


def fbeta(y_pred:Tensor, y_true:Tensor, thresh:float=0.2, beta:float=2, eps:float=1e-9, sigmoid:bool=True):
    "Computes the f_beta between `preds` and `targets`"
    beta2 = beta ** 2
    if sigmoid: y_pred = y_pred.sigmoid()
    y_pred = (y_pred>thresh).float()
    y_true = y_true.float()
    TP = (y_pred*y_true).sum(dim=1)
    prec = TP/(y_pred.sum(dim=1)+eps)
    rec = TP/(y_true.sum(dim=1)+eps)
    res = (prec*rec)/(prec*beta2+rec+eps)*(1+beta2)
    return res.mean().item()

"""## Training warmup"""

def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

processors = {
    "news_cat_label": LabelTextProcessor
}

# Setup GPU parameters

if args["local_rank"] == -1 or args["no_cuda"]:
    device = torch.device("cuda" if torch.cuda.is_available() and not args["no_cuda"] else "cpu")
    n_gpu = torch.cuda.device_count()
#     n_gpu = 1
else:
    torch.cuda.set_device(args['local_rank'])
    device = torch.device("cuda", args['local_rank'])
    n_gpu = 1
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args['local_rank'] != -1), args['fp16']))

args['train_batch_size'] = int(args['train_batch_size'] / args['gradient_accumulation_steps'])

random.seed(args['seed'])
np.random.seed(args['seed'])
torch.manual_seed(args['seed'])
if n_gpu > 0:
    torch.cuda.manual_seed_all(args['seed'])

task_name = args['task_name'].lower()

if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

processor = processors[task_name](args['data_dir'])
label_list = processor.get_labels()
num_labels = len(label_list)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=args['do_lower_case'])




05/27/2021 10:05:28 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False
05/27/2021 10:05:29 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\jitesh\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [10]:
def get_model():
    if model_state_dict:
        model = BertForSequenceClassification.from_pretrained(args['bert_model'], num_labels = num_labels, state_dict=model_state_dict)
    else:
        model = BertForSequenceClassification.from_pretrained(args['bert_model'], num_labels = num_labels)
    return model

model = get_model()
model.to(device)

05/27/2021 10:05:35 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\jitesh\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
05/27/2021 10:05:35 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\jitesh\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir C:\Users\jitesh\AppData\Local\Temp\tmpbgyu7jgs
05/27/2021 10:05:40 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [11]:
import torch.nn.functional as F
def predict(model, path,msg, test_filename='test1.csv'):
    predict_processor = LabelTextProcessor(path)
    print(msg)
    test_examples = predict_processor.get_test_examples(msg,path, test_filename, size=-1)
    
    print(test_examples)
    # Hold input data for returning it 
    input_data = [{ 'id': input_example.guid, 'comment_text': input_example.text_a } for input_example in test_examples]

    test_features = convert_examples_to_features(
        test_examples, label_list, args['max_seq_length'], tokenizer)
    print(test_examples)
    print(args['max_seq_length'])
    
    logger.info("***** Running prediction *****")
    logger.info("  Num examples = %d", len(test_examples))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    
    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    #print(all_input_mask)
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    
    # Run prediction for full data
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args['eval_batch_size'])
    
    all_logits = None
    
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for step, batch in enumerate(tqdm(test_dataloader, desc="Prediction Iteration")):
        input_ids, input_mask, segment_ids = batch
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
            logits = F.softmax(logits, -1)
            #print(logits)

        if all_logits is None:
            all_logits = logits.detach().cpu().numpy()
        else:
            all_logits = np.concatenate((all_logits, logits.detach().cpu().numpy()), axis=0)
            
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    return pd.merge(pd.DataFrame(input_data), pd.DataFrame(all_logits, columns=label_list), left_index=True, right_index=True)

In [12]:
checkpoint = torch.load(output_model_file)


In [13]:
model.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [22]:
msg="Brazil health regulator rejects Russia's Sputnik vaccine"

In [23]:
result = predict(model, DATA_PATH,msg)

Brazil health regulator rejects Russia's Sputnik vaccine
[<__main__.InputExample object at 0x0000028601DAA208>]
[<__main__.InputExample object at 0x0000028601DAA208>]
50


04/27/2021 15:54:11 - INFO - __main__ -   ***** Running prediction *****
04/27/2021 15:54:11 - INFO - __main__ -     Num examples = 1
04/27/2021 15:54:11 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=1, style=ProgressStyle(description…




In [24]:
a=result[cols]
b=a.idxmax(axis=1, skipna=True)
b[0]

'WORLDPOST'

In [25]:
#a

Unnamed: 0,ARTS,ARTS & CULTURE,BLACK VOICES,BUSINESS,COLLEGE,COMEDY,CRIME,CULTURE & ARTS,DIVORCE,EDUCATION,...,STYLE & BEAUTY,TASTE,TECH,TRAVEL,WEDDINGS,WEIRD NEWS,WELLNESS,WOMEN,WORLD NEWS,WORLDPOST
0,0.000636,0.001057,0.000484,0.004217,0.000268,0.001465,0.001049,8.7e-05,0.000561,0.000502,...,6e-05,0.001148,0.001574,0.000573,0.000167,0.000726,0.007868,0.00113,0.030361,0.699646


In [98]:
#path=DATA_PATH

In [100]:
#a

[<__main__.InputExample at 0x2240baac6c8>]

In [92]:
#test_filename='test1.csv'
#data_file_name=test_filename

In [48]:
#msg='7 get Covaxin booster shot as study begins on lifelong immunity'

In [18]:
'''def get_text():
    text3=text2
    return text3'''

In [19]:
'''def _create_examples(self, df, set_type, labels_available=True):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, row) in enumerate(df.values):
            guid = "%s-%s" % (set_type, i)
            text_a = row[input_col]
            if labels_available:
                labels = row[output_col]
                #print('hiiiii',labels)
            else:
                labels = []
                print("No Label Found")
            examples.append(
                InputExample(guid=guid, text_a=text_a, labels=labels))
        return examples'''

In [102]:
'''def get_test_examples(self,msg, data_dir, data_file_name, size=-1):
    data = [['1', 'ENTERTAINMENT',msg],['2', 'ENTERTAINMENT',msg]]
       
    df1 = pd.DataFrame(data, columns = ['Unnamed', 'category','text'])
    print(df1)
    if size == -1:
        return self._create_examples(df1, "test")
    else:
        return self._create_examples(df1.sample(size), "test")'''

In [20]:
#data_df = pd.read_csv(os.path.join(DATA_PATH, 'test1.csv'))

In [58]:
#data

NameError: name 'data' is not defined

In [49]:
#text1='hi i am jitesh'

In [54]:
#df1 = pd.DataFrame(data, columns = ['Unnamed', 'category','text'])

In [14]:
cols = ['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE', 'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION', 'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK', 'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT', 'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS', 'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS', 'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'TRAVEL', 'WEDDINGS', 'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']

In [59]:
#a=result[cols]

In [42]:
get_test_examples()

TypeError: get_test_examples() missing 3 required positional arguments: 'self', 'data_dir', and 'data_file_name'

In [61]:
#b=a.idxmax(axis=1, skipna=True)

In [1]:
message

NameError: name 'message' is not defined

In [14]:
from flask import Flask, redirect, url_for, render_template, request, session,jsonify

In [15]:
from werkzeug.wrappers import Request, Response

In [None]:
app = Flask(__name__, template_folder='./')
app.static_folder = 'static'

@app.route('/prediction', methods=['POST', 'GET'])
def prediction():
    if request.method == "POST":
        message = request.form['message']
        
        
        
        result = predict(model, DATA_PATH,message)
        a=result[cols]
        b=a.idxmax(axis=1, skipna=True)
        b[0]
        response =  b[0]
        print(response)
        msg="The news category is"
        return jsonify(msg,response)
    
    return jsonify("Input text")

@app.route('/')
def main():
    return render_template('index2.html')


if __name__ == '__main__':
    #app.run(debug=True)
    from werkzeug.serving import run_simple
    run_simple('localhost', 5000, app)
    
@app.route('/shutdown', methods=['POST'])
def shutdown():
    shutdown_server()
    return 'Server shutting down...'


05/27/2021 10:06:22 - INFO - werkzeug -    * Running on http://localhost:5000/ (Press CTRL+C to quit)
05/27/2021 10:06:28 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:06:28] "GET / HTTP/1.1" 200 -
05/27/2021 10:06:28 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:06:28] "GET /style.css HTTP/1.1" 404 -
05/27/2021 10:06:29 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:06:29] "GET /favicon.ico HTTP/1.1" 404 -
05/27/2021 10:07:37 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:07:37] "GET / HTTP/1.1" 200 -
05/27/2021 10:07:37 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:07:37] "GET /style.css HTTP/1.1" 404 -


Haryana: 84-yr-old covid patient treated with antibody cocktail, discharged
[<__main__.InputExample object at 0x000001F20DDE0E88>]
[<__main__.InputExample object at 0x000001F20DDE0E88>]
50


05/27/2021 10:07:40 - INFO - __main__ -   ***** Running prediction *****
05/27/2021 10:07:40 - INFO - __main__ -     Num examples = 1
05/27/2021 10:07:40 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=1, style=ProgressStyle(description…


HEALTHY LIVING


05/27/2021 10:07:42 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:07:42] "POST /prediction HTTP/1.1" 200 -
05/27/2021 10:08:00 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:08:00] "GET / HTTP/1.1" 200 -
05/27/2021 10:08:00 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:08:00] "GET /style.css HTTP/1.1" 404 -


Mehul Choksi arrested in Dominica, family says relieved he is safe
[<__main__.InputExample object at 0x000001F20DD94688>]
[<__main__.InputExample object at 0x000001F20DD94688>]
50


05/27/2021 10:08:03 - INFO - __main__ -   ***** Running prediction *****
05/27/2021 10:08:03 - INFO - __main__ -     Num examples = 1
05/27/2021 10:08:03 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=1, style=ProgressStyle(description…


CRIME


05/27/2021 10:08:03 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:08:03] "POST /prediction HTTP/1.1" 200 -
05/27/2021 10:08:29 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:08:29] "GET / HTTP/1.1" 200 -
05/27/2021 10:08:29 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:08:29] "GET /style.css HTTP/1.1" 404 -


Guv nod to Haryana Recovery of Damages to Property Act
[<__main__.InputExample object at 0x000001F20DE1D5C8>]
[<__main__.InputExample object at 0x000001F20DE1D5C8>]
50


05/27/2021 10:08:31 - INFO - __main__ -   ***** Running prediction *****
05/27/2021 10:08:31 - INFO - __main__ -     Num examples = 1
05/27/2021 10:08:31 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=1, style=ProgressStyle(description…


WORLD NEWS


05/27/2021 10:08:31 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:08:31] "POST /prediction HTTP/1.1" 200 -
05/27/2021 10:27:12 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:27:12] "GET / HTTP/1.1" 200 -
05/27/2021 10:27:12 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:27:12] "GET /style.css HTTP/1.1" 404 -


At auto driver’s home of hope for elderly in Bengaluru, vaccination is a stumbling block
[<__main__.InputExample object at 0x000001F20DE2CC08>]
[<__main__.InputExample object at 0x000001F20DE2CC08>]
50


05/27/2021 10:27:15 - INFO - __main__ -   ***** Running prediction *****
05/27/2021 10:27:15 - INFO - __main__ -     Num examples = 1
05/27/2021 10:27:15 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=1, style=ProgressStyle(description…


IMPACT


05/27/2021 10:27:15 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:27:15] "POST /prediction HTTP/1.1" 200 -
05/27/2021 10:27:38 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:27:38] "GET / HTTP/1.1" 200 -
05/27/2021 10:27:38 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:27:38] "GET /style.css HTTP/1.1" 404 -


LA Mayor Eric Garcetti is Joe Biden’s choice for India envoy, says report
[<__main__.InputExample object at 0x000001F20DE0FF48>]
[<__main__.InputExample object at 0x000001F20DE0FF48>]
50


05/27/2021 10:27:41 - INFO - __main__ -   ***** Running prediction *****
05/27/2021 10:27:41 - INFO - __main__ -     Num examples = 1
05/27/2021 10:27:41 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=1, style=ProgressStyle(description…


POLITICS


05/27/2021 10:27:41 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:27:41] "POST /prediction HTTP/1.1" 200 -
05/27/2021 10:56:21 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:56:21] "GET / HTTP/1.1" 200 -
05/27/2021 10:56:21 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:56:21] "GET /style.css HTTP/1.1" 404 -


Amitabh Bachchan 'Hosts' KBC on Instagram, Asks Fans to Guess His Popular Bollywood Movie
[<__main__.InputExample object at 0x000001F20DFED388>]
[<__main__.InputExample object at 0x000001F20DFED388>]
50


05/27/2021 10:56:24 - INFO - __main__ -   ***** Running prediction *****
05/27/2021 10:56:24 - INFO - __main__ -     Num examples = 1
05/27/2021 10:56:24 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=1, style=ProgressStyle(description…


ENTERTAINMENT


05/27/2021 10:56:24 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 10:56:24] "POST /prediction HTTP/1.1" 200 -
05/27/2021 11:40:45 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 11:40:45] "GET / HTTP/1.1" 200 -
05/27/2021 11:40:45 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 11:40:45] "GET /style.css HTTP/1.1" 404 -


Mehul Choksi arrested in Dominica, family says relieved he is safe
[<__main__.InputExample object at 0x000001F20DE1D0C8>]
[<__main__.InputExample object at 0x000001F20DE1D0C8>]
50


05/27/2021 11:40:48 - INFO - __main__ -   ***** Running prediction *****
05/27/2021 11:40:48 - INFO - __main__ -     Num examples = 1
05/27/2021 11:40:48 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=1, style=ProgressStyle(description…


CRIME


05/27/2021 11:40:48 - INFO - werkzeug -   127.0.0.1 - - [27/May/2021 11:40:48] "POST /prediction HTTP/1.1" 200 -


In [35]:
b

0    WORLDPOST
dtype: object

In [38]:
text2

'Kejriwal spending crores on publicity but doing nothing to augment oxygen supply in Delhi: Ajay Maken'