### Python Libraries

In [1]:
!pip install simpletransformers
import json
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import plotly.offline as py
import plotly.graph_objs as go
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
py.init_notebook_mode(connected=True)
from sklearn import preprocessing
import keras
from simpletransformers.classification import ClassificationModel
import sklearn.metrics
import torch
from sklearn.model_selection import GridSearchCV
from collections import defaultdict
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import *
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.layers import *
from keras.callbacks import *
from keras.models import load_model
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
nltk.download('punkt')

Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/57/30/9301fa1d358a2ee120767ac7e9a23fcf46a9b275c4dbc66351bea1ca86c7/simpletransformers-0.28.0-py3-none-any.whl (184kB)
[K     |█▊                              | 10kB 21.9MB/s eta 0:00:01[K     |███▌                            | 20kB 3.2MB/s eta 0:00:01[K     |█████▎                          | 30kB 4.3MB/s eta 0:00:01[K     |███████                         | 40kB 3.0MB/s eta 0:00:01[K     |████████▉                       | 51kB 3.4MB/s eta 0:00:01[K     |██████████▋                     | 61kB 4.1MB/s eta 0:00:01[K     |████████████▍                   | 71kB 4.4MB/s eta 0:00:01[K     |██████████████▏                 | 81kB 4.4MB/s eta 0:00:01[K     |████████████████                | 92kB 5.0MB/s eta 0:00:01[K     |█████████████████▊              | 102kB 5.0MB/s eta 0:00:01[K     |███████████████████▌            | 112kB 5.0MB/s eta 0:00:01[K     |█████████████████████▎         

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Importing datasets 

In [0]:
def import_data(data):
    """Split a labelled dataset mapping into an article frame and a label list.

    Args:
        data: Mapping whose values are dicts providing 'text' and 'label' entries.

    Returns:
        A tuple ``(X, y)``. ``X`` is a DataFrame with a single 'article' column
        holding each value's text; ``y`` is a list of the labels in the same
        (mapping iteration) order.
    """
    pairs = [(entry['text'], entry['label']) for entry in data.values()]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return pd.DataFrame(texts, columns=['article']), labels

def get_train_data():
    """Load the labelled training set from 'project-files/train.json'.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's default locale encoding.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If the file is missing relative to the working directory.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open('project-files/train.json', encoding='utf-8') as file:
        return json.load(file)

def get_test_data():
    """Load the unlabelled test set from 'project-files/test-unlabelled.json'.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's default locale encoding.

    Returns:
        A DataFrame with a single 'article' column containing the 'text' entry
        of every value in the JSON object, in file order.

    Raises:
        FileNotFoundError: If the file is missing relative to the working directory.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If an entry has no 'text' field.
    """
    with open('project-files/test-unlabelled.json', encoding='utf-8') as file:
        data = json.load(file)
    texts = [value['text'] for value in data.values()]
    return pd.DataFrame(texts, columns=['article'])

def get_dev_data():
    """Load the labelled development set from 'project-files/dev.json'.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's default locale encoding.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If the file is missing relative to the working directory.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open('project-files/dev.json', encoding='utf-8') as file:
        return json.load(file)

### Preprocessing datasets 

In [0]:
# Apostrophe-free contraction -> expansion table, built once instead of on
# every call. Lookups are exact and case-sensitive.
# NOTE(review): several keys ("were", "well", "hell", "shell", "shed", "wed",
# "its", "cause", "lets", ...) are also ordinary English words, so those words
# are rewritten too whenever they appear as a token - confirm this is intended.
_CONTRACTIONS = {
    "aint": "am not",
    "arent": "are not",
    "cant": "cannot",
    "cantve": "cannot have",
    "cause": "because",
    "couldve": "could have",
    "couldnt": "could not",
    "couldntve": "could not have",
    "didnt": "did not",
    "doesnt": "does not",
    "dont": "do not",
    "hadnt": "had not",
    "hadntve": "had not have",
    "hasnt": "has not",
    "havent": "have not",
    "hed": "he would",
    "hedve": "he would have",
    "hell": "he will",
    "hellve": "he will have",
    "hes": "he is",
    "howd": "how did",
    "howdy": "how do you",
    "howll": "how will",
    "hows": "how is",
    "Id": "I would",
    "Idve": "I would have",
    "Ill": "I will",
    "Illve": "I will have",
    "Im": "I am",
    "Ive": "I have",
    "isnt": "is not",
    "itd": "it would",
    "itdve": "it would have",
    "itll": "it will",
    "itllve": "it will have",
    "its": "it is",
    "lets": "let us",
    "maam": "madam",
    "maynt": "may not",
    "mightve": "might have",
    "mightnt": "might not",
    "mightntve": "might not have",
    "mustve": "must have",
    "mustnt": "must not",
    "mustntve": "must not have",
    "neednt": "need not",
    "needntve": "need not have",
    "oclock": "of the clock",
    "oughtnt": "ought not",
    "oughtntve": "ought not have",
    "shant": "shall not",
    "shantve": "shall not have",
    "shed": "she would",
    "shedve": "she would have",
    "shell": "she will",
    "shellve": "she will have",
    "shes": "she is",
    "shouldve": "should have",
    "shouldnt": "should not",
    "shouldntve": "should not have",
    "sove": "so have",
    "sos": "so is",
    "thatd": "that would",
    "thatdve": "that would have",
    "thats": "that is",
    "thered": "there would",
    "theredve": "there would have",
    "theres": "there is",
    "theyd": "they would",
    "theydve": "they would have",
    "theyll": "they will",
    "theyllve": "they will have",
    "theyre": "they are",
    "theyve": "they have",
    "tove": "to have",
    "wasnt": "was not",
    "wed": "we would",
    "wedve": "we would have",
    "well": "we will",
    "wellve": "we will have",
    "were": "we are",
    "weve": "we have",
    "werent": "were not",
    "whatll": "what will",
    "whatllve": "what will have",
    "whatre": "what are",
    "whats": "what is",
    "whatve": "what have",
    "whens": "when is",
    "whenve": "when have",
    "whered": "where did",
    "wheres": "where is",
    "whereve": "where have",
    "wholl": "who will",
    "whollve": "who will have",
    "whos": "who is",
    "whove": "who have",
    "whys": "why is",
    "whyve": "why have",
    "willve": "will have",
    "wont": "will not",
    "wontve": "will not have",
    "wouldve": "would have",
    "wouldnt": "would not",
    "wouldntve": "would not have",
    "yall": "you all",
    "yalld": "you all would",
    "yalldve": "you all would have",
    "yallre": "you all are",
    "yallve": "you all have",
    "youd": "you would",
    "youdve": "you would have",
    "youll": "you will",
    "youllve": "you will have",
    "youre": "you are",
    "youve": "you have",
}


def convert_contractions(word):
    """Expand a single apostrophe-free contraction token.

    Args:
        word: A token with apostrophes already removed (e.g. "dont").

    Returns:
        The expansion from the contraction table when ``word`` matches a key
        exactly (case-sensitive), otherwise ``word`` unchanged.
    """
    return _CONTRACTIONS.get(word, word)

def clean_contraction(sentence):
    """Strip apostrophes from ``sentence`` and expand known contractions.

    The text is tokenized with ``word_tokenize``, each token is passed through
    ``convert_contractions``, and the tokens are re-joined with single spaces,
    so the original whitespace layout is not preserved.

    Args:
        sentence: The text to normalise.

    Returns:
        The space-joined sequence of (possibly expanded) tokens.
    """
    # Drop every ASCII apostrophe and every left single quotation mark so the
    # tokens line up with the apostrophe-free keys used by convert_contractions.
    # (The earlier extra replaces of ' \' ' and ' ‘' could never match once
    # these two characters were gone, so they have been removed.)
    # NOTE(review): the right single quotation mark ’ (U+2019) is not
    # stripped, so text like "don’t" is left as-is - confirm whether that
    # character was the intended target.
    sentence = sentence.replace('\'', '').replace('‘', '')
    return " ".join(convert_contractions(token) for token in word_tokenize(sentence))

def get_positive_label_articles():
    """Return the training articles together with their labels.

    Returns:
        ``(articles, labels)`` where ``articles`` is the 'article' column of
        the frame built by ``import_data`` from the training JSON and
        ``labels`` is the accompanying label list, taken from the file as-is.
    """
    frame, labels = import_data(get_train_data())
    return frame['article'], labels

def get_negative_label_articles():
    """Load the two extra article collections and label all of them 0.

    Each JSON file is read into a frame and shuffled on its own with a fixed
    seed (42); the shuffled frames are then concatenated, first collection
    first. The index is not reset here.

    Returns:
        ``(articles, y)`` where ``articles`` is the 'article' column of the
        combined frame and ``y`` is a NumPy array of zeros with one entry per
        article.
    """
    source_paths = (
        'articles/articles_preprocessed_cc.json',
        'articles/articles_preprocessed_not_cc.json',
    )
    loaded = [pd.read_json(path) for path in source_paths]
    shuffled = [shuffle(frame, random_state = 42) for frame in loaded]
    combined = pd.concat(shuffled)
    zero_labels = np.zeros(len(combined))
    return combined['article'], zero_labels

def preprocess_data():
    """Assemble and clean the full training corpus.

    Concatenates the articles from ``get_positive_label_articles`` and
    ``get_negative_label_articles``, runs ``clean_contraction`` on every
    article, then removes URL-like tokens.

    Returns:
        ``(X, y)`` where ``X`` is a Series of cleaned article texts (index not
        reset) and ``y`` is a flat NumPy array of the labels in the same order.
    """
    X_pos, y_pos = get_positive_label_articles()
    X_neg, y_neg = get_negative_label_articles()
    X = pd.concat([X_pos, X_neg])
    X = X.apply(clean_contraction)
    # Raw string avoids the invalid '\S' escape; the dot after "www" is escaped
    # so it only matches a literal '.'. regex=True is explicit because pandas
    # 2.0 changed the default to literal matching, which would silently stop
    # removing URLs.
    X = X.str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
    y = np.concatenate((y_pos, y_neg), axis=None)
    return X, y

def preprocess_dev_data():
    """Load the development set and strip URL-like tokens from its articles.

    Returns:
        ``(X_dev, y_dev)`` where ``X_dev`` is a Series of article texts with
        URLs removed and ``y_dev`` is the list of labels from the dev file.
    """
    X_dev, y_dev = import_data(get_dev_data())
    X_dev = X_dev['article']
    # NOTE(review): unlike preprocess_data, clean_contraction is not applied
    # here, so training and dev text are normalised differently - confirm.
    # Raw string avoids the invalid '\S' escape; the dot after "www" is escaped
    # so it only matches a literal '.'. regex=True is explicit because pandas
    # 2.0 changed the default to literal matching, which would silently stop
    # removing URLs.
    X_dev = X_dev.str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
    return X_dev, y_dev

def preprocess_codalab_test_data():
    """Return the 'article' column of the unlabelled test set.

    No cleaning is applied in this function; the texts are returned exactly as
    ``get_test_data`` provides them.
    """
    return get_test_data()['article']
    
def create_submission_json(name, predictions):
    """Write predictions to ``name + '-test-output.json'``.

    The file holds one JSON object that maps 'test-<i>' (``i`` counting from 0
    in iteration order) to ``{'label': int(prediction)}``.

    Args:
        name: Prefix of the output file; '-test-output.json' is appended as-is.
        predictions: Iterable of values accepted by ``int()``.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    submission = {
        'test-' + str(index): {'label': int(prediction)}
        for index, prediction in enumerate(predictions)
    }
    filename = name + '-test-output.json'
    with open(filename, 'w') as f:
        json.dump(submission, f)
    print('File created sucessfully!')


### Making predictions

In [4]:
def make_prediction():
    """Fine-tune the RoBERTa classifier, score it on the dev set and predict the test set.

    Steps, in order:
      1. Build the training frame (article text + integer label) from
         ``preprocess_data``.
      2. Build the dev frame from ``preprocess_dev_data`` and load the test
         articles with ``preprocess_codalab_test_data``.
      3. Print a preview of each split.
      4. Train a simpletransformers ``ClassificationModel`` and reload it from
         'outputs/'.
      5. Print dev-set metrics, predict the test set and hand the predictions
         to ``create_submission_json``.

    Takes no arguments and returns None; results are printed, and the
    submission file is written by ``create_submission_json``.
    """
    X, y = preprocess_data()
    y = y.astype(int)
    df_X = pd.DataFrame(X)
    # The concatenated article Series carries duplicate index values; reset it
    # so the column-wise concat with the labels aligns row by row.
    df_X.reset_index(drop=True, inplace=True)
    df_y = pd.DataFrame(y)
    X_final = pd.concat([df_X , df_y], axis = 1)

    X_dev, y_dev = preprocess_dev_data()
    df_X_dev = pd.DataFrame(X_dev)
    df_y_dev = pd.DataFrame(y_dev)
    X_dev_final = pd.concat([df_X_dev , df_y_dev], axis = 1)

    X_codalab = preprocess_codalab_test_data()
    print('*'*100)
    print('Training Set')
    print('*'*100)
    print(X_final.head())
    print('*'*100)
    print('Development Set')
    print('*'*100)
    print(X_dev.head())
    print('*'*100)
    print('Test Set')
    print('*'*100)
    print(X_codalab.head())
    print('*'*100)
    print('Pre-trained model started ...')
    print('*'*100)
    # True if there is a GPU available to use, otherwise False
    cuda =  torch.cuda.is_available()

    # Hyperparameter tuning
    learning_rate = 1e-05
    adam_epsilon =  1e-04
    weight_decay = 0.1

    max_seq_length = 256
    train_batch_size = 8
    eval_batch_size = 8
    num_train_epochs = 30

    # Simple Transformers settings
    train_args={
        'max_seq_length': max_seq_length,
        'overwrite_output_dir': True,
        'num_train_epochs': num_train_epochs,
        'train_batch_size': train_batch_size,
        # Previously defined but never passed on; now wired in explicitly.
        'eval_batch_size': eval_batch_size,

        'sliding_window': True,
        'stride': 0.8,

        'fp16': False,
        'weight_decay': weight_decay,
        'learning_rate': learning_rate,
        'adam_epsilon' : adam_epsilon,

        'no_cache': True,
        "save_eval_checkpoints": False,

        # NOTE(review): the early-stopping options below are set, but this
        # function never enables 'evaluate_during_training' nor passes an
        # evaluation set to train_model; presumably early stopping is inactive
        # (the recorded run shows all 30 epochs) - confirm against the
        # simpletransformers documentation.
        "best_model_dir": "outputs/",
        'use_early_stopping': True,
        'early_stopping_delta': 0,
        "early_stopping_patience": 3,
        "early_stopping_metric": "eval_loss",
        "early_stopping_metric_minimize": True,

        "save_steps": 35000,
        "save_model_every_epoch": False,

        "manual_seed": 77,
    }

    model = ClassificationModel('roberta', 'roberta-large-openai-detector', weight=[0.5, 0.5],  args=train_args, use_cuda = cuda)

    model.train_model(X_final)

    # Reload the saved model with the same device choice as for training, so a
    # CPU-only host does not fail on the library's default use_cuda=True.
    model = ClassificationModel('roberta', 'outputs/', args=train_args, use_cuda = cuda)

    result, model_outputs, wrong_predictions = model.eval_model(X_dev_final, acc= accuracy_score)
    print(result)

    dev_predictions, dev_raw_outputs = model.predict(X_dev)
    print('*'*100)
    print('Weight decay:',weight_decay)
    print('Adam:',adam_epsilon)
    print('Learning rate:',learning_rate)
    print('Accuracy:', accuracy_score(y_dev, dev_predictions))
    print('F1 Score:',f1_score(y_dev, dev_predictions))
    print('Recall:', recall_score(y_dev, dev_predictions))
    print('Precision:',precision_score(y_dev, dev_predictions))

    predictions, raw_outputs = model.predict(X_codalab)
    print(predictions)
    create_submission_json('roberta-version-', predictions)

# Run the whole pipeline: train, evaluate on the dev set, and write the test submission.
make_prediction()

****************************************************************************************************
Training Set
****************************************************************************************************
                                             article  0
0  why houston flooding is not a sign of climate ...  1
1  The U.N. Intergovernmental Panel on Climate Ch...  1
2  Bureau Now Sets Strict Limits on Cooling OVER ...  1
3  The Dirty Extractive Underbelly of Clean Energ...  1
4  why climate change seems to have faded from th...  1
****************************************************************************************************
Development Set
****************************************************************************************************
0    Are Climate Models Overpredicting Global Warmi...
1    The latest National Climate Assessment, releas...
2    Climate Strike Kids Cool on Real Action\nA pop...
3    Morrison a ‘predatory’ centrist on climate pol...
4    CNN’s 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=519.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425743305.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, max=3504.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=30.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.001042


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.131361


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000090


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000026


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000027


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000016


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000007


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000005


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000007


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000003


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000004


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.671015


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000002


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000002


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000002


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000002


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000002


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000002


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000002


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1976.0, style=ProgressStyle(descr…

Running loss: 0.000001



HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=68.0), HTML(value='')))


{'mcc': 0.7814078022083395, 'tp': 43, 'tn': 46, 'fp': 4, 'fn': 7, 'acc': 0.89, 'eval_loss': 1.4019753230626093}


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=68.0), HTML(value='')))


****************************************************************************************************
Weight decay: 0.1
Adam: 0.0001
Learning rate: 1e-05
Accuracy: 0.89
F1 Score: 0.8865979381443299
Recall: 0.86
Precision: 0.9148936170212766


HBox(children=(FloatProgress(value=0.0, max=1410.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=862.0), HTML(value='')))


[1 0 0 ... 0 1 0]
File created sucessfully!
