In [None]:
import pandas as pd
# Read the training and test tables from their .tsv files.
train_path = "/content/train.tsv"
test_path = "/content/test.tsv"
df_train = pd.read_table(train_path)
df_test = pd.read_table(test_path)

In [None]:
# Preview the first rows of the training table.
df_train.head()

In [None]:
# List the training table's column names.
df_train.columns

In [None]:
# Discard the URL and metadata feature columns listed below; later cells only
# read the 'boilerplate' and 'label' columns of df_train.
# Assigning the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: columns that are already gone are skipped
# rather than raising KeyError.
df_train = df_train.drop(
    columns=[
        'url', 'urlid', 'alchemy_category',
        'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
        'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
        'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
        'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
        'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
        'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
        'parametrizedLinkRatio', 'spelling_errors_ratio',
    ],
    errors="ignore",
)

In [None]:
import json
def boilerplate_pre(data):
    '''
    Extract the text to classify from one JSON-encoded boilerplate record.

    The article body is preferred and is cut down to its last 400
    space-separated words. If there is no usable body (the "body" key is
    missing, or its value is null or not a string) then the title is used
    instead.

    Parameters
    ----------
    data : str
        JSON document; only its "body" and "title" keys are read.

    Returns
    -------
    str or None
        The truncated body when present, otherwise the value stored under
        "title". None when neither is available.

    Raises
    ------
    json.JSONDecodeError
        If `data` is a string that is not valid JSON.
    TypeError
        If `data` is not something json.loads accepts (e.g. a float NaN).
    '''
    jsonData = json.loads(data)

    body = jsonData.get("body")
    if isinstance(body, str):
        # Keep only the trailing 400 words of the body.
        return ' '.join(body.split(' ')[-400:])

    # Explicit fallback instead of a bare `except:`: only a missing or
    # non-text body selects the title, and a missing title yields None
    # rather than raising from inside an exception handler.
    return jsonData.get("title")

In [None]:
# Swap each raw JSON record in the training column for the text that
# boilerplate_pre extracts from it.
df_train['boilerplate'] = df_train['boilerplate'].map(boilerplate_pre)

In [None]:
# Preview the training table again, now that 'boilerplate' holds extracted text.
df_train.head()

In [None]:
# True if any training row ended up without text after extraction.
df_train['boilerplate'].isnull().any()

In [None]:
# (rows, columns) of the training table before rows with missing values are dropped.
df_train.shape

In [None]:
# Remove, in place, every training row that contains a missing value in any column.
df_train.dropna(axis=0, how="any", inplace=True)

In [None]:
# Re-check: no training text should be missing after the dropna above.
df_train['boilerplate'].isnull().any()

In [None]:
# Check the test table's boilerplate column for missing values.
# NOTE: this column still holds the records as loaded; nothing has been
# applied to df_test up to this point.
df_test['boilerplate'].isnull().any()

In [None]:
# Model inputs as plain Python lists.
train_texts = list(df_train['boilerplate'])
train_labels = list(df_train['label'])

# The test column still holds raw JSON records, so run it through the same
# boilerplate_pre extraction that was applied to the training text; otherwise
# the model is fine-tuned on extracted article text but asked to predict on
# JSON strings. Test rows are not dropped (the predictions are later assigned
# to the submission frame row by row), so a record that yields no text becomes
# an empty string instead.
test_text = [
    text if isinstance(text, str) else ""
    for text in df_test["boilerplate"].map(boilerplate_pre)
]

In [None]:
# Show only the first few labels; displaying the whole list prints one entry
# per training row.
train_labels[:10]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled data for validation.
# random_state pins the shuffle so the split (and every number computed from
# it) is the same on each fresh run; stratify keeps the label proportions the
# same in the training and validation parts.
# NOTE: the inputs are overwritten by the outputs, so re-running this cell
# without re-running the previous ones splits the already-reduced training
# portion again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=.2,
    random_state=42,
    stratify=train_labels,
)

In [None]:
# Install the transformers package into the runtime (no version is pinned,
# so whatever release pip resolves at run time is used).
!pip install transformers

In [None]:
from transformers import DistilBertTokenizerFast
# Load the fast DistilBERT tokenizer for the uncased base checkpoint.
TOKENIZER_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
# Tokenize the three text lists with identical settings (truncation and
# padding both enabled) so every split is encoded the same way.
tokenize_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)
test_encodings = tokenizer(test_text, **tokenize_options)

In [None]:
# Preview the first 20 values of each encoding field for the first training
# example; displaying the whole object prints every token id of every row.
{key: values[0][:20] for key, values in train_encodings.items()}

In [None]:
import torch
class Dataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with labels as an indexable torch dataset.

    `encodings` must offer `.items()` yielding (field name, indexable
    sequence) pairs, and `labels` must be an indexable sequence. Item `idx`
    is a dict holding a tensor for every encoding field plus the label
    tensor under the key 'labels'.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset has one item per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the training and validation encodings with their labels.
train_dataset = Dataset(encodings=train_encodings, labels=train_labels)
val_dataset = Dataset(encodings=val_encodings, labels=val_labels)

  



In [None]:
# Inspect one encoded training example (index 2): a dict of tensors
# including the 'labels' entry added by Dataset.__getitem__.
train_dataset[2]

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
# Load the pretrained uncased DistilBERT base weights into a
# sequence-classification model.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased"
)

In [None]:
# Use the GPU when CUDA is available to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Move the model to the selected device.
model.to(device)

In [None]:


# Fine-tuning configuration: where results and logs go, how long to train,
# and the batch size / optimizer schedule settings.
training_args = TrainingArguments(
    output_dir='./results',           # directory for training output
    logging_dir='./logs',             # directory for log files
    logging_steps=100,                # log every 100 steps
    num_train_epochs=3,               # passes over the training set
    per_device_train_batch_size=5,    # training batch size on each device
    per_device_eval_batch_size=5,     # evaluation batch size on each device
    warmup_steps=500,                 # learning-rate warmup steps
    weight_decay=0.01,                # weight decay strength
)

# The Trainer ties the model and arguments to the training and validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning.
trainer.train()

In [None]:
# Evaluate on the dataset passed to the Trainer as eval_dataset (val_dataset).
trainer.evaluate()

In [None]:
# Turn the tokenized test ids into one tensor; only 'input_ids' is taken
# from test_encodings here.
test_input_ids = torch.tensor(test_encodings['input_ids'])

In [None]:
import numpy as np
# Predict a class index for every test example.
INFERENCE_BATCH_SIZE = 32

# The test sequences were tokenized with padding enabled, so the matching
# attention mask has to be passed to the model. Without it the padding tokens
# are treated as real text, which differs from training, where the Dataset
# feeds every encoding field to the model.
test_attention_mask = torch.tensor(test_encodings['attention_mask'])

# Switch to evaluation mode explicitly so that inference does not depend on
# an earlier cell having done so.
model.eval()

predict__ = []
with torch.no_grad():
    # Process the test set in batches instead of one forward pass per row.
    for start in range(0, len(test_input_ids), INFERENCE_BATCH_SIZE):
        stop = start + INFERENCE_BATCH_SIZE
        model_output = model(
            input_ids=test_input_ids[start:stop].to(device),
            attention_mask=test_attention_mask[start:stop].to(device),
        )
        # The first element of the model output holds the per-class scores.
        scores = model_output[0].detach().cpu().numpy()
        # One array of predicted class indices per batch; the next cell
        # flattens these arrays into a single list.
        predict__.append(np.argmax(scores, axis=1))

In [None]:
# Show only the first few prediction arrays instead of the whole list.
predict__[:10]

In [None]:
# Flatten the list of prediction arrays into one flat list of labels,
# preserving order.
outputs = [label for batch_preds in predict__ for label in batch_preds]

In [None]:
# Load the sample submission file to reuse as the submission template.
sample_sub = pd.read_csv("/content/sampleSubmission (1).csv")

In [None]:
# Remove the existing label column so it can be replaced by the model's
# predictions. Reassigning with errors="ignore" keeps the cell re-runnable:
# the in-place drop raised KeyError when the column was already gone.
sample_sub = sample_sub.drop(columns="label", errors="ignore")

In [None]:
# Attach the predicted labels as the new 'label' column; `outputs` needs one
# entry per row of sample_sub.
sample_sub["label"] = outputs

In [None]:
# Write the submission CSV without the DataFrame index column.
sample_sub.to_csv("/content/submission4_evergreen.csv", index=False)