# Future Paper Success Prediction as Sentiment Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch 

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()
device

'cuda:0'

Read data

In [2]:
df = pd.read_csv("cvpr_data_with_topics.csv")
df.head(1)

Unnamed: 0,title,authors,abstract,link,year,citation_count,doi,citation,title_open_cite,topic_class
0,Deformable Spatial Pyramid Matching for Fast D...,"Jaechul Kim, Ce Liu, Fei Sha, Kristen Grauman",We introduce a fast deformable spatial pyramid...,content_cvpr_2013/papers/Kim_Deformable_Spatia...,2013,120,10.1109/cvpr.2013.299,10.1007/978-3-030-01249-6_36; 10.2493/jjspe.84...,deformable spatial pyramid matching for fast d...,0


Drop null citations and the years 2022, 2021

In [3]:
df = df[df["citation_count"] != -1] # drop null citations
df = df[~df["year"].isin([2020,2021])][["year", "abstract", "citation_count"]] # drop 2020,2021 and only keep abstract + citaiton count


Citation statistics

In [4]:
citation_stats = df.groupby("year")["citation_count"].describe()
citation_stats

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013,471.0,61.983015,113.848181,0.0,13.0,32.0,66.5,1531.0
2014,540.0,68.862963,297.099728,0.0,10.0,23.0,49.25,6032.0
2015,599.0,84.088481,475.864112,0.0,6.0,20.0,54.0,9377.0
2016,643.0,110.917574,920.885537,0.0,8.0,23.0,65.5,22369.0
2017,782.0,68.805627,258.558078,0.0,8.0,20.0,51.0,4870.0
2018,978.0,39.525562,105.193772,0.0,6.0,17.0,42.0,1982.0
2019,1294.0,19.833849,34.834792,0.0,5.0,10.0,22.0,639.0


Normalize data across years by subtracting 50% and dividing 75%-25%

In [5]:
def norm_data(x):
    out = x["citation_count"] - citation_stats.iloc[x["year"]-2013,:]["50%"]
    width = citation_stats.iloc[x["year"]-2013,:]["75%"] - citation_stats.iloc[x["year"]-2013,:]["25%"]
    return out/width

df["citation_count"] = df.apply(norm_data, axis=1)
citation_stats_norm = df.groupby("year")["citation_count"].describe()
citation_stats_norm

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013,471.0,0.56043,2.128003,-0.598131,-0.35514,0.0,0.64486,28.018692
2014,540.0,1.168483,7.56942,-0.585987,-0.33121,0.0,0.66879,153.095541
2015,599.0,1.335177,9.913836,-0.416667,-0.291667,0.0,0.708333,194.9375
2016,643.0,1.529001,16.015401,-0.4,-0.26087,0.0,0.73913,388.626087
2017,782.0,1.135015,6.012979,-0.465116,-0.27907,0.0,0.72093,112.790698
2018,978.0,0.62571,2.922049,-0.472222,-0.305556,0.0,0.694444,54.583333
2019,1294.0,0.578462,2.049105,-0.588235,-0.294118,0.0,0.705882,37.0


Convert to range [0, 1] for sentiment analysis

In [6]:
def sigmoid(x, k=2.):
    return 1./(1. + np.exp(-k*x))

df["citation_count"] = df.apply(lambda x: sigmoid(x["citation_count"]), axis=1)
citation_stats_norm = df.groupby("year")["citation_count"].describe()
citation_stats_norm

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013,471.0,0.566162,0.256182,0.232141,0.329537,0.5,0.784083,1.0
2014,540.0,0.572268,0.254982,0.236498,0.340196,0.5,0.792068,1.0
2015,599.0,0.580983,0.245412,0.302941,0.358166,0.5,0.804815,1.0
2016,643.0,0.586692,0.242566,0.310026,0.372446,0.5,0.814295,1.0
2017,782.0,0.582581,0.249992,0.282878,0.363978,0.5,0.808743,1.0
2018,978.0,0.574139,0.242264,0.280003,0.351806,0.5,0.800415,1.0
2019,1294.0,0.578747,0.252662,0.235687,0.35704,0.5,0.804044,1.0


Drop the year which we don't need anymore

In [7]:
df = df.drop("year", axis=1)
df = df.rename(columns={"citation_count": "label"})
df.head(1)

Unnamed: 0,abstract,label
0,We introduce a fast deformable spatial pyramid...,0.964074


Convert pandas dataframe to huggingface dataset

In [8]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['abstract', 'label', '__index_level_0__'],
    num_rows: 5307
})

Load from pretrained language model

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_checkpoint = "distilbert-base-uncased-finetuned-cvpr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=1).to(device)

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-cvpr were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-cvpr and are newly initialized: ['pre_classifier.weight', 

In [11]:
def tokenize_function(examples):
    result = tokenizer(examples["abstract"], padding="max_length", max_length=256, truncation=True)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["abstract","__index_level_0__"]
)
tokenized_datasets



  0%|          | 0/6 [00:00<?, ?ba/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask', 'word_ids'],
    num_rows: 5307
})

Train/test split

In [12]:
train_size = 4307

downsampled_dataset = tokenized_datasets.train_test_split(
    train_size=train_size, seed=1234
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 4307
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 1000
    })
})

In [13]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Training set-up

In [14]:
from transformers import TrainingArguments

batch_size = 16
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir="distilbert-sentiment",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=5e-6,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
    num_train_epochs = 10
)


Compute metrics from: https://huggingface.co/course/chapter3/3?fw=pt

In [15]:
from datasets import load_metric
 
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [17]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4307
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2700


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0646,0.050962,0.984,0.0
2,0.0537,0.050526,0.984,0.0
3,0.049,0.04942,0.984,0.0
4,0.0456,0.057304,0.984,0.0
5,0.0415,0.049912,0.984,0.0
6,0.0373,0.048333,0.984,0.0
7,0.0339,0.050275,0.984,0.0
8,0.0313,0.053158,0.984,0.0
9,0.0295,0.053851,0.984,0.0
10,0.0293,0.052973,0.984,0.0


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to distilbert-sentiment\checkpoint-500
Configuration saved in distilbert-sentiment\checkpoint-500\config.json
Model weights saved in distilbert-sentiment\checkpoint-500\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `Dist

TrainOutput(global_step=2700, training_loss=0.041498704486423066, metrics={'train_runtime': 1538.0922, 'train_samples_per_second': 28.002, 'train_steps_per_second': 1.755, 'total_flos': 2852634556462080.0, 'train_loss': 0.041498704486423066, 'epoch': 10.0})

# F1 score does not improve with training. This could be possibly mitigated by stratified sampling however we don't have enough data