In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LinearRegression, ARDRegression, RANSACRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [2]:
df = pd.read_csv("cvpr_data_with_topics.csv")
df.head(1)

Unnamed: 0,title,authors,abstract,link,year,citation_count,doi,citation,title_open_cite,topic_class
0,Deformable Spatial Pyramid Matching for Fast D...,"Jaechul Kim, Ce Liu, Fei Sha, Kristen Grauman",We introduce a fast deformable spatial pyramid...,content_cvpr_2013/papers/Kim_Deformable_Spatia...,2013,120,10.1109/cvpr.2013.299,10.1007/978-3-030-01249-6_36; 10.2493/jjspe.84...,deformable spatial pyramid matching for fast d...,0


From temporal dynamics, we know the papers plateau after 1-3 years and the citation statistics below paint a similar picture

In [3]:
citation_stats = df[df["citation_count"] != -1].groupby("year")["citation_count"].describe()
citation_stats

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013,471.0,61.983015,113.848181,0.0,13.0,32.0,66.5,1531.0
2014,540.0,68.862963,297.099728,0.0,10.0,23.0,49.25,6032.0
2015,599.0,84.088481,475.864112,0.0,6.0,20.0,54.0,9377.0
2016,643.0,110.917574,920.885537,0.0,8.0,23.0,65.5,22369.0
2017,782.0,68.805627,258.558078,0.0,8.0,20.0,51.0,4870.0
2018,978.0,39.525562,105.193772,0.0,6.0,17.0,42.0,1982.0
2019,1294.0,19.833849,34.834792,0.0,5.0,10.0,22.0,639.0
2020,1320.0,12.496212,168.562193,0.0,2.0,4.0,8.0,6100.0
2021,1660.0,0.596988,1.690199,0.0,0.0,0.0,1.0,28.0


Drop null citations and normalize by the 50th percentile for each year

In [4]:
df = df[df["citation_count"] != -1] # drop null citations

In [5]:
year = 2013
ind = year - 2013
citation_stats.iloc[ind,:]["50%"]

32.0

In [6]:
df["citation_count"] = df.apply(lambda x: x["citation_count"] - citation_stats.iloc[x["year"]-2013,:]["50%"], axis=1)
citation_stats_norm = df.groupby("year")["citation_count"].describe()
citation_stats_norm

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013,471.0,29.983015,113.848181,-32.0,-19.0,0.0,34.5,1499.0
2014,540.0,45.862963,297.099728,-23.0,-13.0,0.0,26.25,6009.0
2015,599.0,64.088481,475.864112,-20.0,-14.0,0.0,34.0,9357.0
2016,643.0,87.917574,920.885537,-23.0,-15.0,0.0,42.5,22346.0
2017,782.0,48.805627,258.558078,-20.0,-12.0,0.0,31.0,4850.0
2018,978.0,22.525562,105.193772,-17.0,-11.0,0.0,25.0,1965.0
2019,1294.0,9.833849,34.834792,-10.0,-5.0,0.0,12.0,629.0
2020,1320.0,8.496212,168.562193,-4.0,-2.0,0.0,4.0,6096.0
2021,1660.0,0.596988,1.690199,0.0,0.0,0.0,1.0,28.0


50th percentile are very similar for [2013, 2017]. After that, papers likely did not have enough time to accrue citations. Discard the years 2020 and 2021.

In [7]:
df = df[~df["year"].isin([2020,2021])][["authors", "year", "citation_count", "topic_class"]] # drop 2020,2021 and only keep some of the columns
df.head(1)

Unnamed: 0,authors,year,citation_count,topic_class
0,"Jaechul Kim, Ce Liu, Fei Sha, Kristen Grauman",2013,88.0,0


Feature vector would be too large if we kept all the authors. Let's just keep the PI who is usually the last author:

In [8]:
df["authors"] = df["authors"].apply(lambda x: (x.split(","))[-1].strip().lower())
df.head(1)

Unnamed: 0,authors,year,citation_count,topic_class
0,kristen grauman,2013,88.0,0


Including all last authors would still give a very high dimensional vector, so we filter by total number of papers

In [9]:
upi = len(df["authors"].unique())
print(f"Total number of unique PIs {upi}")
cpi = df["authors"].value_counts(ascending=False)
pcount = 10
upi = len(cpi[cpi>pcount])
print(f"Number of PIs with more than {pcount} papers: {upi}") # PIs more than 10 would be very successful

Total number of unique PIs 1930
Number of PIs with more than 10 papers: 89


Convert authors with low citation counts to "other"

In [10]:
PI_filtered = cpi[cpi>10]
df["authors"] = df["authors"].apply(lambda x: x if x in PI_filtered else "other")

df.head(3)

Unnamed: 0,authors,year,citation_count,topic_class
0,kristen grauman,2013,88.0,0
1,other,2013,-6.0,0
2,other,2013,-13.0,0


In [11]:
len(df["authors"].unique())

90

One hot encoding

In [12]:
PI_onehot = pd.get_dummies(df.authors)
year_onehot = pd.get_dummies(df.year)
topics_onehot = pd.get_dummies(df.topic_class)

Train/Test Data and Split

In [13]:
X = pd.concat([PI_onehot, year_onehot, topics_onehot], axis="columns")
y = df.citation_count
print(X.shape)
print(y.shape)

(5307, 137)
(5307,)


Train/Test Split

In [14]:
xtrain, xtest, ytrain, ytest = train_test_split(X,y,test_size=0.2,random_state=12345)

Linear regression (which results in terrible predictions)

In [15]:
# ignore future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [16]:
reg = LinearRegression()
reg.fit(xtrain, ytrain)
reg.score(xtrain, ytrain)

0.06207443213140584

In [17]:
pred = reg.predict(xtest)
err = mean_squared_error(ytest,pred)
np.sqrt(err)

221.20077734561667

Other Sklearn models

In [20]:
models = {"linear_regression":LinearRegression(),
          "bayesian_ard": ARDRegression(),
          "ransac": RANSACRegressor(),
          "decision_tree":DecisionTreeRegressor(),
          "xgboost":XGBRegressor(booster="gbtree")}

skf = StratifiedKFold(n_splits=3)
for train_index, test_index in skf.split(X, y):
    for name, model in models.items():
        model.fit(X.iloc[train_index,:], y.iloc[train_index])
        pred = model.predict(X.iloc[test_index,:])
        err = mean_squared_error(y.iloc[test_index], pred)
        print(f"Model: {name} \t Sqrt(MSE): {np.sqrt(err)}")



Model: linear_regression 	 Sqrt(MSE): 612.1144924946556
Model: bayesian_ard 	 Sqrt(MSE): 612.2152643273787
Model: ransac 	 Sqrt(MSE): 2059050624979.5054
Model: decision_tree 	 Sqrt(MSE): 617.9439950155771
Model: xgboost 	 Sqrt(MSE): 614.8488000327764
Model: linear_regression 	 Sqrt(MSE): 236.98344468036595
Model: bayesian_ard 	 Sqrt(MSE): 233.50935622339478
Model: ransac 	 Sqrt(MSE): 114295463739523.1
Model: decision_tree 	 Sqrt(MSE): 259.7727106884873
Model: xgboost 	 Sqrt(MSE): 252.45807922942663
Model: linear_regression 	 Sqrt(MSE): 218.58450557608037
Model: bayesian_ard 	 Sqrt(MSE): 219.5582239335589
Model: ransac 	 Sqrt(MSE): 6212210148391.127
Model: decision_tree 	 Sqrt(MSE): 203.0014440829056
Model: xgboost 	 Sqrt(MSE): 182.25038587335305
