Basic Data Cleaning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LinearRegression, ARDRegression, RANSACRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

import datetime
from utils import convert_to_datetime, impute_and_sort_citation

In [2]:
df = pd.read_csv("cvpr_data.csv")
df.head(1)

Unnamed: 0,source_title,reference,author,source_id,volume,title,year,doi,issue,citation_count,oa_link,citation,page,citation_date,abstract,pdf_link
0,2013 Ieee Conference On Computer Vision And Pa...,10.1007/s11263-006-0031-y; 10.1038/scientifica...,"Jia, Zhaoyin; Gallagher, Andrew; Saxena, Ashut...",,,"3d-based reasoning with blocks, support, and s...",2013,10.1109/cvpr.2013.8,,57,http://www.cs.cornell.edu/~asaxena/papers/rgbd...,10.1145/3215525.3215531; 10.1007/s10846-015-03...,,"[[2018, 5, 29], [2016, 3, 2], [2015, 1, 29], [...",3D volumetric reasoning is important for truly...,../../content_cvpr_2013/papers/Jia_3D-Based_Re...


In [3]:
df.isnull().sum()

source_title         0
reference         5098
author               0
source_id         8092
volume            8092
title                0
year                 0
doi                  0
issue             8092
citation_count       0
oa_link           2467
citation          1386
page              8092
citation_date        0
abstract             0
pdf_link             0
dtype: int64

For basic data analysis, only keep year, author list and citation count

In [4]:
# convert citation dates from string to datetime objects
df["citation_date"] = df["citation_date"].apply(convert_to_datetime)

# randomly impute missing citation dates and sort them
df["citation_date"] = df["citation_date"].apply(impute_and_sort_citation)

In [5]:
df = df.drop(df.columns.difference(['year','author','citation_date']),axis='columns')
df.head(1)

Unnamed: 0,author,year,citation_date
0,"Jia, Zhaoyin; Gallagher, Andrew; Saxena, Ashut...",2013,"[2013-10-10, 2014-01-06, 2014-03-07, 2014-08-0..."


In [6]:
df.isnull().sum()

author           0
year             0
citation_date    0
dtype: int64

Only keep first and last authors

In [7]:
df["first_author"] = df["author"].apply(lambda x: (x.split(";"))[0].strip().lower())
df["last_author"] = df["author"].apply(lambda x: (x.split(";"))[-1].strip().lower())
df = df.drop("author", axis="columns")
df.head(1)

Unnamed: 0,year,citation_date,first_author,last_author
0,2013,"[2013-10-10, 2014-01-06, 2014-03-07, 2014-08-0...","jia, zhaoyin","chen, tsuhan"


In [8]:
first_author = df['first_author'].value_counts(ascending=False)
last_author = df['last_author'].value_counts(ascending=False)
print(first_author)
print(last_author)

wu, yue                     8
wang, xiaolong              8
li, chen                    8
wang, peng                  8
feichtenhofer, christoph    7
                           ..
xu, jing                    1
wu, wenyan                  1
yang, yaoqing               1
marinoiu, elisabeta         1
wolf, valentin              1
Name: first_author, Length: 5715, dtype: int64
yang, ming-hsuan        58
tian, qi                47
van gool, luc           43
sun, jian               41
urtasun, raquel         41
                        ..
tuzel, oncel             1
blaschko, matthew b.     1
dong, ming               1
moeller, michael         1
buettner, florian        1
Name: last_author, Length: 2816, dtype: int64


In [9]:
print(len(first_author[first_author>3]))
print(len(last_author[last_author>10]))

174
142


Convert authors with low citation counts to "other"

In [10]:
first_author_filtered = first_author[first_author>3]
df["first_author"] = df["first_author"].apply(lambda x: x if x in first_author_filtered else "other")

last_author_filtered = last_author[last_author>3]
df["last_author"] = df["last_author"].apply(lambda x: x if x in last_author_filtered else "other")

df.head(3)

Unnamed: 0,year,citation_date,first_author,last_author
0,2013,"[2013-10-10, 2014-01-06, 2014-03-07, 2014-08-0...",other,"chen, tsuhan"
1,2013,"[2014-12-10, 2015-02-24, 2015-02-24]",other,"mori, greg"
2,2013,"[2014-04-18, 2014-09-30, 2014-10-14, 2014-10-3...",other,"chang, shih-fu"


Expand each row to multiple rows by using the citation date array

In [11]:
df_list = []
for i in range(len(df)):
    row = df.iloc[i,:]
    for j in range(1, len(row["citation_date"])):
        temp = {"year"                  :row["year"],
                "first_author"          :row["first_author"],
                "last_author"           :row["last_author"],
                "cur_citation_count"    :j,
                "time_to_next"          :row["citation_date"][j]-row["citation_date"][j-1]}
        df_list.append(temp)

In [12]:
print(len(df_list))
df_list = pd.DataFrame(df_list)
df_list.head()

308755


Unnamed: 0,year,first_author,last_author,cur_citation_count,time_to_next
0,2013,other,"chen, tsuhan",1,88 days
1,2013,other,"chen, tsuhan",2,60 days
2,2013,other,"chen, tsuhan",3,153 days
3,2013,other,"chen, tsuhan",4,6 days
4,2013,other,"chen, tsuhan",5,1 days


One hot encoding

In [13]:
first_author = pd.get_dummies(df_list.first_author)
last_author = pd.get_dummies(df_list.last_author)
year = pd.get_dummies(df_list.year)

Train/Test Data and Split

In [14]:
X = pd.concat([first_author, last_author, year, df_list["cur_citation_count"]], axis="columns")
y = df_list.time_to_next
print(X.head(1))
print(y.head(1))

   afifi, mahmoud  agudo, antonio  albl, cenek  barath, daniel  \
0               0               0            0               0   

   barron, jonathan t.  bernard, florian  bertasius, gedas  \
0                    0                 0                 0   

   bhattacharyya, apratim  bhunia, ayan kumar  bilen, hakan  ...  2013  2014  \
0                       0                   0             0  ...     1     0   

   2015  2016  2017  2018  2019  2020  2021  cur_citation_count  
0     0     0     0     0     0     0     0                   1  

[1 rows x 714 columns]
0   88 days
Name: time_to_next, dtype: timedelta64[ns]


In [15]:
X.isna().sum()

afifi, mahmoud         0
agudo, antonio         0
albl, cenek            0
barath, daniel         0
barron, jonathan t.    0
                      ..
2018                   0
2019                   0
2020                   0
2021                   0
cur_citation_count     0
Length: 714, dtype: int64

Train/Test Split

In [16]:
y = y.apply(lambda x: x.days) # use days as units not ns
y.head(3)

0     88
1     60
2    153
Name: time_to_next, dtype: int64

In [17]:
xtrain, xtest, ytrain, ytest = train_test_split(X,y,test_size=0.2,random_state=12345)
reg = LinearRegression()
reg.fit(xtrain, ytrain)
pred = reg.predict(xtest)
err = mean_squared_error(ytest,pred)



In [21]:
models = {"linear_regression":LinearRegression(),
          "bayesian_ard": ARDRegression(),
          "ransac": RANSACRegressor(),
          "decision_tree":DecisionTreeRegressor()}
results = []
skf = StratifiedKFold(n_splits=3)
for train_index, test_index in skf.split(X, y):
    for name, model in models.items():
        model.fit(X.iloc[train_index,:], y.iloc[train_index])
        pred = model.predict(X.iloc[test_index,:])
        err = mean_squared_error(y.iloc[test_index], pred)
        results.append({"model":name, "MSE": err})



In [22]:
results_df = pd.DataFrame(results)
print(results_df)

                model           MSE
0   linear_regression  2.197110e+05
1        bayesian_ard  2.198996e+05
2              ransac  2.052819e+04
3       decision_tree  1.242094e+05
4   linear_regression  1.151448e+11
5        bayesian_ard  5.587084e+03
6              ransac  1.735287e+12
7       decision_tree  2.013978e+04
8   linear_regression  6.113315e+04
9        bayesian_ard  5.778642e+04
10             ransac  2.840810e+13
11      decision_tree  9.335559e+03


In [30]:
reg = DecisionTreeRegressor()
reg.fit(xtrain, ytrain)



DecisionTreeRegressor()

Save Model and Author Names to File

In [31]:
# save model
import pickle
with open("model\\model.pickle", "wb") as f:
    pickle.dump(reg,f)

# save names
pd.DataFrame(first_author_filtered.index.to_list()).to_csv("model\\first_author.csv", index=False, header=False)
pd.DataFrame(last_author_filtered.index.to_list()).to_csv("model\\last_author.csv", index=False, header=False)