In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

eng_stopwords = set(stopwords.words("english"))
pd.options.mode.chained_assignment = None

In [5]:
train_df = pd.read_json("data/train.json")
test_df = pd.read_json("data/test.json")
print("Number of rows in train dataset : ",train_df.shape[0])
print("Number of rows in test dataset : ",test_df.shape[0])


Number of rows in train dataset :  12129
Number of rows in test dataset :  6531


In [6]:
train_df.head()

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,Detecting linguistic idiosyncratic interests i...,3188285,Masoud Rouhizadeh,Children with autism spectrum disorder often e...,2014,CLPsych@ACL
1,c682727ee058aadbe9dbf838dcb036322818f588,Bigrams and BiLSTMs Two Neural Networks for Se...,2782720,Yuri Bizzoni,We present and compare two alternative deep ne...,2018,Fig-Lang@NAACL-HLT
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,In Factuality: Efficient Integration of Releva...,144748442,Peter Vickers,Visual Question Answering (VQA) methods aim at...,2021,ACL
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,Variational Graph Autoencoding as Cheap Superv...,46331602,Irene Li,Coreference resolution over semantic graphs li...,2022,ACL
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,LIMIT-BERT : Linguistics Informed Multi-Task BERT,30887404,Junru Zhou,"In this paper, we present Linguistics Informed...",2019,FINDINGS


In [14]:
# Check for missing values
train_df.isna().any().any()

False

In [33]:
train_df.dtypes

paperId       object
title         object
authorId       int64
authorName    object
abstract      object
year           int64
venue         object
dtype: object

In [31]:
# check author stats
train_df["authorId"].value_counts()

1750769       13
1747849       13
51042088      12
2854981       12
3422953       11
              ..
40192974       1
2013172        1
2106294609     1
5677323        1
144928136      1
Name: authorId, Length: 5625, dtype: int64

5625 unique authors in the dataset, thats a lot

In [15]:
train_df["venue"].value_counts()

ACL                                                                          2860
EMNLP                                                                        2247
NAACL                                                                        1023
*SEMEVAL                                                                      470
FINDINGS                                                                      399
                                                                             ... 
DLG4NLP                                                                         1
SCAI                                                                            1
ACL 2020                                                                        1
bioRxiv                                                                         1
Proceedings of the Workshop on Generalization in the Age of Deep Learning       1
Name: venue, Length: 362, dtype: int64

## Check for correlations

In [43]:
from dython.nominal import associations

df = train_df.apply(
    lambda x: x.astype("object") if x.dtype == "int64" else x)
df = df[[i for i in df.columns if i in ('authorId','year','venue')]]

df.head()

Unnamed: 0,authorId,year,venue
0,3188285,2014,CLPsych@ACL
1,2782720,2018,Fig-Lang@NAACL-HLT
2,144748442,2021,ACL
3,46331602,2022,ACL
4,30887404,2019,FINDINGS


In [44]:
from sklearn import preprocessing

label = preprocessing.LabelEncoder()
data_encoded = pd.DataFrame() 

for i in df.columns :
  data_encoded[i]=label.fit_transform(df[i])

data_encoded.head()  

Unnamed: 0,authorId,year,venue
0,1571,35,58
1,1250,39,122
2,4134,42,5
3,3038,43,5
4,2444,40,119


In [48]:
from scipy.stats import chi2_contingency

def cramers_V(var1,var2) :
  crosstab =np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building
  stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test
  obs = np.sum(crosstab) # Number of observations
  mini = min(crosstab.shape)-1 # Take the minimum value between the columns and the rows of the cross table
  return (stat/(obs*mini))

rows= []

for var1 in data_encoded:
  col = []
  for var2 in data_encoded :
    cramers =cramers_V(data_encoded[var1], data_encoded[var2]) # Cramer's V test
    col.append(round(cramers,2)) # Keeping of the rounded value of the Cramer's V  
  rows.append(col)
  
cramers_results = np.array(rows)
df2 = pd.DataFrame(cramers_results, columns = data_encoded.columns, index =data_encoded.columns)

df2


Unnamed: 0,authorId,year,venue
authorId,1.0,0.48,0.51
year,0.48,1.0,0.1
venue,0.51,0.1,1.0


Aight looks good