### Dataset overview

In [1]:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from ast import literal_eval
# literal_eval safely evaluates a string containing a Python literal or container display
# (e.g., a list or dictionary) and returns the corresponding Python object.


In [2]:
# Show the current working directory; the relative "dataset/..." path used below is resolved against it.
%pwd

'c:\\Users\\yashr\\Projects\\SmartScholar\\SmartScholar\\research_notebook'

In [4]:
# Load the arXiv CSV; the path is relative to the notebook's working directory.
ARXIV_CSV_PATH = "dataset/arxiv_data_210930-054931.csv"
arxiv_data = pd.read_csv(ARXIV_CSV_PATH)

In [5]:
# Preview the first five rows (terms, titles, abstracts).
arxiv_data.head(n=5)

Unnamed: 0,terms,titles,abstracts
0,['cs.LG'],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...
1,"['cs.LG', 'cs.AI']",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...
2,"['cs.LG', 'cs.CR', 'stat.ML']",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...
3,"['cs.LG', 'cs.CR']",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...
4,['cs.LG'],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...


In [15]:
# Peek at five random rows; the fixed seed keeps this preview reproducible across re-runs.
arxiv_data.sample(5, random_state=42)

Unnamed: 0,terms,titles,abstracts
8385,['cs.CV'],Learning for Visual Navigation by Imagining th...,Visual navigation is often cast as a reinforce...
9778,"['cs.LG', 'cs.SY', 'eess.SY', 'stat.ML']",Reinforcement Learning for Thermostatically Co...,The aim of the project is to investigate and a...
23803,['cs.CV'],FT-TDR: Frequency-guided Transformer and Top-D...,Blind face inpainting refers to the task of re...
37576,['cs.CV'],Weakly Supervised Lesion Co-segmentation on CT...,Lesion segmentation in medical imaging serves ...
25141,['cs.CV'],TransReID: Transformer-based Object Re-Identif...,Extracting robust feature representation is on...


In [7]:
#### Data Cleaning and Preprocessing

In [6]:
# (number of rows, number of columns) of the loaded dataset.
arxiv_data.shape

(56181, 3)

In [7]:
# Column names, non-null counts and dtypes, plus the frame's memory footprint.
arxiv_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56181 entries, 0 to 56180
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   terms      56181 non-null  object
 1   titles     56181 non-null  object
 2   abstracts  56181 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [8]:
# Summary per column: count, number of unique values, most frequent value and its frequency.
arxiv_data.describe()

Unnamed: 0,terms,titles,abstracts
count,56181,56181,56181
unique,3402,41105,41115
top,['cs.CV'],Deep Reinforcement Learning: An Overview,Mesh is a powerful data structure for 3D shape...
freq,18719,7,7


In [9]:
# Missing-value count per column.
arxiv_data.isna().sum()

terms        0
titles       0
abstracts    0
dtype: int64

In [10]:
# Number of rows that exactly repeat an earlier row (all columns are compared).
arxiv_data.duplicated().sum()

15054

In [11]:
# Collect the distinct labels:
#   1. parse each stringified list in "terms" into a real Python list,
#   2. flatten to one label per row with explode(),
#   3. keep the unique values.
labels_column = arxiv_data['terms'].map(literal_eval)
labels = labels_column.explode().unique()
# NOTE(review): "lenght" is a typo for "length"; left as-is so the recorded output still matches.
print("labels :", labels)
print("lenght :", len(labels))

labels : ['cs.LG' 'cs.AI' 'cs.CR' ... 'D.1.3; G.4; I.2.8; I.2.11; I.5.3; J.3'
 '68T07, 68T45, 68T10, 68T50, 68U35' 'I.2.0; G.3']
lenght : 1177


In [12]:
# Remove rows whose title already appeared earlier in the frame, keeping the first
# occurrence of each title (the "terms" column plays no part in this deduplication).
arxiv_data = arxiv_data.drop_duplicates(subset='titles')
print(f"There are {len(arxiv_data)} rows in the deduplicated dataset.")
# Number of distinct "terms" values (label combinations) that occur exactly once.
print((arxiv_data['terms'].value_counts() == 1).sum())
# Number of distinct "terms" values overall.
print(arxiv_data['terms'].nunique())

There are 41105 rows in the deduplicated dataset.
2503
3401


In [13]:
# Drop the rare label combinations: keep only rows whose "terms" value occurs more than
# once in the deduplicated frame. Each row is mapped to the frequency of its "terms"
# value and the frame is masked on that frequency, preserving the original row order.
term_frequency = arxiv_data['terms'].map(arxiv_data['terms'].value_counts())
arxiv_data_filtered = arxiv_data[term_frequency > 1]
arxiv_data_filtered.shape

(38602, 3)

In [14]:
# Convert each stringified label list in "terms" (e.g. "['cs.LG', 'cs.AI']") into a real
# Python list. Only values that are still strings are parsed, so re-running this cell is a
# no-op instead of a ValueError from literal_eval being handed an already-parsed list.
arxiv_data_filtered['terms'] = arxiv_data_filtered['terms'].apply(
    lambda x: literal_eval(x) if isinstance(x, str) else x
)
# Show the first three parsed values as a sanity check.
arxiv_data_filtered['terms'].values[:3]

array([list(['cs.LG']), list(['cs.LG', 'cs.AI']),
       list(['cs.LG', 'cs.CR', 'stat.ML'])], dtype=object)