In [35]:
import pickle
import os
import sys
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [36]:
def parse_papers_groupby_newline(file_path):
    # Read the file and split by double newlines (delimiter between papers)
    with open(file_path, 'r') as file:
        paper_blocks = file.read().strip().split("\n\n")
    
    # Initialize list to store paper data
    papers = []
    
    # Process each paper block
    for block in paper_blocks:
        current_paper = {
            "title": None,
            "authors": None,
            "year": None,
            "venue": None,
            "index": None,
            "citations": [],
            "abstract": None,
        }
        # Process lines within the block
        for line in block.split("\n"):
            line = line.strip()
            if line.startswith('#*'):  # Title
                current_paper["title"] = line[2:]
            elif line.startswith('#@'):  # Authors
                current_paper["authors"] = line[2:]
            elif line.startswith('#t'):  # Year
                current_paper["year"] = line[2:]
            elif line.startswith('#c'):  # Venue
                current_paper["venue"] = line[2:]
            elif line.startswith('#index'):  # Index
                current_paper["index"] = line[6:]
            elif line.startswith('#%'):  # Citations
                current_paper["citations"].append(line[2:])
            elif line.startswith('#!'):  # Abstract
                current_paper["abstract"] = line[2:]
            
        papers.append(current_paper)
    
    # Convert to DataFrame
    df = pd.DataFrame(papers)
    df["citations"] = df["citations"].apply(lambda x: ";".join(x) if x else None)
    return df

# Usage
file_path = "Data/papers.txt" 
df = parse_papers_groupby_newline(file_path)

In [37]:
df

Unnamed: 0,title,authors,year,venue,index,citations,abstract
0,Automated Deduction in Geometry: 5th Internati...,"Hoon Hong,Dongming Wang",2006,,0,,
1,A+ Certification Core Hardware (Text & Lab Man...,Charles J. Brooks,2003,,1,,
2,Performance engineering in industry: current p...,"Ahmed E. Hassan,Parminder Flora",2007,Proceedings of the 6th international workshop ...,2,,This panel session discusses performance engin...
3,"Dude, You Can Do It! How to Build a Sweeet PC","Darrel Creacy,Carlito Vicencio",2005,,3,,Whether you're frustrated with current PC offe...
4,What Every Programmer Needs to Know about Secu...,"Neil Daswani,Anita Kesavan",2006,,4,,
...,...,...,...,...,...,...,...
629809,Mining A,,2008,Proceedings of the VLDB Endowment,629809,,
629810,Review article,,2008,Communications of the ACM,629810,,
629811,Multimodal system evaluation using modality ef...,"Manolis Perakakis,Alexandros Potamianos",2008,Proceedings of the 10th international conferen...,629811,294663;302639;572828,"In this paper, we propose two new objective me..."
629812,Computer System Architecture,V. K. Jain,2007,,629812,,


In [38]:
print(df.shape)

(629814, 7)


In [39]:
df[df['citations'].notnull()]

Unnamed: 0,title,authors,year,venue,index,citations,abstract
5,Interpreting Kullback-Leibler divergence with ...,"Shinto Eguchi,John Copas",2006,Journal of Multivariate Analysis,5,436405,Kullback-Leibler divergence and the Neyman-Pea...
17,Approximating fluid schedules in crossbar pack...,"Michael Rosenblum,Constantine Caramanis,Michel...",2006,IEEE/ACM Transactions on Networking (TON),17,357875;214023;317448;319987;334185;95255;29412...,We consider a problem motivated by the desire ...
24,On product covering in 3-tier supply chain mod...,"Jianer Chen,Fenghui Zhang",2006,Theoretical Computer Science,24,251778;436906;623227;287885,The field of supply chain management has been ...
35,An Integrative Modelling Approach for Simulati...,"Tibor Bosse,Catholijn M. Jonker,Jan Treur",2006,Proceedings of the 39th annual Symposium on Si...,35,247215;618899,To simulate adaptive agents with abilities mat...
53,Class-specific feature polynomial classifier f...,"Cheng-Lin Liu,Hiroshi Sako",2006,Pattern Recognition,53,159598;603894;586607;471174;87254;302739;449562,The polynomial classifier (PC) that takes the ...
...,...,...,...,...,...,...,...
629803,Visualizing Proof Search for Theorem Prover De...,"John Byrnes,Michael Buchanan,Michael Ernst,Phi...",2009,Electronic Notes in Theoretical Computer Scien...,629803,101969;556422;566234;573428;622705,We describe an interactive visualization tool ...
629804,SENTINEL: a semantic business process monitori...,"Carlos Pedrinaci,Dave Lambert,Branimir Wetzste...",2008,Proceedings of the first international worksho...,629804,12156;29272;29779;88763;261856;340817;408390;4...,Business Activity Monitoring (BAM) aims to sup...
629806,Effectiveness and usability of an online help ...,"Jérôme Simonin,Noëlle Carbonell,Danielle Pelé",2008,Proceedings of the 10th international conferen...,629806,8543;327540;395578;397153;398612,An empirical study is presented which aims at ...
629807,Busy period analysis of finite QBD processes,"Chaitanya Garikiparthi,Appie van de Liefvoort,...",2008,ACM SIGMETRICS Performance Evaluation Review,629807,340965,We present the number of customers served and ...


In [40]:
df[df['citations'].notnull() & df['venue'].isnull()]

Unnamed: 0,title,authors,year,venue,index,citations,abstract


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629814 entries, 0 to 629813
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   title      629814 non-null  object
 1   authors    629814 non-null  object
 2   year       629814 non-null  object
 3   venue      629814 non-null  object
 4   index      629814 non-null  object
 5   citations  125372 non-null  object
 6   abstract   281080 non-null  object
dtypes: object(7)
memory usage: 33.6+ MB


In [42]:
df.describe()

Unnamed: 0,title,authors,year,venue,index,citations,abstract
count,629814,629814.0,629814,629814.0,629814,125372,281080
unique,617518,480994.0,67,12610.0,629814,117185,279014
top,Preface,,2008,,629813,246511,NO SUPPLIED
freq,306,25916.0,45378,98594.0,1,64,56


endegging
all-minilml6v2

prendi n papers tra cui quello da predirre e dai attraverso gli embeddings quale è il più probabile che citi

In [43]:
# import the picke file 
with open('Data/pid_cat_dict.pkl', 'rb') as file:
    picke_df = pickle.load(file)

picke_df = pd.DataFrame(picke_df.items(), columns=['index', 'categories'])

In [44]:
picke_df

Unnamed: 0,index,categories
0,0,"[13, 18]"
1,1,"[18, 25]"
2,2,"[6, 11]"
3,3,[18]
4,4,"[17, 19]"
...,...,...
629809,629809,[34]
629810,629810,[33]
629811,629811,"[1, 21]"
629812,629812,"[1, 18]"


In [45]:
# Merge the two dataframes
df = df.merge(picke_df, on='index', how='left')

In [46]:
df

Unnamed: 0,title,authors,year,venue,index,citations,abstract,categories
0,Automated Deduction in Geometry: 5th Internati...,"Hoon Hong,Dongming Wang",2006,,0,,,"[13, 18]"
1,A+ Certification Core Hardware (Text & Lab Man...,Charles J. Brooks,2003,,1,,,"[18, 25]"
2,Performance engineering in industry: current p...,"Ahmed E. Hassan,Parminder Flora",2007,Proceedings of the 6th international workshop ...,2,,This panel session discusses performance engin...,"[6, 11]"
3,"Dude, You Can Do It! How to Build a Sweeet PC","Darrel Creacy,Carlito Vicencio",2005,,3,,Whether you're frustrated with current PC offe...,[18]
4,What Every Programmer Needs to Know about Secu...,"Neil Daswani,Anita Kesavan",2006,,4,,,"[17, 19]"
...,...,...,...,...,...,...,...,...
629809,Mining A,,2008,Proceedings of the VLDB Endowment,629809,,,[34]
629810,Review article,,2008,Communications of the ACM,629810,,,[33]
629811,Multimodal system evaluation using modality ef...,"Manolis Perakakis,Alexandros Potamianos",2008,Proceedings of the 10th international conferen...,629811,294663;302639;572828,"In this paper, we propose two new objective me...","[1, 21]"
629812,Computer System Architecture,V. K. Jain,2007,,629812,,,"[1, 18]"


In [47]:
categories_flat = [category for sublist in df['categories'] for category in sublist]
unique_categories = sorted(set(categories_flat))

for category in unique_categories:
    df[f'category_{category}'] = df['categories'].apply(lambda x: 1 if category in x else 0)

In [48]:
# give the rows which have empty authors
df['authors'] = df['authors'].apply(lambda x: x.split(','))
df[df['authors'].apply(lambda x: len(x)) == 0]

Unnamed: 0,title,authors,year,venue,index,citations,abstract,categories,category_0,category_1,...,category_25,category_26,category_27,category_28,category_29,category_30,category_31,category_32,category_33,category_34
