In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# The project root sits two directory levels above the current working directory.
proj_dir = Path.cwd().parent.parent

# Directory holding the completed label files, expressed relative to the project root.
path_label_dir = proj_dir.joinpath("data", "processed", "labels", "labels_complete")
path_label_dir

PosixPath('/home/tim/Documents/arxiv-code-search/data/processed/labels/labels_complete')

In [12]:
def summarize_final_label_file(file_path, n_articles):
    """Summarize a completed label file into availability counts and percentages.

    The file holds one row per labelled paragraph. Rows are grouped by article
    ``id`` and each article is given the maximum ``label`` found among its rows
    (a missing label counts as 0). Articles are then split into two categories:
    label 0 -> "Data and Code Not Available", any other label ->
    "Data or Code Publicly Available".

    Parameters
    ----------
    file_path : str or pathlib.Path
        Path to the label file. A ``.csv`` suffix (case-insensitive) is read
        with ``pd.read_csv``; anything else is read as an OpenDocument
        spreadsheet with ``pd.read_excel(engine='odf')``.
    n_articles : int
        Total number of articles that were searched. The difference between
        this and the number of distinct ids in the file is added to the
        label-0 count. No validation is done: a value smaller than the number
        of ids in the file lowers the label-0 count.

    Returns
    -------
    pandas.DataFrame
        Columns ``label_name``, ``count`` and ``percentage`` (percent of the
        total count), one row per category that is present.
    """
    file_path = Path(file_path)

    # The file type has to come from the suffix: Path.stem drops the extension,
    # so testing the stem for ".csv" can never match.
    if file_path.suffix.lower() == ".csv":
        df = pd.read_csv(file_path, parse_dates=['update_date'])
    else:
        df = pd.read_excel(
            file_path,
            parse_dates=['update_date'],
            engine='odf',
            names=["id", "pattern", "token_count", "update_date", "label", "para"],
        )

    # group id and aggregate by max label
    dfr = df.groupby(["id"]).agg({"label": "max"}).reset_index()

    # Replace any NaN in "label" with 0. The result is assigned back to the
    # column because an in-place fillna on `dfr.label` acts on an intermediate
    # Series and is not guaranteed to reach `dfr`.
    dfr["label"] = dfr["label"].fillna(0)

    # the number of articles may have been more than what is contained in the file
    # (because no keywords may have been found)
    n_articles_df = len(dfr)

    df_label_count = dfr.groupby('label').count().reset_index().astype(int)
    df_label_count = df_label_count.iloc[:, :2]
    df_label_count.columns = ['label', 'count']

    # Articles absent from the file had no keyword hits; they count as label 0.
    # The label-0 row is looked up by value: row 0 is only label 0 when at least
    # one article in the file has that label.
    n_missing = n_articles - n_articles_df
    zero_rows = df_label_count.index[df_label_count['label'] == 0]
    if len(zero_rows) > 0:
        df_label_count.loc[zero_rows[0], 'count'] += n_missing
    elif n_missing != 0:
        df_label_count = pd.concat(
            [pd.DataFrame({'label': [0], 'count': [n_missing]}), df_label_count],
            ignore_index=True,
        )

    # "label_name" is the availability category: label 0 means nothing was
    # released, any other label means data or code is publicly available.
    df_label_count['label_name'] = df_label_count.label.apply(
        lambda x: "Data and Code Not Available" if x == 0 else "Data or Code Publicly Available"
        )
    df_label_count = df_label_count.groupby(["label_name"]).agg({"count": "sum"}).reset_index()
    df_label_count['percentage'] = df_label_count['count'] / df_label_count['count'].sum() * 100

    return df_label_count

In [15]:
# Label file to summarize; swap in the commented-out name to summarize the other file.
# file_name = "labels_mssp_99_0-100.ods"
file_name = "labels_energies_98_0-150.ods"

# n_articles is the total number of articles searched, passed in by hand.
df = summarize_final_label_file(
    path_label_dir / file_name,
    n_articles=124,
)
df

Unnamed: 0,label_name,count
0,Data and Code Not Available,117
1,Data or Code Publicly Available,7


# Scratch

In [29]:
file_name = "labels_mssp_99_0-100.ods"
# file_name = "labels_energies_98_0-150.ods"

# Anything that is not a CSV is read as an OpenDocument spreadsheet with
# explicitly supplied column names.
if not file_name.endswith(".csv"):
    df = pd.read_excel(
        path_label_dir / file_name,
        parse_dates=['update_date'],
        engine='odf',
        names=["id", "pattern", "token_count", "update_date", "label", "para"],
    )
else:
    df = pd.read_csv(path_label_dir / file_name, parse_dates=['update_date'])

df.head()

Unnamed: 0,id,pattern,token_count,update_date,label,para
0,10.1016_j.ymssp.2015.05.028,data,14,NaT,,-T denotes the set of all measured data and f ...
1,10.1016_j.ymssp.2015.05.028,data,33,NaT,,These new fx0; P0; Q ; Rg can be used to proce...
2,10.1016_j.ymssp.2015.05.028,data,59,NaT,,An algorithm named Unscented Rauch–Tung–Strieb...
3,10.1016_j.ymssp.2015.05.028,data,88,NaT,,"In this paper, the study is limited to the ana..."
4,10.1016_j.ymssp.2015.05.028,data,159,NaT,,"Finally, the influence of the M4 bolts torque ..."


In [30]:
# Collapse to one row per article id, keeping the maximum label among its rows.
dfr = df.groupby(["id"]).agg({"label": "max"}).reset_index()

# Replace any NaN in the "label" column with 0. The result is assigned back to
# the column: an in-place fillna on `dfr.label` acts on an intermediate Series,
# which warns on recent pandas and leaves `dfr` unchanged under copy-on-write.
dfr["label"] = dfr["label"].fillna(0)
print("Number of articles in df:", len(dfr))
dfr.head()

Number of articles in df: 85


Unnamed: 0,id,label
0,10.1016_j.ymssp.2015.05.028,0.0
1,10.1016_j.ymssp.2016.04.014,0.0
2,10.1016_j.ymssp.2016.04.028,1.0
3,10.1016_j.ymssp.2016.05.026,0.0
4,10.1016_j.ymssp.2016.07.049,0.0


In [31]:
# the number of articles may have been more than what is contained in the file
# (because no keywords may have been found)
n_articles = 100
n_articles_df = len(dfr)

# One row per distinct label value with the number of articles carrying it.
df_label_count = dfr.groupby('label').count().reset_index().astype(int)
df_label_count = df_label_count.iloc[:,:2]
df_label_count.columns = ['label', 'count']

# The difference between n_articles and n_articles_df is the number of articles
# that had no keywords in the search; they count as label 0. The label-0 row is
# looked up by value, because row 0 is only label 0 when at least one article
# in the file has that label.
n_missing = n_articles - n_articles_df
zero_rows = df_label_count.index[df_label_count['label'] == 0]
if len(zero_rows) > 0:
    df_label_count.loc[zero_rows[0], 'count'] += n_missing
elif n_missing != 0:
    df_label_count = pd.concat(
        [pd.DataFrame({'label': [0], 'count': [n_missing]}), df_label_count],
        ignore_index=True,
    )
df_label_count

Unnamed: 0,label,count
0,0,92
1,1,7
2,3,1


In [32]:
# Map each label to an availability category: label 0 means nothing was
# released, any other label means data or code is publicly available.
df_label_count['label_name'] = np.where(
    df_label_count.label == 0,
    "Data and Code Not Available",
    "Data or Code Publicly Available",
)

# Sum the counts within each category.
df_label_count = df_label_count.groupby("label_name", as_index=False)["count"].sum()

# Share of each category as a percentage of all articles.
total_count = df_label_count['count'].sum()
df_label_count['percentage'] = df_label_count['count'] / total_count * 100
df_label_count

Unnamed: 0,label_name,count,percentage
0,Data and Code Not Available,92,92.0
1,Data or Code Publicly Available,8,8.0
