In [128]:

# Steps for preprocessing:
    # Tokenization?
    # Lower casing
    # Stop words removal
    # Stemming?
    # Lemmatization?
    # Bag of words/W2V, CBOW, Skip-gram
    # TF-IDF
import pandas as pd
from rapidfuzz import process, fuzz
import matplotlib.pyplot as plt

import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [244]:
# Read the abstracts export and label its two columns in a single expression.
abstracts = pd.read_csv('./abstracts.csv').set_axis(
    ['work_id', 'abstract_content'], axis='columns'
)
# Preview the first rows as the cell output.
abstracts.head()

Unnamed: 0,work_id,abstract_content
0,W2089745446,The research access/impact problem arises beca...
1,W2100379340,Information technology (IT) acceptance researc...
2,W1981083189,"Edward O. Wilson, in his famous work, Sociobio..."
3,W1560783210,"Open access (OA) is free, unrestricted access ..."
4,W2463568293,"Open access, open data, open source and other ..."


In [112]:
def text_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    Each text is lower-cased, tokenized, lemmatized and stripped of English
    stop words. The two cleaned texts are then vectorized together as two
    documents (so they share one vocabulary) and compared.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A ``(1, 1)`` array holding the cosine similarity of the two documents.
    """
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests. The NLTK stop-word list is
    # lower case, which is why the text is lower-cased before filtering.
    stop_words = set(stopwords.words('english'))

    def _clean(text):
        # Tokenize, lemmatize, drop stop words, and rebuild one document string.
        lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(text.lower()))
        return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

    # TfidfVectorizer expects an iterable of documents. Passing a token list
    # would treat every token as its own document and yield a token-by-token
    # matrix, so each text is re-joined into a single document first.
    vectors = TfidfVectorizer().fit_transform([_clean(text1), _clean(text2)])

    # Row 0 is text1 and row 1 is text2.
    return cosine_similarity(vectors[0], vectors[1], dense_output=True)

def cosine_match(target_ref: str, open_alex_works):
    """Score ``target_ref`` against the abstract of every work.

    Args:
        target_ref: Reference text to compare.
        open_alex_works: DataFrame with ``work_id`` and ``abstract_content``
            columns; one candidate work per row.

    Returns:
        A list with one single-entry dict per row, mapping the row's
        ``work_id`` to the value returned by ``text_similarity``.
    """
    return [
        {work['work_id']: text_similarity(target_ref, work['abstract_content'])}
        for _, work in open_alex_works.iterrows()
    ]

In [260]:
# Query: the first abstract row, kept as a one-row DataFrame inside a list.
# NOTE(review): cosine_match annotates target_ref as str, so this value would
# need unwrapping (e.g. to the abstract text) before being passed there -
# confirm the intended shape.
target_ref = [abstracts[0:1]]
# Candidates: the full table. This is an alias, not a copy, so in-place changes
# made through either name affect both.
open_alex_works = abstracts
# Show the first record as the cell output.
abstracts.iloc[0]


work_id                                                   W2089745446
abstract_content    The research access/impact problem arises beca...
Name: 0, dtype: object

In [113]:
# Print each work id on its own line, reading the column directly.
for work_id in open_alex_works['work_id']:
    print(work_id)

W2100379340
W1981083189


In [297]:
# Fit one TF-IDF model over the whole corpus so every abstract shares a vocabulary.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(abstracts['abstract_content'])

# Compare the first abstract (row 0) with every abstract in a single call.
# ravel() flattens the (1, n) result so each row gets a plain float instead of
# a nested 1x1 array.
scores = cosine_similarity(vectors[0], vectors).ravel()

similarities = [
    [work_id, score, abstract]
    for work_id, score, abstract in zip(
        abstracts['work_id'], scores, abstracts['abstract_content']
    )
]
df = pd.DataFrame(similarities, columns=['work_id', 'cosine_similarity', 'abstract_content'])

# Print the query abstract next to the abstract at position 62 for a manual check.
# NOTE(review): 62 is a hard-coded position - confirm it is still the row of interest.
print(abstracts.iloc[0]['abstract_content'], abstracts.iloc[62]['abstract_content'])


The research access/impact problem arises because journal articles are not accessible to all of their would-be users; hence, they are losing potential research impact. The solution is to make all articles Open Access (OA; i.e., accessible online, free for all). OA articles have significantly higher citation impact than non-OA articles. There are two roads to OA: the “golden” road (publish your article in an OA journal) and the “green” road (publish your article in a non-OA journal but also self-archive it in an OA archive). Only 5% of journals are gold, but over 90% are already green (i.e., they have given their authors the green light to self-archive); yet only about 10â€“20% of articles have been self-archived. To reach 100% OA, self-archiving needs to be mandated by researchers' employers and funders, as the United Kingdom and the United States have recently recommended, and universities need to implement that mandate. The research access/impact problem arises because journal articl

In [195]:


def rapidfuzz_match(extracted_references: pd.DataFrame, openalex_works: pd.DataFrame, scorer = fuzz.WRatio):
    """Find the closest abstracts in ``openalex_works`` for each reference.

    Args:
        extracted_references: DataFrame with an ``abstract_content`` column;
            each row is one query.
        openalex_works: DataFrame with an ``abstract_content`` column holding
            the candidate texts.
        scorer: rapidfuzz scoring function forwarded to ``process.extract``,
            e.g. ``fuzz.WRatio``, ``fuzz.partial_ratio``,
            ``fuzz.token_set_ratio``, ``fuzz.partial_token_set_ratio`` or
            ``fuzz.token_sort_ratio``.

    Returns:
        DataFrame with one row per reference and the columns
        ``extracted_reference``, ``top_match``, ``top_names``, ``top_scores``,
        ``top_indexes``, ``second_match``, ``second_names``,
        ``second_scores``, ``second_indexes`` and ``third_match``. Each
        ``*_match`` cell is the result tuple from rapidfuzz, read here as
        (matched text, score, index). Slots for which no candidate was
        returned are ``None``.
    """
    columns = [
        'extracted_reference',
        'top_match', 'top_names', 'top_scores', 'top_indexes',
        'second_match', 'second_names', 'second_scores', 'second_indexes',
        'third_match',
    ]
    choices = openalex_works['abstract_content']
    no_match = (None, None, None)

    rows = []
    for _, reference in extracted_references.iterrows():
        query = reference['abstract_content']
        matches = list(process.extract(query, choices, scorer=scorer, limit=3))
        # Pad to three slots so fewer than three candidates does not break
        # the unpacking below.
        matches += [None] * (3 - len(matches))
        top, second, third = matches

        top_name, top_score, top_index = top if top is not None else no_match
        second_name, second_score, second_index = second if second is not None else no_match

        # Store the query text itself. Zipping the DataFrame would iterate its
        # column labels and truncate the result to the number of columns.
        rows.append((
            query,
            top, top_name, top_score, top_index,
            second, second_name, second_score, second_index,
            third,
        ))

    return pd.DataFrame(rows, columns=columns)



In [197]:
# target_ref = [abstracts[0:1]]
# target_ref
# open_alex_works = abstracts[1:]
# open_alex_works['work_id']
# abstracts.iloc[0]

