In [1]:
from pathlib import Path
from bs4 import BeautifulSoup
import json
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
for path in Path('./data/').rglob('*.json'):
    print(path)
    break


data/2022/9973293.json


In [3]:
class Paper:

    def __init__(self, path):
        self.path = path
        self.json = json.load(open(path, encoding='utf-8'))
        self.title = self.json['articleTitle']
        self.number = self.json['articleNumber']
        self.authors = self.json['authors']
        self.abstract = self.json['abstract']
        self.text = self._get_text()

    def __str__(self):
        return f'{self.title}'

    def __repr__(self):
        return f'{self.title}'

    def _get_text(self):
        text = self.abstract + "\n" + "\n".join([p.text for p in BeautifulSoup(self.json['xml'], 'lxml').find_all('p')])
        return self._clean_text(text)

    def _clean_text(text):
        regex = r"CCBY - IEEE.*|\[\d+\]|\$.*\$|View Source.*|\\begin.*|FIGURE \d+|Fig. \d+|[^A-Za-z0-9^ ]|SECTION [A-Z]+|\t\t|\n|Eq \d+|  "
        regex_empty = r" +"
        regex_eqns = r"Eq \d+|Lemma \d+|section \d+|section \d+ \d+|From \d+|Eqs[^a-z^A-Z]+"
        result = re.sub(regex, " ", text, 0, re.MULTILINE)
        result = re.sub(regex_empty, " ", result, 0, re.MULTILINE)
        result = re.sub(regex_eqns, "", result, 0, re.MULTILINE).strip()
        return result


In [27]:
print(papers[2].text[:500])

CCBY - IEEE is not the copyright holder of this material. Please follow the instructions via https://creativecommons.org/licenses/by/4.0/ to obtain full-text articles and stipulations in the API documentation.

		                        SECTION I.IntroductionWith our increasing reliance on mobile devices, people keep valuable information in their devices and online storage. The wicked people get lured by the potential access to confidential information mainly for illicit financial gain, and othe


In [35]:
with open('paper.xml', 'w', encoding='utf-8') as f:
    f.write(papers[0].json['xml'])

In [36]:
soup = 



In [40]:
print(papers[0].title)

Fisher Information Matrix and its Application of Bipolar Activation Function Based Multilayer Perceptrons With General Gaussian Input


In [42]:
print(papers[0].abstract)

For the widely used multilayer perceptrons (MLPs), there exist singularities in the parameter space where Fisher information matrix (FIM) degenerates on these subspaces. The singularities seriously influence the learning dynamics of MLPs which have attracted many researchers’ attentions. As FIM plays key role in investigating the singular learning dynamics of MLPs, it is very important to obtain t...


In [46]:
for p in soup.find_all('p'):
    print(p.text)
    break

As one of the most important subject in computer science, artificial intelligence has been developed fast in the last years and has been successfully applied in various areas and applications [1], [2], such as pattern recognition, computer vision, intelligence control etc [3], [4], [5]. For artificial intelligence, artificial neural networks play key roles in achieving such outstanding performance [6], [7]. Multilayer perceptrons (MLPs), which are typical feedforward neural networks, also have been widely applied in artificial intelligence [8], [9]. The main advantages of multilayer perceptrons are that they are easy to handle and can approximate any continuous function arbitrary well.


In [None]:
regex = r"CCBY - IEEE.*|\[\d+\]|\$.*\$|View Source.*|\\begin.*|FIGURE \d+|Fig. \d+|[^A-Za-z0-9^ ]|SECTION [A-Z]+|\t\t|\n|Eq \d+|  "

regex_empty = r" +"
regex_eqns = r"Eq \d+|Lemma \d+|section \d+|section \d+ \d+|From \d+|Eqs[^a-z^A-Z]+"

test_str = papers[0].text

result = re.sub(regex, " ", test_str, 0, re.MULTILINE)
result = re.sub(regex_empty, " ", result, 0, re.MULTILINE).strip()
result = re.sub(regex_eqns, "", result, 0, re.MULTILINE).strip()
if result:
    print (result)



In [None]:
papers = [Paper(path) for path in Path('./data/').rglob('*.json')]
print(papers[0].text)

In [5]:
with open('./test.txt','r') as f:
    test_text = f.read()

In [6]:
data_set = [paper.abstract for paper in papers] + [test_text]
len(data_set)

51

In [7]:
data_set[50]

'Beamforming, user scheduling and transmit power on existing interference management schemes in multi-cell mmWave networks have been independently controlled due to the high computational complexity of the problem. In this paper, we formulate a long-term utility maximization problem where beam activation, user scheduling and transmit power are incorporated in a single framework.'

In [8]:
data_set[0]

'For the widely used multilayer perceptrons (MLPs), there exist singularities in the parameter space where Fisher information matrix (FIM) degenerates on these subspaces. The singularities seriously influence the learning dynamics of MLPs which have attracted many researchers’ attentions. As FIM plays key role in investigating the singular learning dynamics of MLPs, it is very important to obtain t...'

In [9]:
count_vect = CountVectorizer()
term_freq_matrix=count_vect.fit_transform(data_set)

tfidf = TfidfTransformer()
tf_idf_matrix = tfidf.fit_transform(term_freq_matrix)



In [10]:
tf_idf_matrix.shape

(51, 1171)

In [11]:
similarity=cosine_similarity(tf_idf_matrix[0], tf_idf_matrix[50])
print('The similarity between 2 documents is: ',similarity[0][0])

The similarity between 2 documents is:  0.05819476225102631


In [12]:
papers[0].json['articleNumber']

'9973293'

In [13]:
for i in range(len(data_set)-1):
    similarity = cosine_similarity(tf_idf_matrix[len(data_set)-1], tf_idf_matrix[i])
    paper_number = papers[i].json['articleNumber']
    print(f'The similarity between text and paper {paper_number} documents is: ',similarity[0][0]*100,'%')

The similarity between text and paper 9973293 documents is:  5.819476225102631 %
The similarity between text and paper 9973315 documents is:  11.416628339836473 %
The similarity between text and paper 9976057 documents is:  11.886337702160521 %
The similarity between text and paper 9973235 documents is:  96.87320554765432 %
The similarity between text and paper 9973236 documents is:  8.73169377014394 %
The similarity between text and paper 9973237 documents is:  3.353209925704058 %
The similarity between text and paper 9973238 documents is:  4.381576733228687 %
The similarity between text and paper 9973239 documents is:  5.700446612519094 %
The similarity between text and paper 9973241 documents is:  6.340685401795887 %
The similarity between text and paper 9973243 documents is:  11.731553880252788 %
The similarity between text and paper 9973244 documents is:  5.577007547929773 %
The similarity between text and paper 9973245 documents is:  7.780611339087515 %
The similarity between tex