In [2]:
import itertools
import pickle
import re
import tempfile
from time import time

import matplotlib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import spacy
from gensim.models import KeyedVectors, Word2Vec
from scipy import spatial
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
# Download every paper record from the local API and create the empty
# accumulators that the corpus-building cell fills in.
url = "http://127.0.0.1:4000/api/paper"
# A timeout keeps the notebook from hanging forever if the local server is
# down or stalls; raise_for_status() surfaces HTTP errors here instead of
# letting an error payload flow into the parsing loop.
response = requests.get(url, timeout=60)
response.raise_for_status()
papers = response.json()
corpus = []
label = []

In [None]:
# Matches one bracketed span such as "[Smith et al.]" and captures its
# contents. The brackets must be escaped: unescaped, "[(.*?)]" is a character
# class that matches a single "(", ".", "*", "?" or ")" character, so no
# bracketed citation was ever found or removed.
citation_pattern = re.compile(r"\[(.*?)\]")

# Start from empty accumulators so that re-running this cell rebuilds the
# corpus instead of appending a second copy of every paper.
corpus = []
label = []
for data in papers:
    # One document per paper: title and abstract joined into a single string.
    text = data["title"] + ". " + data["abstract"]
    for inner in citation_pattern.findall(text):
        # Drop bracketed spans containing "et" (presumably "et al."
        # citations), together with the single space in front of them.
        if "et" in inner:
            text = text.replace(f" [{inner}]", "")
    corpus.append(text)
    label.append(data["topo_label"])
# Hold out 30% of the documents for testing.
# NOTE(review): no random_state is given, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(corpus, label, test_size=0.3)

In [3]:
# Load the spaCy pipeline "en_core_sci_lg" once; spacy_tokenizer reuses this shared object.
nlp = spacy.load("en_core_sci_lg")

def spacy_tokenizer(sample):
    """Return the lemmas of the meaningful tokens in ``sample``.

    ``sample`` is converted with ``str()`` and run through the module-level
    ``nlp`` pipeline. Stop words and tokens whose text is a single character
    are skipped; the lemma of every remaining token is returned in document
    order as a list of strings.
    """
    lemmas = []
    for token in nlp(str(sample)):
        # Skip stop words and one-character tokens.
        if token.is_stop or len(token.text) <= 1:
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [4]:
# Tokenise every document in the corpus and report the wall-clock time taken.
t0 = time()
print("开始分词")
# One list of lemmas per document, in corpus order.
raw_tokens = list(map(spacy_tokenizer, corpus))
print("结束分词，总计用时：", time() - t0)

开始分词
结束分词，总计用时： 546.0849168300629


In [5]:
# Train a Word2Vec model on the tokenised corpus and report the vocabulary
# size and the wall-clock time taken.
t0 = time()
print("开始w2v模型")
model = Word2Vec(
    sentences=raw_tokens,
    vector_size=300,  # dimensionality of the word vectors
    window=10,
    min_count=5,
    sg=1,
    hs=1,
    workers=4,
)

# Number of distinct keys kept in the trained vocabulary.
print(len(model.wv.index_to_key))
print("结束w2v模型，总计用时：", time() - t0)

开始w2v模型
15530
结束w2v模型，总计用时： 107.60916495323181


In [1]:
# Persist the full model under a uniquely named "gensim-model-*" file, then
# save the word vectors on their own to "model/word2vec.kv".
#
# The directory is written as a raw string: in a normal string literal "\d",
# "\p", "\s" and "\m" are invalid escape sequences, which Python warns about
# and plans to reject. The resulting path value is unchanged.
# NOTE(review): this absolute path is machine-specific, while the vectors
# below go to a path relative to the working directory; consider one
# configurable model directory.
with tempfile.NamedTemporaryFile(
    prefix="gensim-model-",
    dir=r"D:\database\python\spider\model",
    delete=False,  # keep the file after the context manager closes it
) as tmp:
    temporary_filepath = tmp.name
    model.save(temporary_filepath)
# To reload a previously saved model instead of retraining:
# model = Word2Vec.load("./model/gensim-model-wa7o86qv")
model.wv.save("model/word2vec.kv")

NameError: name 'tempfile' is not defined

In [9]:
# Show the 50 vocabulary entries the trained model ranks as most similar to "kagome".
model.wv.most_similar('kagome',topn=50)

[('vanadium-based', 0.6092997789382935),
 ('av3sb5', 0.6051902174949646),
 ('AV3Sb5', 0.595984160900116),
 ('fesn', 0.595007061958313),
 ('breathe', 0.5882386565208435),
 ('rmn6sn6', 0.5645654797554016),
 ('kv3sb5', 0.5464086532592773),
 ('csv3sb5', 0.5401225090026855),
 ('distorted', 0.5388354659080505),
 ('fe3sn2', 0.5379699468612671),
 ('corner-sharing', 0.5356065630912781),
 ('herbertsmithite', 0.534034013748169),
 ('kagome-lattice', 0.5309625864028931),
 ('cosn', 0.5253282785415649),
 ('triangular', 0.5247768759727478),
 ('intertwining', 0.5101521611213684),
 ('zncu3(oh)6cl2', 0.5086246728897095),
 ('v-based', 0.507774829864502),
 ('non-trivial', 0.5058199167251587),
 ('haf', 0.5003094673156738),
 ('antiferromagnets', 0.4997885525226593),
 ('tripod', 0.4969598948955536),
 ('averievite', 0.49079686403274536),
 ('Mn3Ge', 0.4859919548034668),
 ('swedenborgite', 0.4858311116695404),
 ('rb', 0.4855373203754425),
 ('cs', 0.47623759508132935),
 ('Kagome', 0.47328680753707886),
 ('husimi'