In [2]:
# 导入包
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import jieba
import torch
import pandas
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [3]:
# 导入数据
df = pandas.read_csv("patent_verified.csv")
df["x"] = df["Application Id"].map(str) + " " + df["Application Date"].map(str) + " " + df["Publication Date"].map(str) + " " + df["Country"].map(str) + " " + df["Title"].map(str) + " " + df["Abstract"].map(str) + " " + df["Applicants"].map(str) + " " + df["Inventors"].map(str)
patents = df["x"].values.tolist()
labels = df["y"].values.tolist()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baoju\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# 清洗数据

def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # 移除特殊字符
    text = re.sub(r'\s+', ' ', text)  # 移除多余空格
    return text.lower()

# 分语言处理和分词
def tokenize(text):
    words = []
    for word in text.split():
        if re.search('[\u4e00-\u9fff]', text):  # 检测中文字符      
            words.extend(jieba.cut(word))
        else:
            words.append(word)
    return words

# 去除停用词
def remove_stopwords(words):
    stop_words_en = set(stopwords.words('english'))
    stop_words_zh = set(["的", "这是", "和", "..."])  # 示例中文停用词
    return [word for word in words if word not in stop_words_en and word not in stop_words_zh]


# 预处理文本
processed_texts = []
for text in patents:
    text = clean_text(text)
    words = tokenize(text)
    words = remove_stopwords(words)
    processed_texts.append(' '.join(words))

# 向量化
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_texts)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\baoju\AppData\Local\Temp\jieba.cache
Loading model cost 0.924 seconds.
Prefix dict has been built successfully.


In [5]:
# 创建PyTorch数据集

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix.toarray(), labels, test_size=0.3)
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# 构建 LSTM 网络模型
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])
        x = self.sigmoid(x)
        return x

# 实例化模型
model = LSTMClassifier(embedding_dim=100, hidden_dim=128, vocab_size=tfidf_matrix.shape[0])

# 损失函数和优化器
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [6]:
# 损失函数和优化器
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        # 前向传播
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
    if loss.item() < 0.15:
        break

Epoch 1/10, Loss: 0.5286474823951721
Epoch 2/10, Loss: -0.13323327898979187


In [7]:

X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    predicted = (outputs.squeeze() > 0.5).float()
    accuracy = accuracy_score(y_test_tensor, predicted)
    print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.4838709677419355
