In [26]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from tqdm import tqdm
import model_names

# Load the model names
all_data = model_names.model_names





In [27]:
# Tokenize the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_data = [tokenizer.encode_plus(text, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_tensors='pt') for text in all_data]


In [28]:
# Prepare input tensors
input_ids = torch.cat([d['input_ids'] for d in tokenized_data], dim=0)
attention_masks = torch.cat([d['attention_mask'] for d in tokenized_data], dim=0)

In [29]:
# Create labels (0 for all data)
labels = torch.zeros(len(all_data))

# Split data into train and test
train_inputs, test_inputs, train_labels, test_labels = input_ids, input_ids, labels, labels
train_masks, test_masks = attention_masks, attention_masks

In [30]:

# Create DataLoader
batch_size = 4
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


In [31]:
# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)  # We're doing binary classification, so num_labels=1

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [32]:
# Set optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)


# Fine-tune the model
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Epoch %d" % epoch):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2].unsqueeze(1)}  # Unsqueezing labels to match shape
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print("Average training loss:", total_loss / len(train_dataloader))

Epoch 0: 100%|██████████| 777/777 [31:03<00:00,  2.40s/it]


Average training loss: 0.002400513429053215


Epoch 1: 100%|██████████| 777/777 [27:46<00:00,  2.14s/it]


Average training loss: 0.0006748705350412233


Epoch 2: 100%|██████████| 777/777 [39:22<00:00,  3.04s/it]   

Average training loss: 0.0003054956997014156





In [39]:
model.eval()
with torch.no_grad():
        keyword = "sklearn"  # Example keyword to classify
        inputs = tokenizer.encode_plus(keyword, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_tensors='pt').to(device)
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        prediction = torch.sigmoid(outputs.logits).item()
        if prediction > 0.5:
            print(f"'{keyword}' is classified as a model/library. Prediction: {prediction}")
        else:
            print(f"'{keyword}' is not classified as a model/library. Prediction: {prediction}")

'sklearn' is not classified as a model/library. Prediction: 0.49942097067832947


In [34]:
# # Save the model
# torch.save(model.state_dict(), "keyword_extraction_model.pth")

# # Export tokenizer to pickle file
# import pickle
# with open("tokenizer.pkl", "wb") as f:
#     pickle.dump(tokenizer, f)

In [2]:
# import numpy as np
# import torch
# from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
# from torch.utils.data import DataLoader, TensorDataset, RandomSampler
# from tqdm import tqdm
# import model_names

# # Load the model names
# all_data = model_names.model_names

# # Create labels (0 for all data)
# labels = torch.zeros(len(all_data))

# # Load tokenizer and model
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

# # Tokenize the data
# tokenized_data = tokenizer(all_data, padding=True, truncation=True, return_tensors='pt')

# # Prepare input tensors
# input_ids = tokenized_data['input_ids']
# attention_masks = tokenized_data['attention_mask']

# # Create DataLoader
# batch_size = 16
# train_data = TensorDataset(input_ids, attention_masks, labels)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# # Set device to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Set optimizer and learning rate
# optimizer = AdamW(model.parameters(), lr=2e-5)

# # Fine-tune the model
# num_epochs = 3
# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0
#     for batch in tqdm(train_dataloader, desc="Epoch %d" % epoch):
#         batch = tuple(t.to(device) for t in batch)
#         inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2].unsqueeze(1)}  # Unsqueezing labels to match shape
#         optimizer.zero_grad()
#         outputs = model(**inputs)
#         loss = outputs.loss
#         total_loss += loss.item()
#         loss.backward()
#         optimizer.step()
#     print("Average training loss:", total_loss / len(train_dataloader))

#     # Test classification
#     model.eval()
#     with torch.no_grad():
#         keyword = "scikit-learn"  # Example keyword to classify
#         inputs = tokenizer(keyword, return_tensors='pt').to(device)
#         outputs = model(**inputs)
#         prediction = torch.sigmoid(outputs.logits).item()
#         if prediction > 0.5:
#             print(f"'{keyword}' is classified as a model/library.")
#         else:
#             print(f"'{keyword}' is not classified as a model/library. Prediction: {prediction}")

# # Save the model
# torch.save(model.state_dict(), "keyword_extraction_model.pth")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 195/195 [04:29<00:00,  1.38s/it]


Average training loss: 0.0021020583918867396
'scikit-learn' is classified as a model/library.


Epoch 1: 100%|██████████| 195/195 [04:58<00:00,  1.53s/it]


Average training loss: 0.0015899744751946762
'scikit-learn' is not classified as a model/library. Prediction: 0.49545249342918396


Epoch 2: 100%|██████████| 195/195 [04:53<00:00,  1.51s/it]


Average training loss: 0.0010678176280821507
'scikit-learn' is classified as a model/library.


In [19]:
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import model_names

# Load the model names
all_data = model_names.model_names

# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Prompts
model_prompt = "Is '{keyword}' used in machine learning or deep learning?"
library_prompt = "Is '{keyword}' a Python library commonly used for data analysis or machine learning?"
domain_prompt = "Does '{keyword}' appear to be a domain name of any field of work?"

# Function to classify a keyword
def classify_keyword(keyword):
    try:
        # Prepare prompts
        model_input_ids = tokenizer.encode(model_prompt.format(keyword=keyword), return_tensors="pt")
        library_input_ids = tokenizer.encode(library_prompt.format(keyword=keyword), return_tensors="pt")
        domain_input_ids = tokenizer.encode(domain_prompt.format(keyword=keyword), return_tensors="pt")

        # Generate text completions
        with torch.no_grad():
            model_outputs = model.generate(model_input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
            library_outputs = model.generate(library_input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
            domain_outputs = model.generate(domain_input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

        # Decode the generated texts
        model_generated_text = tokenizer.decode(model_outputs[0], skip_special_tokens=True)
        library_generated_text = tokenizer.decode(library_outputs[0], skip_special_tokens=True)
        domain_generated_text = tokenizer.decode(domain_outputs[0], skip_special_tokens=True)

        # Check for keywords in generated texts
        if "yes" in model_generated_text.lower():
            return "Model"
        elif "yes" in library_generated_text.lower():
            return "Library"
        elif "yes" in domain_generated_text.lower():
            return "Domain"
        else:
            return "Other"
    except KeyError:
        return "Other"

# Example usage
keywords = [
    "scikit-learn",
    "nltk",
    "tensorflow",
    "pandas",
    "gensim",
    "domain.com",
    "model_name",
    "dfgrdrg"
]

for keyword in keywords:
    classification = classify_keyword(keyword)
    print(f"{keyword}: {classification}")


scikit-learn: Model
nltk: Model
tensorflow: Model
pandas: Model
gensim: Model
domain.com: Model
model_name: Library
dfgrdrg: Model
