In [None]:
from transformers import BertTokenizer, BertModel
import torch
import spacy

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pre-trained BERT tokenizer and encoder; the encoder is moved to the
# selected device right after its weights are loaded
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# spaCy pipeline used by extract_linguistic_features for POS tags,
# named entities and dependency relations
nlp = spacy.load('en_core_web_sm')

def extract_linguistic_features(text):
    """Run the module-level spaCy pipeline on *text* and summarise the parse.

    Args:
        text: The string to analyse.

    Returns:
        A dict with three keys, in this order:
          'pos_tags': list of ``token.pos_`` values, one per token.
          'entities': list of ``(text, label_)`` tuples, one per span in
              ``doc.ents``.
          'dependency_tree': list of ``(token text, dep_ label, head text)``
              tuples, one per token.
    """
    parsed = nlp(text)
    return {
        # Part-of-speech tag of every token, in document order
        'pos_tags': [tok.pos_ for tok in parsed],
        # Named entities recognised in the text, with their labels
        'entities': [(span.text, span.label_) for span in parsed.ents],
        # Each token paired with its dependency label and the text of its head
        'dependency_tree': [
            (tok.text, tok.dep_, tok.head.text) for tok in parsed
        ],
    }

def calculate_similarity(sentence1, sentence2):
    """Return the cosine similarity of two sentences' BERT embeddings.

    Both sentences are tokenized together as one padded, truncated batch and
    passed through the module-level ``model`` on ``device`` with gradient
    tracking disabled. Each sentence is represented by the last-layer hidden
    state at token position 0 (the [CLS] token under BertTokenizer's default
    special tokens).

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The cosine similarity of the two position-0 vectors as a Python float.
    """
    encoded = tokenizer(
        [sentence1, sentence2],
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    # Inference only, so no autograd graph is needed
    with torch.no_grad():
        outputs = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )

    # One vector per sentence: the hidden state at the first token position
    first_token_vectors = outputs.last_hidden_state[:, 0, :]

    # Length-1 slices keep the leading batch axis, so each operand is a
    # single-row matrix and the result holds exactly one score
    score = torch.nn.functional.cosine_similarity(
        first_token_vectors[0:1],
        first_token_vectors[1:2],
    )
    return score.item()

# Requirement text that every candidate phrase is scored against
problem_statement = "As the client, I am the manager of a busy public library. We are seeking a software company to develop a Library Management System (LMS) to replace our existing manual processes."

# Candidate entity phrases and relationship phrases
entities = [
    "Library Management System",
    "book catalog",
    "user management",
    "circulation and borrowing",
    "public library",
    "library operations",
    "efficiency",
    "experience",
    "issued books",
    "staff",
    "patrons",
    "analytics",
]
relationships = [
    "automate",
    "user authentication",
    "access control",
    "checkout",
    "return",
    "renewal",
    "improve",
    "enhance",
    "provide",
    "for",
]

# Phase 1: for each entity, keep the relationship whose combined
# "<entity> <relationship>" phrase is most similar to the problem statement
candidate_concepts = []
for entity_name in entities:
    # The (-1.0, "") seed means a relationship is only selected when its score
    # is strictly greater than -1.0; max() keeps the first of equal maxima, so
    # earlier relationships win ties
    scored_relations = [(-1.0, "")]
    scored_relations.extend(
        (calculate_similarity(problem_statement, f"{entity_name} {relation}"), relation)
        for relation in relationships
    )
    best_score, best_relation = max(scored_relations, key=lambda pair: pair[0])

    candidate_concepts.append({
        'entity': entity_name,
        'relationship': best_relation,
        'similarity_score': best_score,
    })

# Phase 2: attach spaCy-derived features of the entity phrase to each record
for record in candidate_concepts:
    record['linguistic_features'] = extract_linguistic_features(record['entity'])

# Phase 3: print every record, one labelled field per line, followed by a
# blank separator line
report_fields = (
    ("Entity:", 'entity'),
    ("Relationship:", 'relationship'),
    ("Similarity Score:", 'similarity_score'),
    ("Linguistic Features:", 'linguistic_features'),
)
for record in candidate_concepts:
    for label, key in report_fields:
        print(label, record[key])
    print()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Entity: Library Management System
Relationship: access control
Similarity Score: 0.7616750001907349
Linguistic Features: {'pos_tags': ['PROPN', 'PROPN', 'PROPN'], 'entities': [('Library Management System', 'ORG')], 'dependency_tree': [('Library', 'compound', 'Management'), ('Management', 'compound', 'System'), ('System', 'ROOT', 'System')]}

Entity: book catalog
Relationship: return
Similarity Score: 0.7209666967391968
Linguistic Features: {'pos_tags': ['NOUN', 'NOUN'], 'entities': [], 'dependency_tree': [('book', 'compound', 'catalog'), ('catalog', 'ROOT', 'catalog')]}

Entity: user management
Relationship: user authentication
Similarity Score: 0.7295514345169067
Linguistic Features: {'pos_tags': ['NOUN', 'NOUN'], 'entities': [], 'dependency_tree': [('user', 'compound', 'management'), ('management', 'ROOT', 'management')]}

Entity: circulation and borrowing
Relationship: access control
Similarity Score: 0.7200219631195068
Linguistic Features: {'pos_tags': ['NOUN', 'CCONJ', 'NOUN'], 'e