In [None]:
pip install gensim


In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [2]:

# Read the source CSV into a DataFrame; every later cell works on `data`.
file_path = 'sctask_data_dev_v13_utf8.csv'  # Update with your file path
data = pd.read_csv(
    file_path,
    encoding='utf-8',
)

In [3]:
# Map each distinct 'cat_item' value to an integer code.
# The fitted encoder is kept so the same mapping can be reused later.
cat_item_encoder = LabelEncoder().fit(data['cat_item'])
data['cat_item_encoded'] = cat_item_encoder.transform(data['cat_item'])

In [4]:
# Tokenize the short descriptions on whitespace.
# Missing descriptions are replaced by '' first so they become empty token lists;
# otherwise they would stay NaN (a float), which cannot be iterated as a sentence
# by the Word2Vec training and vector-averaging steps further down.
data['short_description_tokenized'] = data['short_description'].fillna('').str.split()

In [6]:


# Word2Vec hyper-parameters, gathered in one place so they are easy to tune.
# NOTE(review): gensim documents that training is only fully reproducible with a
# single worker thread — confirm whether run-to-run reproducibility matters here.
w2v_params = dict(
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every word, even if it occurs once
    workers=4,        # number of CPU threads
    sg=1,             # 1 = skip-gram
)

# Train the Word2Vec model on the tokenized short descriptions.
w2v_model = Word2Vec(sentences=data['short_description_tokenized'], **w2v_params)

# Generate sentence embeddings by averaging word vectors
def vectorize_sentence(tokens, model, vector_size=None):
    """Return the mean word vector of the in-vocabulary tokens of one sentence.

    Args:
        tokens: Iterable of token strings. ``None`` or NaN (what pandas yields
            for a missing description) is treated as an empty sentence.
        model: Object exposing a ``wv`` attribute that supports
            ``word in model.wv`` and ``model.wv[word]``.
        vector_size: Length of the zero vector returned when no token is in
            the vocabulary. Defaults to ``model.wv.vector_size`` when omitted.

    Returns:
        numpy.ndarray: Element-wise mean of the matched word vectors, or a
        zero vector of length ``vector_size`` when nothing matched.
    """
    if vector_size is None:
        vector_size = model.wv.vector_size

    # A missing description is not iterable; treat it like an empty sentence.
    if tokens is None or (isinstance(tokens, float) and np.isnan(tokens)):
        return np.zeros(vector_size)

    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Embed every tokenized description as the mean of its Word2Vec word vectors.
data['short_description_vector'] = data['short_description_tokenized'].apply(
    lambda tokens: vectorize_sentence(tokens, w2v_model, vector_size=w2v_model.vector_size)
)

# Create the feature matrix: one column per embedding dimension plus the encoded cat_item.
X = pd.DataFrame(data['short_description_vector'].tolist())  # Expand vectors into separate columns
# Assign by position: X has a fresh RangeIndex, so a label-aligned assignment would
# silently insert NaN if `data` ever carried a non-default index (e.g. after filtering).
X['cat_item_encoded'] = data['cat_item_encoded'].to_numpy()
# Column labels are a mix of ints and one string at this point; make them all strings.
X.columns = X.columns.astype(str)

# Encode the target 'kbid'
kbid_encoder = LabelEncoder()
data['kbid_encoded'] = kbid_encoder.fit_transform(data['kbid'])
y = data['kbid_encoded']

# Split the data into training and test sets
# NOTE(review): the Word2Vec vectors were trained on all rows, including the ones
# that end up in the test split here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
# `labels` is passed explicitly so the report still works when a rare class is
# absent from the test split; without it, classification_report raises because
# the number of observed classes no longer matches `target_names`.
# NOTE(review): the recorded run shows 1.00 for every class — worth checking
# whether 'cat_item' alone already determines 'kbid' (TODO confirm).
y_pred = model.predict(X_test)
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(kbid_encoder.classes_)),
    target_names=kbid_encoder.classes_,
))


                                          precision    recall  f1-score   support

                  Basic_Mainframe_Access       1.00      1.00      1.00        15
                           Clean_Up_RACF       1.00      1.00      1.00        19
                               GuestWifi       1.00      1.00      1.00       109
      Kiteworks_Add_User_Existing_Folder       1.00      1.00      1.00        63
      Kiteworks_Create_New_Secure_Folder       1.00      1.00      1.00        66
          Kiteworks_Remove_Secure_Folder       1.00      1.00      1.00        46
            Off_Boarding_Integrity_Check       1.00      1.00      1.00        15
                                  Others       1.00      1.00      1.00      2654
                 Remove_Mainframe_Access       1.00      1.00      1.00        23
                computer_integrity_check       1.00      1.00      1.00        25
                   corporate_access_d365       1.00      1.00      1.00        52
               

In [None]:
## Alternative: tokenize with spaCy instead of `.split()` (sketch below, currently commented out)

In [None]:
# import spacy
# from gensim.models import Word2Vec
# import pandas as pd
# import numpy as np

# # Load SpaCy model
# nlp = spacy.load("en_core_web_sm")

# # Tokenize short descriptions using SpaCy
# def tokenize_with_spacy(text):
#     return [token.text.lower() for token in nlp(text) if not token.is_punct and not token.is_space]

# data['short_description_tokenized'] = data['short_description'].apply(tokenize_with_spacy)

# # Train Word2Vec as before
# w2v_model = Word2Vec(
#     sentences=data['short_description_tokenized'], 
#     vector_size=100, 
#     window=5, 
#     min_count=1, 
#     workers=4, 
#     sg=1
# )


In [None]:
## Transformer tokenizer variant: bert-base-uncased tokens + Word2Vec + Random Forest

In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Read the source CSV into a DataFrame.
file_path = 'sctask_data_dev_v13_utf8.csv'  # Update with your file path
data = pd.read_csv(
    file_path,
    encoding='utf-8',
)

# Map each distinct 'cat_item' value to an integer code; the fitted encoder is
# reused at prediction time.
cat_item_encoder = LabelEncoder().fit(data['cat_item'])
data['cat_item_encoded'] = cat_item_encoder.transform(data['cat_item'])

# Hugging Face tokenizer used to split the short descriptions into tokens.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the short descriptions
def tokenize_with_hf(text):
    """Split ``text`` into lower-cased tokens using the module-level ``tokenizer``.

    Args:
        text: The description to tokenize. A missing value (``None``/NaN, as
            pandas yields for empty CSV cells) produces an empty token list;
            any other non-string value is converted with ``str()`` first.

    Returns:
        list[str]: The tokens returned by ``tokenizer.tokenize``, lower-cased.
    """
    if not isinstance(text, str):
        # Missing descriptions would otherwise make tokenizer.tokenize raise.
        if pd.isna(text):
            return []
        text = str(text)
    return [token.lower() for token in tokenizer.tokenize(text)]

# Tokenize every short description with the Hugging Face tokenizer.
data['short_description_tokenized'] = data['short_description'].apply(
    lambda text: tokenize_with_hf(text)
)

# Word2Vec hyper-parameters, gathered in one place so they are easy to tune.
# NOTE(review): gensim documents that training is only fully reproducible with a
# single worker thread — confirm whether run-to-run reproducibility matters here.
w2v_params = dict(
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every token, even if it occurs once
    workers=4,        # number of CPU threads
    sg=1,             # 1 = skip-gram
)

# Train the Word2Vec model on the tokenized short descriptions.
w2v_model = Word2Vec(sentences=data['short_description_tokenized'], **w2v_params)

# Generate sentence embeddings by averaging word vectors
def vectorize_sentence(tokens, model, vector_size=None):
    """Return the mean word vector of the in-vocabulary tokens of one sentence.

    Args:
        tokens: Iterable of token strings. ``None`` or NaN (what pandas yields
            for a missing description) is treated as an empty sentence.
        model: Object exposing a ``wv`` attribute that supports
            ``word in model.wv`` and ``model.wv[word]``.
        vector_size: Length of the zero vector returned when no token is in
            the vocabulary. Defaults to ``model.wv.vector_size`` when omitted.

    Returns:
        numpy.ndarray: Element-wise mean of the matched word vectors, or a
        zero vector of length ``vector_size`` when nothing matched.
    """
    if vector_size is None:
        vector_size = model.wv.vector_size

    # A missing description is not iterable; treat it like an empty sentence.
    if tokens is None or (isinstance(tokens, float) and np.isnan(tokens)):
        return np.zeros(vector_size)

    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Embed every tokenized description as the mean of its Word2Vec word vectors.
data['short_description_vector'] = data['short_description_tokenized'].apply(
    lambda tokens: vectorize_sentence(tokens, w2v_model, vector_size=w2v_model.vector_size)
)

# Create the feature matrix: one column per embedding dimension plus the encoded cat_item.
X = pd.DataFrame(data['short_description_vector'].tolist())  # Expand vectors into separate columns
# Assign by position: X has a fresh RangeIndex, so a label-aligned assignment would
# silently insert NaN if `data` ever carried a non-default index (e.g. after filtering).
X['cat_item_encoded'] = data['cat_item_encoded'].to_numpy()
# Column labels are a mix of ints and one string at this point; make them all strings.
X.columns = X.columns.astype(str)


# Encode the target 'kbid'
kbid_encoder = LabelEncoder()
data['kbid_encoded'] = kbid_encoder.fit_transform(data['kbid'])
y = data['kbid_encoded']

# Split the data into training and test sets
# NOTE(review): the Word2Vec vectors were trained on all rows, including the ones
# that end up in the test split here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
# `labels` is passed explicitly so the report still works when a rare class is
# absent from the test split; without it, classification_report raises because
# the number of observed classes no longer matches `target_names`.
y_pred = model.predict(X_test)
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(kbid_encoder.classes_)),
    target_names=kbid_encoder.classes_,
))

# Persist the classifier and both label encoders with joblib, in this order.
for _artifact, _target_file in (
    (model, 'kbid_predictor_model.pkl'),
    (cat_item_encoder, 'cat_item_encoder.pkl'),
    (kbid_encoder, 'kbid_encoder.pkl'),
):
    joblib.dump(_artifact, _target_file)

# The Word2Vec model is written with its own save() method.
# NOTE(review): despite the '.bin' suffix this is presumably gensim's native
# format (to be read back with Word2Vec.load), not the word2vec C binary format — confirm.
w2v_model.save('word2vec_model.bin')

# Usage example
def predict_kbid(cat_item, short_description):
    """Predict the KBID for one request from its catalog item and short description.

    Relies on the objects fitted earlier in this notebook: ``cat_item_encoder``,
    ``tokenize_with_hf``, ``w2v_model``, ``X`` (for the training column layout),
    ``model`` and ``kbid_encoder``.

    Args:
        cat_item: Catalog item label; it is passed to ``cat_item_encoder.transform``.
        short_description: Free-text description of the request.

    Returns:
        The decoded ``kbid`` label predicted by ``model``.

    Raises:
        ValueError: presumably raised by ``cat_item_encoder.transform`` when
            ``cat_item`` was not seen during fitting — confirm against scikit-learn.
    """
    # Encode the categorical input
    cat_item_encoded = cat_item_encoder.transform([cat_item])[0]

    # Tokenize exactly as the training data was tokenized, then vectorize.
    tokens = tokenize_with_hf(short_description)
    short_desc_vectorized = vectorize_sentence(
        tokens, w2v_model, vector_size=w2v_model.vector_size
    )

    # Combine features
    input_features = pd.DataFrame([np.asarray(short_desc_vectorized).tolist()])
    input_features['cat_item_encoded'] = cat_item_encoded

    # The training matrix X had its column labels cast to str. Do the same here:
    # with integer labels 0..N-1, reindex() below would match only
    # 'cat_item_encoded' and zero-fill the whole embedding, so the text would be ignored.
    input_features.columns = input_features.columns.astype(str)

    # Align column order with the training data
    input_features = input_features.reindex(columns=X.columns, fill_value=0)

    # Predict and decode the KBID
    kbid_encoded = model.predict(input_features)[0]
    return kbid_encoder.inverse_transform([kbid_encoded])[0]

# Smoke-test the predictor on a single hand-written request and show the result.
example_prediction = predict_kbid(
    cat_item="Asset Management",
    short_description="Dax Studio - Power BI Software Tool Integration",
)
print("Predicted KBID:", example_prediction)
