In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [11]:
# Read the ticket export (UTF-8 encoded CSV) into a DataFrame
file_path = 'sctask_data_dev_v13_utf8.csv'  # Update with your file path
data = pd.read_csv(filepath_or_buffer=file_path, encoding='utf-8')

In [12]:
# Show the loaded DataFrame as the cell output to inspect its columns and rows
data

Unnamed: 0,cat_item,ticket_management,kbid,short_description
0,Asset Management,Others,Others,Dax Studio - Power BI Software Tool Integrati...
1,Service Introduction,Others,Others,DEL-01 - Project Initiation Document
2,Service Introduction,Others,Others,DEL-01 - Project Initiation Document
3,Service Introduction,Others,Others,DEL-01 - Project Initiation Document
4,Service Introduction,Others,Others,DEL-01 - Project Initiation Document
...,...,...,...,...
16298,Cross-Boarding,WFM,cross_boarding_imc,Cross-boarding: Review Access Request in Iron ...
16299,Cross-Boarding,WFM,cross_boarding_imc,Cross-boarding: Review Access Request in Iron ...
16300,Cross-Boarding,WFM,cross_boarding_imc,Cross-boarding: Review Access Request in Iron ...
16301,Cross-Boarding,WFM,cross_boarding_imc,Cross-boarding: Review Access Request in Iron ...


In [13]:

# Encode 'cat_item' as a numerical feature.
# NOTE(review): LabelEncoder assigns arbitrary integer codes and is documented by
# scikit-learn as a target encoder; consider OrdinalEncoder/OneHotEncoder for inputs.
cat_item_encoder = LabelEncoder()
data['cat_item_encoded'] = cat_item_encoder.fit_transform(data['cat_item'])

# Encode the target 'kbid'
kbid_encoder = LabelEncoder()
data['kbid_encoded'] = kbid_encoder.fit_transform(data['kbid'])
y = data['kbid_encoded']

# Choose the train/test rows BEFORE fitting TF-IDF so that the vocabulary and IDF
# weights are learned from training text only (no leakage from the held-out rows).
# NOTE(review): the data preview shows many identical rows; duplicates that land on
# both sides of a random split will inflate the scores below — consider
# de-duplicating or a grouped split before trusting the report.
row_positions = list(range(len(data)))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Vectorize 'short_description' using TF-IDF (vocabulary capped at 500 terms)
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_vectorizer.fit(data['short_description'].iloc[train_rows])
short_desc_tfidf = tfidf_vectorizer.transform(data['short_description'])

# Combine the vectorized text and the encoded category into one feature matrix.
# reset_index(drop=True) keeps both parts aligned positionally.
X = pd.concat(
    [
        pd.DataFrame(short_desc_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out()),
        data[['cat_item_encoded']].reset_index(drop=True)
    ],
    axis=1
)

# Ensure all column names are strings
X.columns = X.columns.astype(str)

# Materialize the split chosen above (positional indexing)
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model.
# Passing `labels` explicitly keeps `target_names` in sync even when a rare class
# is absent from the test split (otherwise classification_report raises ValueError).
y_pred = model.predict(X_test)
all_labels = list(range(len(kbid_encoder.classes_)))
print(
    classification_report(
        y_test,
        y_pred,
        labels=all_labels,
        target_names=kbid_encoder.classes_,
        zero_division=0,
    )
)



                                          precision    recall  f1-score   support

                  Basic_Mainframe_Access       1.00      1.00      1.00        15
                           Clean_Up_RACF       1.00      1.00      1.00        19
                               GuestWifi       1.00      1.00      1.00       109
      Kiteworks_Add_User_Existing_Folder       1.00      1.00      1.00        63
      Kiteworks_Create_New_Secure_Folder       1.00      1.00      1.00        66
          Kiteworks_Remove_Secure_Folder       1.00      1.00      1.00        46
            Off_Boarding_Integrity_Check       1.00      1.00      1.00        15
                                  Others       1.00      1.00      1.00      2654
                 Remove_Mainframe_Access       1.00      1.00      1.00        23
                computer_integrity_check       1.00      1.00      1.00        25
                   corporate_access_d365       1.00      1.00      1.00        52
               

In [14]:
# Inspect the feature-matrix column names (TF-IDF terms followed by 'cat_item_encoded')
X.columns

Index(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
       ...
       'windows', 'wiped', 'with', 'work', 'worknotes', 'workspace', 'write',
       'z30', 'zbook', 'cat_item_encoded'],
      dtype='object', length=501)

In [15]:
# # Optional: uncomment to persist the trained model, encoders and vectorizer for reuse outside this notebook
# joblib.dump(model, 'kbid_predictor_model.pkl')
# joblib.dump(cat_item_encoder, 'cat_item_encoder.pkl')
# joblib.dump(kbid_encoder, 'kbid_encoder.pkl')
# joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

In [16]:
def predict_kbid(cat_item, short_description):
    """Predict the KBID label for a single ticket.

    Builds a one-row feature frame in the layout used to train the TF-IDF model
    (one column per TF-IDF term, followed by ``cat_item_encoded``) and passes it
    to the fitted classifier. Relies on the notebook-level ``cat_item_encoder``,
    ``tfidf_vectorizer``, ``model`` and ``kbid_encoder`` objects.

    Parameters
    ----------
    cat_item : str
        Catalog item name; must be a value ``cat_item_encoder`` was fitted on.
    short_description : str
        Free-text ticket description.

    Returns
    -------
    The ``kbid`` label decoded with ``kbid_encoder.inverse_transform``.

    Raises
    ------
    ValueError
        If ``cat_item`` is not among the categories known to ``cat_item_encoder``.
    """
    # Encode the categorical input; surface unseen categories with a readable message
    try:
        cat_item_encoded = cat_item_encoder.transform([cat_item])[0]
    except ValueError as exc:
        raise ValueError(
            f"Unknown cat_item {cat_item!r}: it was not present when cat_item_encoder was fitted"
        ) from exc

    # Vectorize the text input
    short_desc_vectorized = tfidf_vectorizer.transform([short_description]).toarray()

    # Column layout is derived from the fitted vectorizer itself rather than from the
    # global X: a later cell rebinds X to a different feature set, and aligning against
    # it would silently zero out every TF-IDF feature.
    short_desc_df = pd.DataFrame(short_desc_vectorized, columns=tfidf_vectorizer.get_feature_names_out())
    cat_item_df = pd.DataFrame([[cat_item_encoded]], columns=['cat_item_encoded'])
    input_features = pd.concat([short_desc_df, cat_item_df], axis=1)
    input_features.columns = input_features.columns.astype(str)

    # Predict the encoded KBID and decode it
    kbid_encoded = model.predict(input_features)[0]
    return kbid_encoder.inverse_transform([kbid_encoded])[0]


In [17]:
# Try the helper on one sample ticket
example_prediction = predict_kbid(
    "Cross-Boarding",
    "Cross-boarding: Review Access Request in Iron Mountain Connect (IMC)",
)
print("Predicted KBID:", example_prediction)

Predicted KBID: cross_boarding_imc


Why did you use LabelEncoder for 'kbid' rather than a vectorization method like TF-IDF?
ChatGPT said:
ChatGPT
Great question! The choice of encoding method depends on the type of data and the specific requirements for modeling:

Why Use LabelEncoder for kbid?
kbid is the target (output) variable, and in this case, it is categorical data representing discrete classes (e.g., knowledge base IDs).

Advantages of LabelEncoder for Target:
Categorical Nature: Since kbid represents categories, LabelEncoder maps each unique category to a numerical value, which is ideal for classification models.
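Model Compatibility: Encoding the target as integers gives every class a stable numeric ID. (scikit-learn classifiers such as Random Forest or Logistic Regression also accept string class labels directly, so this step is a convenience rather than a strict requirement.)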
Interpretability: After prediction, the numerical label can easily be mapped back to the original category using LabelEncoder.inverse_transform().
Why Not Use TF-IDF for kbid?
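TF-IDF is suitable for feature extraction from free text: it turns each document into a vector of term weights based on how often a term occurs in that document and how rare it is across the corpus (word order and relationships between words are not captured). For kbid, which is categorical and not textual, TF-IDF would not provide a meaningful representation: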

No Contextual Information: kbid does not carry text-based context that requires vectorization.
Simplicity: LabelEncoder is simpler and computationally efficient for categorical outputs.
When to Use TF-IDF?
We use TF-IDF for short_description because it is a free-text feature whose individual terms can influence the model's decision. TF-IDF converts the text into a numerical matrix suitable for machine learning.

Summary of Encoding Choices:

| Feature/Variable | Nature | Encoding Used | Reason |
| --- | --- | --- | --- |
| short_description | Textual Feature | TF-IDF | Weight terms by their importance. |
| cat_item | Categorical Feature | LabelEncoder | Compact integer representation of categories (note: the codes imply an arbitrary order). |
| kbid (target variable) | Categorical Target | LabelEncoder | Categorical labels for classification. |

Let me know if you'd like further clarification or adjustments!


In [None]:
## Use Word2Vec sentence embeddings instead of TF-IDF

In [None]:
pip install gensim

In [None]:
from gensim.models import Word2Vec
import numpy as np

# Tokenize the short descriptions on whitespace.
# Missing descriptions are treated as empty text so that every row yields a list of
# tokens (otherwise .str.split() leaves NaN, which Word2Vec cannot iterate).
data['short_description_tokenized'] = data['short_description'].fillna('').str.split()

# Train a skip-gram (sg=1) Word2Vec model on the tokenized descriptions.
# NOTE(review): with workers > 1 gensim training is not bit-for-bit reproducible;
# use workers=1 if identical embeddings are needed across runs.
# NOTE(review): the embeddings are learned from every row, including rows that are
# later held out for testing.
w2v_model = Word2Vec(
    sentences=data['short_description_tokenized'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)

# Generate sentence embeddings by averaging word vectors
def vectorize_sentence(tokens, model, vector_size):
    """Return the average word vector for a tokenized sentence.

    Parameters
    ----------
    tokens : iterable of str, None or NaN
        Tokens of one sentence. ``None`` / float NaN (a missing description)
        is treated as a sentence with no tokens.
    model : object exposing ``.wv``
        ``model.wv`` must support ``word in model.wv`` and ``model.wv[word]``.
    vector_size : int
        Length of the zero vector returned when no token has a vector.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``,
        or ``np.zeros(vector_size)`` if none are found.
    """
    # A missing description reaches this function as None or float NaN, not as a list
    if tokens is None or (isinstance(tokens, float) and np.isnan(tokens)):
        return np.zeros(vector_size)

    keyed_vectors = model.wv
    # Tokens without a vector are skipped
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(vector_size)

# Embed every tokenized description as the mean of its word vectors.
# The zero-vector fallback size is read from the trained model instead of repeating
# the literal used at training time.
embedding_size = w2v_model.wv.vector_size
data['short_description_vector'] = data['short_description_tokenized'].apply(
    lambda tokens: vectorize_sentence(tokens, w2v_model, vector_size=embedding_size)
)

# Create the feature matrix: one column per embedding dimension plus the encoded category.
# NOTE: this cell rebinds X, y, the train/test splits and `model`; predict_kbid (defined
# earlier for the TF-IDF features) must not be used with the model trained here.
X = pd.DataFrame(data['short_description_vector'].tolist())  # Expand vectors into separate columns
X['cat_item_encoded'] = data['cat_item_encoded'].to_numpy()  # positional, independent of data's index

# Embedding columns are named 0..N-1 (ints) while 'cat_item_encoded' is a str;
# recent scikit-learn versions reject mixed-type column names, so make them all strings.
X.columns = X.columns.astype(str)

# Encode the target 'kbid'
data['kbid_encoded'] = kbid_encoder.fit_transform(data['kbid'])
y = data['kbid_encoded']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model.
# Passing `labels` explicitly keeps `target_names` in sync even when a rare class
# is absent from the test split (otherwise classification_report raises ValueError).
y_pred = model.predict(X_test)
all_labels = list(range(len(kbid_encoder.classes_)))
print(
    classification_report(
        y_test,
        y_pred,
        labels=all_labels,
        target_names=kbid_encoder.classes_,
        zero_division=0,
    )
)
