**Initial Setup**

In [None]:
# Import Libraries
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack

**Data Preprocessing**

In [None]:
# Load the dataset, discard every row that contains a null in any column,
# and renumber the surviving rows from 0 so positional and label indexing agree.
# NOTE(review): this is an absolute, machine-specific path — consider reading it
# from a configurable data directory so the notebook runs on other machines.
file_path = '/Users/key.houck/Desktop/parts-ml-poc/data/at_service_job_AT_SJ_PART_AT_SO_PRODUCT_AT_SA_SERVICE_JOB_PRODUC_202402141028.csv'
data = pd.read_csv(file_path).dropna().reset_index(drop=True)

**Handling High Cardinality Categorical Features**

In [None]:
# Frequency encoding for 'BRAND_NAME'
brand_freq = data['BRAND_NAME'].value_counts().to_dict()
data['brand_freq'] = data['BRAND_NAME'].map(brand_freq)

# Frequency encoding for 'model_number'
model_freq = data['MODEL_NUMBER'].value_counts().to_dict()
data['model_freq'] = data['MODEL_NUMBER'].map(model_freq)

**Prepare Target Variable for Multi-Label Classification, Text Data Vectorization**

In [None]:
# Lazily-built NLTK resources shared by every clean_text() call, so the
# stop-word corpus is read and the lemmatizer constructed only once.
_CLEAN_TEXT_RESOURCES = {}


def clean_text(text, stop_words=None, lemmatizer=None):
    """Normalise one free-text value for TF-IDF vectorisation.

    Steps: lower-case, delete every character that is not a-z or whitespace,
    drop stop words, lemmatize the remaining words, and re-join them with
    single spaces.

    Parameters
    ----------
    text : str or None
        The raw text. ``None`` and float NaN are treated as an empty document.
        Any other non-string value is converted with ``str()`` first.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded on first use and cached.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a cached ``WordNetLemmatizer`` instance.

    Returns
    -------
    str
        The cleaned, space-separated text (possibly empty).
    """
    # Missing values would otherwise raise AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Build the default resources once instead of on every call.
    if stop_words is None:
        if 'stop_words' not in _CLEAN_TEXT_RESOURCES:
            _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _CLEAN_TEXT_RESOURCES['stop_words']
    if lemmatizer is None:
        if 'lemmatizer' not in _CLEAN_TEXT_RESOURCES:
            _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        lemmatizer = _CLEAN_TEXT_RESOURCES['lemmatizer']

    text = re.sub(r'[^a-z\s]', '', str(text).lower())

    # Words contain no whitespace, so filtering and lemmatizing in one pass
    # gives the same result as the two separate join/split passes.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Run clean_text over every value of each free-text column and write the
# cleaned column back over the raw one.
text_columns = ['DESCRIPTION', 'PROBLEM_DESCRIPTION', 'SERVICE_EXPLANATION']
cleaned_columns = {name: data[name].apply(clean_text) for name in text_columns}
for name, cleaned in cleaned_columns.items():
    data[name] = cleaned

# Group and prepare the multi-label target variable: one row per service job
# holding the distinct part numbers ordered on that job. groupby sorts its
# keys, so grouped_parts is ordered by SERVICE_JOB_ID.
grouped_parts = (
    data.groupby('SERVICE_JOB_ID')['PART_NUMBER_ORDERED']
    .apply(lambda x: list(set(x)))
    .reset_index()
)
mlb = MultiLabelBinarizer()
multi_hot_labels = mlb.fit_transform(grouped_parts['PART_NUMBER_ORDERED'])

# Collapse `data` to ONE row per job, in the same order as grouped_parts, so
# that row i of the features lines up with row i of multi_hot_labels.
# Selecting with .loc on an index that still contains duplicate job ids
# returns every matching row, which leaves one row per part line and breaks
# the feature/label alignment. De-duplicating first avoids that.
# NOTE(review): keeping the first row per job assumes the text fields are the
# same on every row of a job — confirm against the data.
# NOTE: `data` is one-row-per-job from here on; re-run the notebook from the
# loading cell before re-running this cell, or part labels will be lost.
data = (
    data.drop_duplicates(subset='SERVICE_JOB_ID', keep='first')
    .set_index('SERVICE_JOB_ID')
    .loc[grouped_parts['SERVICE_JOB_ID']]
    .reset_index()
)
assert len(data) == len(grouped_parts), "expected exactly one feature row per SERVICE_JOB_ID"

# Vectorize the cleaned text columns. Each column gets its own TF-IDF
# vectorizer (vocabulary capped at 1000 terms) so the three vocabularies
# stay independent.
# NOTE(review): the vectorizers are fit on every row, before the train/test
# split performed later in the notebook, so test-set vocabulary/IDF statistics
# leak into the features — consider fitting on the training rows only.
tfidf_vectorizer_descr, tfidf_vectorizer_prob, tfidf_vectorizer_serv = (
    TfidfVectorizer(max_features=1000) for _ in range(3)
)

description_tfidf = tfidf_vectorizer_descr.fit_transform(data['DESCRIPTION'])
problem_description_tfidf = tfidf_vectorizer_prob.fit_transform(data['PROBLEM_DESCRIPTION'])
service_explanation_tfidf = tfidf_vectorizer_serv.fit_transform(data['SERVICE_EXPLANATION'])

# Concatenate the three sparse TF-IDF matrices column-wise into one feature matrix
tfidf_blocks = [description_tfidf, problem_description_tfidf, service_explanation_tfidf]
features = hstack(tfidf_blocks)

# Sanity check: there must be exactly one feature row per label row
assert multi_hot_labels.shape[0] == features.shape[0]

****

**Model Training**

In [None]:
# One random forest per label column, wrapped in MultiOutputClassifier;
# the fixed seed keeps the forests reproducible.
base_forest = RandomForestClassifier(random_state=42)
multi_label_model = MultiOutputClassifier(base_forest)

# Hold out 20% of the rows for evaluation. `features` is the sparse TF-IDF
# matrix built earlier and `multi_hot_labels` the multi-hot target array.
train_test_parts = train_test_split(
    features,
    multi_hot_labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_parts

# Fit on the training portion only
multi_label_model.fit(X_train, y_train)

**Model Evaluation**

**Making Predictions**