In [1]:
# Import Libraries
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, hamming_loss, f1_score

from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack

import scipy.sparse as sp

**Data Cleaning**

In [2]:
# Load the dataset
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
file_path = '/Users/key.houck/Desktop/parts-ml-poc/data/at_service_job_AT_SJ_PART_AT_SO_PRODUCT_AT_SA_SERVICE_JOB_PRODUC_202402141028.csv'
data = pd.read_csv(file_path)

# Treat the literal string '[NULL]' as a missing value, then drop every row that
# has a missing value in ANY column (including columns not used as features).
data = data.replace('[NULL]', pd.NA).dropna()

# TODO: Only filter for SAMSUNG ELECTRONICS in BRAND_NAME
# .copy() detaches the filtered rows from the original frame, so the columns that
# later cells add to `data` do not trigger pandas' SettingWithCopyWarning.
data = data[data['BRAND_NAME'] == 'SAMSUNG ELECTRONICS'].copy()

# Feature-only view: drops the job identifier, the prediction target
# (PART_NUMBER_ORDERED) and the two free-text columns that are not used as inputs.
data_features = data.drop(columns=['SERVICE_JOB_ID', 'PART_NUMBER_ORDERED', 'PART_DESCRIPTION', 'SERVICE_EXPLANATION'])
print(data.dtypes)

SERVICE_JOB_ID         object
DESCRIPTION            object
BRAND_NAME             object
MODEL_NUMBER           object
PROBLEM_DESCRIPTION    object
SERVICE_EXPLANATION    object
PART_NUMBER_ORDERED    object
PART_DESCRIPTION       object
dtype: object


**Encode, Vectorize, and Build Multi-Label Targets**

In [3]:
# Clean the PROBLEM_DESCRIPTION text

# Holds the NLTK resources used by clean_text so they are built once rather than
# on every call (clean_text is applied to every row of the dataset).
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, creating both on first use."""
    if not _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['lemmatizer']


def clean_text(text):
    """Normalise a free-text description for vectorisation.

    The text is lower-cased, every character that is not ``a``-``z`` or
    whitespace is removed (digits and punctuation included), English stop
    words are dropped, and each remaining word is lemmatised.

    Args:
        text: The raw description; must be a ``str``.

    Returns:
        The cleaned words joined by single spaces (an empty string when no
        word survives).
    """
    stop_words, lemmatizer = _clean_text_resources()
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    # Single pass: filter stop words first, then lemmatise what is left.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )

# NOTE(review): the encoders and the TF-IDF vectorizer below are fitted on the full
# dataset, before the train/test split performed in the next cell.
data['PROBLEM_DESCRIPTION_CLEANED'] = data['PROBLEM_DESCRIPTION'].apply(clean_text)

# Encode categorical features; each fitted encoder is kept so the same mapping
# can be applied to new inputs at prediction time.
label_encoders = {}
for column in ['DESCRIPTION', 'BRAND_NAME', 'MODEL_NUMBER']:
    le = LabelEncoder()
    data[column + '_ENCODED'] = le.fit_transform(data[column])
    label_encoders[column] = le

# Vectorize the cleaned PROBLEM_DESCRIPTION (vocabulary capped at 1000 terms)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
problem_description_tfidf = tfidf_vectorizer.fit_transform(data['PROBLEM_DESCRIPTION_CLEANED'])

# Combine encoded categorical features and TF-IDF vectorized text into one feature set.
# Column order matters: predict_parts must stack its inputs in this same order.
categorical_features = ['DESCRIPTION_ENCODED', 'BRAND_NAME_ENCODED', 'MODEL_NUMBER_ENCODED']
encoded_categorical = data[categorical_features].values
all_features = hstack([encoded_categorical, problem_description_tfidf])

# Prepare the target for multi-label classification.
# Part numbers are split on commas; each token is stripped and empty tokens are
# dropped, so 'A, B' and 'A,B,' both yield the labels {'A', 'B'} instead of
# creating spurious ' B' / '' classes.
# NOTE(review): assumes each row holds a comma-separated list of parts — TODO confirm;
# if the export has one part per row, rows would need grouping per job first.
mlb = MultiLabelBinarizer()
multi_label_target = mlb.fit_transform(
    data['PART_NUMBER_ORDERED'].apply(
        lambda x: {part.strip() for part in x.split(',') if part.strip()}
    )
)

**Model Training and Evaluation**

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    multi_label_target,
    test_size=0.2,
    random_state=42,
)

# Wrap a 100-tree random forest (fixed seed) in MultiOutputClassifier and fit it
# on the training portion.
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(base_forest)
model.fit(X_train, y_train)

In [31]:
# Score the held-out rows with the trained model.
y_pred = model.predict(X_test)

# Compute every metric first, then report them together.
# Hamming loss: share of individual label slots predicted wrongly (lower is better).
hamming_loss_value = hamming_loss(y_test, y_pred)
# Micro F1: true/false positives and false negatives are pooled over all labels
# before F1 is computed, so frequent labels dominate the score.
f1_score_micro = f1_score(y_test, y_pred, average='micro')
# Samples F1: F1 is computed per row and then averaged over rows.
f1_score_samples = f1_score(y_test, y_pred, average='samples')
# Subset accuracy: a row counts only when its whole label set matches exactly.
subset_accuracy = accuracy_score(y_test, y_pred)

print(f"Hamming Loss: {hamming_loss_value}")
print(f"F1 Score (Micro): {f1_score_micro}")
print(f"F1 Score (Samples): {f1_score_samples}")
print(f"Subset Accuracy (Exact Match): {subset_accuracy}")

Hamming Loss: 0.00019622074488433974
F1 Score (Micro): 0.10494425593795444
F1 Score (Samples): 0.0712171052631579
Subset Accuracy (Exact Match): 0.0712171052631579


In [39]:
def predict_parts(description, brand_name, model_number, problem_description):
    """Predict the part numbers for a single service job.

    Applies the same preprocessing used for training (``clean_text``, the fitted
    ``label_encoders`` and ``tfidf_vectorizer``), runs ``model`` on the resulting
    feature row and maps the prediction back to part numbers with ``mlb``.

    Args:
        description: Value for the DESCRIPTION column (e.g. 'HOME REFRIGERATION').
        brand_name: Value for the BRAND_NAME column.
        model_number: Value for the MODEL_NUMBER column.
        problem_description: Free-text problem description.

    Returns:
        A one-element list holding a tuple of predicted part numbers; the tuple
        is empty when the model predicts no part.

    Raises:
        ValueError: If a categorical value cannot be encoded by its fitted
            label encoder (for example, it was not seen during fitting).
    """
    cleaned_problem_description = clean_text(problem_description)

    # Same column order as the training feature matrix:
    # DESCRIPTION, BRAND_NAME, MODEL_NUMBER, then the TF-IDF columns.
    raw_categoricals = {
        'DESCRIPTION': description,
        'BRAND_NAME': brand_name,
        'MODEL_NUMBER': model_number,
    }
    feature_blocks = []
    for column, value in raw_categoricals.items():
        try:
            encoded = label_encoders[column].transform([value])
        except ValueError as exc:
            # Re-raise with the offending column so the caller knows which input to fix.
            raise ValueError(
                f"Cannot encode {column}={value!r} with the fitted label encoder."
            ) from exc
        feature_blocks.append(encoded.reshape(-1, 1))

    # Vectorize the problem description with the already-fitted TF-IDF vocabulary
    feature_blocks.append(tfidf_vectorizer.transform([cleaned_problem_description]))

    # Combine all features into a single feature row
    features = hstack(feature_blocks)

    # Predict, then convert the indicator row back to part numbers
    prediction = model.predict(features)
    predicted_parts = mlb.inverse_transform(prediction)

    return predicted_parts

# Example: predict parts for one refrigerator service job
predicted_parts = predict_parts(
    description='HOME REFRIGERATION',
    brand_name='SAMSUNG ELECTRONICS',
    model_number='RF28HMEDBSR/AA',
    problem_description='Ice tray is leaking',
)
print(predicted_parts)

{'DESCRIPTION': LabelEncoder(), 'BRAND_NAME': LabelEncoder(), 'MODEL_NUMBER': LabelEncoder()}
[()]
