***Initial Setup***

In [18]:
# Import Libraries
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Ensure plots are displayed inline within notebook
%matplotlib inline

In [4]:
# Load the dataset (change to abs path if needed)
file_path =  '/Users/key.houck/Desktop/parts-ml-poc/data/at_service_job_AT_SJ_PART_AT_SO_PRODUCT_AT_SA_SERVICE_JOB_PRODUC_202402141028.csv'
data = pd.read_csv(file_path)
print(data.head())

  SERVICE_JOB_ID                  DESCRIPTION           BRAND_NAME  \
0     SJ59932746           HOME REFRIGERATION  SAMSUNG ELECTRONICS   
1     SJ60818568                       LAPTOP                 DELL   
2     SJ60688618  REFRIGERATION – RESIDENTIAL           KITCHENAID   
3     SJ65613592            VIDEO GAME PLAYER                 SONY   
4     SJ59426407            HOME COOKING ELEC     GENERAL ELECTRIC   

     MODEL_NUMBER                                PROBLEM_DESCRIPTION  \
0  RFG297ABWP/XAA                                             [NULL]   
1            P90F  Problem Description: Problem Description: Prob...   
2    KRMF706ESS01  Rework: ice maker not working, rework approved...   
3          PS4PRO  Problem Description: not working after the pow...   
4    JGB720SEJ5SS  Rework: damage to oven door handle on prior visit   

                                 SERVICE_EXPLANATION PART_NUMBER_ORDERED  \
0  Customer call in for weak cooling after troubl...         DA32-1010

In [5]:
# Basic data exploration
print("Dataset shape:", data.shape)
print("Columns in the dataset:", data.columns.tolist())
print("Basic statistical details:")
print(data.describe(include='all'))

Dataset shape: (30632, 8)
Columns in the dataset: ['SERVICE_JOB_ID', 'DESCRIPTION', 'BRAND_NAME', 'MODEL_NUMBER', 'PROBLEM_DESCRIPTION', 'SERVICE_EXPLANATION', 'PART_NUMBER_ORDERED', 'PART_DESCRIPTION']
Basic statistical details:
       SERVICE_JOB_ID        DESCRIPTION           BRAND_NAME MODEL_NUMBER  \
count           30632              30632                30632        30508   
unique          15701                 66                  119         4897   
top        SJ59582048  HOME LAUNDRY ELEC  SAMSUNG ELECTRONICS       SWITCH   
freq               20               6657                15147          988   

       PROBLEM_DESCRIPTION                                SERVICE_EXPLANATION  \
count                30631                                              30631   
unique               15140                                              12103   
top                 #NAME?  REPLACED SCREEN ASSEMBLY - CLEANED UNIT - PASS...   
freq                    92                             

In [6]:
# Missing Values 
print("Missing values in each column:")
print(data.isnull().sum())

Missing values in each column:
SERVICE_JOB_ID           0
DESCRIPTION              0
BRAND_NAME               0
MODEL_NUMBER           124
PROBLEM_DESCRIPTION      1
SERVICE_EXPLANATION      1
PART_NUMBER_ORDERED      1
PART_DESCRIPTION        10
dtype: int64


***Cleaning and Preprocessing Data***

In [7]:
# Remove data with null or na and reset index
cleaned_data = data.dropna()
cleaned_data.reset_index(drop=True, inplace=True)
print("\nMissing values after removal:")
print(cleaned_data.isnull().sum())


Missing values after removal:
SERVICE_JOB_ID         0
DESCRIPTION            0
BRAND_NAME             0
MODEL_NUMBER           0
PROBLEM_DESCRIPTION    0
SERVICE_EXPLANATION    0
PART_NUMBER_ORDERED    0
PART_DESCRIPTION       0
dtype: int64


In [8]:
# Clean text for DESCRIPTION, PROBLEM_DESCRIPTION, SERVICE_DESCRIPTION
# Convert text to lowercase and remove punctuation and numbers
# Remove stop words and lemmatize
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

# Clean text columns
text_columns = ['DESCRIPTION', 'PROBLEM_DESCRIPTION', 'SERVICE_EXPLANATION']
for col in text_columns:
    cleaned_data[col] = cleaned_data[col].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data[col] = cleaned_data[col].apply(clean_text)


***Feature Extraction & Encoding Categorical Values***

In [9]:
# Convert text data to numerical format
tfid_vectorizer = TfidfVectorizer(max_features=1000)
description_tfidf = tfid_vectorizer.fit_transform(cleaned_data['DESCRIPTION'])
problem_description_tfidf = tfid_vectorizer.fit_transform(cleaned_data['PROBLEM_DESCRIPTION'])
service_explanation_tfidf = tfid_vectorizer.fit_transform(cleaned_data['SERVICE_EXPLANATION'])

print("Shape of 'DESCRIPTION' TF-IDF Matrix:", description_tfidf.shape)
print("Shape of 'PROBLEM_DESCRIPTION' TF-IDF Matrix:", problem_description_tfidf.shape)
print("Shape of 'SERVICE_EXPLANATION' TF-IDF Matrix:", service_explanation_tfidf.shape)

Shape of 'DESCRIPTION' TF-IDF Matrix: (30499, 76)
Shape of 'PROBLEM_DESCRIPTION' TF-IDF Matrix: (30499, 1000)
Shape of 'SERVICE_EXPLANATION' TF-IDF Matrix: (30499, 1000)


In [10]:
# Frequency encoding for BRAND_NAME and MODEL_NUMBER
brand_name_freq = cleaned_data['BRAND_NAME'].value_counts().to_dict()
cleaned_data['BRAND_NAME_FREQ'] = cleaned_data['BRAND_NAME'].map(brand_name_freq)

model_number_freq = cleaned_data['MODEL_NUMBER'].value_counts().to_dict()
cleaned_data['MODEL_NUMBER_FREQ'] = cleaned_data['MODEL_NUMBER'].map(model_number_freq)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['BRAND_NAME_FREQ'] = cleaned_data['BRAND_NAME'].map(brand_name_freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['MODEL_NUMBER_FREQ'] = cleaned_data['MODEL_NUMBER'].map(model_number_freq)


In [11]:
# TF-IDF vectors in matrix and cat in dataframe
# Convert the TF-IDF sparse matrices to DataFrames
description_tfidf_df = pd.DataFrame(description_tfidf.toarray(), columns=[f'descr_tfidf_{i}' for i in range(description_tfidf.shape[1])])
problem_description_tfidf_df = pd.DataFrame(problem_description_tfidf.toarray(), columns=[f'prob_descr_tfidf_{i}' for i in range(problem_description_tfidf.shape[1])])
service_explanation_tfidf_df = pd.DataFrame(service_explanation_tfidf.toarray(), columns=[f'serv_expl_tfidf_{i}' for i in range(service_explanation_tfidf.shape[1])])

# Ensure the index matches after dropping NA rows
description_tfidf_df.index = cleaned_data.index
problem_description_tfidf_df.index = cleaned_data.index
service_explanation_tfidf_df.index = cleaned_data.index

# Combine the TF-IDF features with the frequency-encoded features
features_df = pd.concat([cleaned_data[['BRAND_NAME_FREQ', 'MODEL_NUMBER_FREQ']], description_tfidf_df, problem_description_tfidf_df, service_explanation_tfidf_df], axis=1)

# Define the target variable
target = cleaned_data['PART_NUMBER_ORDERED']

***Split and Training***

In [12]:
# Split and Train Data
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.2, random_state=42)


In [13]:
# Ensemble technique
# rf_model = RandomForestClassifier(random_state=42)

# Train the model
# rf_model.fit(X_train, y_train)

In [14]:
# Predict and evaluate the model
# y_pred = rf_model.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))

***Multi-Label Classification Attempt***

In [17]:
# Group by 'service_job_id' and aggregate the unique 'part_number_ordered' into lists
grouped_data = cleaned_data.groupby('SERVICE_JOB_ID').agg({
    'PART_NUMBER_ORDERED': lambda x: list(x.unique())
}).reset_index()

***Prepare the Features & Encode Targets***

In [21]:
# Join the features with the grouped data to ensure each 'SERVICE_JOB_ID' matches its features
prepared_features = grouped_data.join(features_df.set_index('SERVICE_JOB_ID'), on='SERVICE_JOB_ID')

<bound method NDFrame.head of        BRAND_NAME_FREQ  MODEL_NUMBER_FREQ  descr_tfidf_0  descr_tfidf_1  \
0                15119                  1            0.0            0.0   
1                  472                 27            0.0            0.0   
2                  273                 29            0.0            0.0   
3                  687                 20            0.0            0.0   
4                 2539                  1            0.0            0.0   
...                ...                ...            ...            ...   
30494             1541                  4            0.0            0.0   
30495            15119                146            0.0            0.0   
30496            15119                146            0.0            0.0   
30497            15119                146            0.0            0.0   
30498            15119                146            0.0            0.0   

       descr_tfidf_2  descr_tfidf_3  descr_tfidf_4  descr_tfidf_5  \
