In [2]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import time
from scipy.sparse import hstack
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
warnings.filterwarnings('ignore')
import joblib

In [3]:
# Load the labelled data and shuffle it with a fixed seed.
data = pd.read_csv('../data/train_set.csv').sample(frac=1, random_state=1)

# Reserve the first row seen for every Product_Category so that each category
# is guaranteed to end up on the training side of the split.
train1 = data.drop_duplicates('Product_Category', keep='first')
remaining = data.loc[~data.index.isin(train1.index)]

# 70/30 split of the non-reserved rows; the reserved rows are prepended to train.
train, test = train_test_split(remaining, test_size=0.30, random_state=1)
train = pd.concat([train1, train]).reset_index(drop=True)
test = test.reset_index(drop=True)

##### Baseline Solution (without using Item_Description)

In [4]:
# Encoder that maps Product_Category strings to integer class ids.
le = LabelEncoder()

# Baseline features: the last 4 / last 7 characters of the code strings parsed as integers.
train['Vendor_Code_numeric'] = train['Vendor_Code'].str.slice(-4).astype(int)
train['GL_Code_numeric'] = train['GL_Code'].str.slice(-7).astype(int)
train['Product_Category_numeric'] = le.fit_transform(train['Product_Category'])

In [5]:
# Baseline: 3-fold stratified CV of XGBoost on the numeric code features and the invoice amount.
feature_cols = ['Vendor_Code_numeric', 'GL_Code_numeric', 'Inv_Amt']

baseline_model = XGBClassifier()
kf = StratifiedKFold(n_splits=3)
time_check = []
accuracy = []

# One row per category stays out of the folds and is appended to every
# training fold, so each fit sees all categories.
train1 = train.drop_duplicates('Product_Category', keep='first')
train2 = train[~train.index.isin(train1.index)].reset_index()

for train_index, test_index in kf.split(train2, train2.Product_Category_numeric):
    fold_train = pd.concat([train2[train2.index.isin(train_index)], train1])
    fold_test = train2[train2.index.isin(test_index)]

    baseline_model.fit(fold_train[feature_cols], fold_train['Product_Category_numeric'])

    # Timed section: prediction and scoring of the held-out fold.
    started = time.time()
    fold_accuracy = accuracy_score(
        fold_test['Product_Category_numeric'],
        baseline_model.predict(fold_test[feature_cols]),
    )
    time_check.append(time.time() - started)
    accuracy.append(fold_accuracy)

print('Base line accuracy score is {:.2f}%'.format(np.mean(accuracy)*100))
print('Time taken for inference {:.2f}s'.format(np.mean(time_check)))

Base line accuracy score is 88.08%
Time taken for inference 0.02s


##### Model 1 (Item_Description only, TF-IDF vectorizer)

In [6]:
def preprocess_txt(inp):
    """Clean a pandas Series of free-text descriptions before vectorisation.

    Steps, in order:
      1. cast every value to ``str``;
      2. delete every token that contains a digit (this also removes plain
         numbers and the numeric parts of dates);
      3. replace every character that is neither a word character nor
         whitespace with a space;
      4. lower-case;
      5. delete every run of two or more identical ASCII letters
         (e.g. ``"coffee"`` becomes ``"co"``);
      6. collapse runs of spaces and strip leading/trailing whitespace.

    Parameters
    ----------
    inp : pandas.Series
        Raw text values.

    Returns
    -------
    pandas.Series
        Cleaned strings, same index as ``inp``.
    """
    inp = inp.astype(str)
    # Drops whole tokens containing a digit. No digit survives this pass, so
    # separate "remove numbers" / "remove dates" passes would never match.
    inp = inp.replace(r'\w*\d\w*', '', regex=True)
    inp = inp.replace(r'[^\w\s]', ' ', regex=True)  # Special characters -> space
    inp = inp.str.lower()
    # Removes the whole run of repeated letters (not just the extra copies).
    inp = inp.replace(r'([a-z])\1+', '', regex=True)
    inp = inp.replace(r' +', ' ', regex=True)
    # Strip last: the passes above can leave whitespace at either end.
    inp = inp.str.strip()

    return inp

In [7]:
# Cleaned copy of the free-text description, used as the text input of the models below.
train['Item_Description_Preprocessed'] = preprocess_txt(train['Item_Description'])

In [8]:
# Model 1: XGBoost on TF-IDF features of the cleaned item description only.
model1 = XGBClassifier()
time_check=[]
# Single vectorizer instance; its vocabulary is re-fitted on every training fold below.
tfidf = TfidfVectorizer(stop_words = 'english')

kf = StratifiedKFold(n_splits=3)
# One row per category stays out of the folds and is appended to every
# training fold, so each fit sees all categories.
train1 = train.drop_duplicates('Product_Category', keep='first')
train2 = train[~train.index.isin(train1.index)].reset_index()
accuracy = []

for i, (train_index, test_index) in enumerate(kf.split(train2, train2.Product_Category_numeric)):
    inp = train2[train2.index.isin(train_index)]
    inp = pd.concat([inp, train1])

    trainx = tfidf.fit_transform(inp['Item_Description_Preprocessed'])
    trainy = inp.Product_Category_numeric

    model1.fit(trainx, trainy)

    # Timed section: vectorising, predicting and scoring the held-out fold.
    t = time.time()
    testx = tfidf.transform(train2['Item_Description_Preprocessed'][train2.index.isin(test_index)])
    testy = train2[train2.index.isin(test_index)].Product_Category_numeric
    acc = accuracy_score(testy, model1.predict(testx))
    time_check.append(time.time() - t)
    accuracy.append(acc)

print('Model accuracy score is {:.2f}%'.format(np.mean(accuracy)*100))
print('Time taken for inference {:.2f}s'.format(np.mean(time_check)))

Model accuracy score is 99.59%
Time taken for inference 0.03s


##### Model 2 (Vendor_Code, GL_Code and Item_Description combined, TF-IDF vectorizer)

In [9]:
# One text field per row: vendor code, GL code and cleaned description joined by spaces.
train['New_description_column'] = (
    train['Vendor_Code'] + ' ' + train['GL_Code'] + ' ' + train['Item_Description_Preprocessed']
)

In [10]:
# Model 2: XGBoost on TF-IDF features of the combined code + description text.
model2 = XGBClassifier()
tfidf = TfidfVectorizer(stop_words='english')
kf = StratifiedKFold(n_splits=3)
time_check = []
accuracy = []

# One row per category stays out of the folds and is appended to every
# training fold, so each fit sees all categories.
train1 = train.drop_duplicates('Product_Category', keep='first')
train2 = train[~train.index.isin(train1.index)].reset_index()

for train_index, test_index in kf.split(train2, train2.Product_Category_numeric):
    fold_train = pd.concat([train2[train2.index.isin(train_index)], train1])

    model2.fit(
        tfidf.fit_transform(fold_train['New_description_column']),
        fold_train['Product_Category_numeric'],
    )

    # Timed section: slicing, vectorising, predicting and scoring the held-out fold.
    started = time.time()
    fold_test = train2[train2.index.isin(test_index)]
    fold_features = tfidf.transform(fold_test['New_description_column'])
    fold_accuracy = accuracy_score(fold_test['Product_Category_numeric'], model2.predict(fold_features))
    time_check.append(time.time() - started)
    accuracy.append(fold_accuracy)

print('Model accuracy score is {:.2f}%'.format(np.mean(accuracy)*100))
print('Time taken for inference {:.2f}s'.format(np.mean(time_check)))

Model accuracy score is 99.54%
Time taken for inference 0.03s


##### Model 3 (Using count vectorizer)

In [11]:
# Rebuild the combined text field (vendor code, GL code, cleaned description) for the count-vectorizer model.
train['New_description_column'] = (
    train['Vendor_Code'] + ' ' + train['GL_Code'] + ' ' + train['Item_Description_Preprocessed']
)

In [12]:
# Model 3: XGBoost on raw token counts of the combined code + description text.
model3 = XGBClassifier()
cvr = CountVectorizer(stop_words='english')
kf = StratifiedKFold(n_splits=3)
time_check = []
accuracy = []

# One row per category stays out of the folds and is appended to every
# training fold, so each fit sees all categories.
train1 = train.drop_duplicates('Product_Category', keep='first')
train2 = train[~train.index.isin(train1.index)].reset_index()

for train_index, test_index in kf.split(train2, train2.Product_Category_numeric):
    fold_train = pd.concat([train2[train2.index.isin(train_index)], train1])

    model3.fit(
        cvr.fit_transform(fold_train['New_description_column']),
        fold_train['Product_Category_numeric'],
    )

    # Timed section: slicing, vectorising, predicting and scoring the held-out fold.
    started = time.time()
    fold_test = train2[train2.index.isin(test_index)]
    fold_features = cvr.transform(fold_test['New_description_column'])
    fold_accuracy = accuracy_score(fold_test['Product_Category_numeric'], model3.predict(fold_features))
    time_check.append(time.time() - started)
    accuracy.append(fold_accuracy)

print('Model accuracy score is {:.2f}%'.format(np.mean(accuracy)*100))
print('Time taken for inference {:.2f}s'.format(np.mean(time_check)))

Model accuracy score is 99.95%
Time taken for inference 0.04s


##### Model 4 (Count vectorizer on Vendor_Code + GL_Code, TF-IDF on Item_Description)

In [13]:
# For Model 4 the column holds only the two codes. NOTE: this overwrites the
# column of the same name that Models 2 and 3 were trained on.
train['New_description_column'] = train['Vendor_Code'] + ' ' + train['GL_Code']

In [14]:
# Model 4: token counts of the two codes stacked with TF-IDF of the cleaned description.
# Bound to its own name so the Model 3 estimator is not silently replaced.
model4 = XGBClassifier()
time_check=[]
cvr = CountVectorizer(stop_words = 'english')
tfidf = TfidfVectorizer(stop_words = 'english')

kf = StratifiedKFold(n_splits=3)

# One row per category stays out of the folds and is appended to every
# training fold, so each fit sees all categories.
train1 = train.drop_duplicates('Product_Category', keep='first')
train2 = train[~train.index.isin(train1.index)].reset_index()
accuracy = []

for i, (train_index, test_index) in enumerate(kf.split(train2, train2.Product_Category_numeric)):
    inp = train2[train2.index.isin(train_index)]
    inp = pd.concat([inp, train1])

    # Sparse feature matrix: [code counts | description TF-IDF].
    trainx = hstack([cvr.fit_transform(inp['New_description_column']), tfidf.fit_transform(inp['Item_Description_Preprocessed'])])
    trainy = inp.Product_Category_numeric

    model4.fit(trainx, trainy)

    # Timed section: vectorising, predicting and scoring the held-out fold.
    t = time.time()
    testx = hstack([cvr.transform(train2['New_description_column'][train2.index.isin(test_index)]), tfidf.transform(train2['Item_Description_Preprocessed'][train2.index.isin(test_index)])])
    testy = train2[train2.index.isin(test_index)].Product_Category_numeric
    acc = accuracy_score(testy, model4.predict(testx))
    time_check.append(time.time() - t)
    accuracy.append(acc)

print('Model accuracy score is {:.2f}%'.format(np.mean(accuracy)*100))
print('Time taken for inference {:.2f}s'.format(np.mean(time_check)))

Model accuracy score is 99.59%
Time taken for inference 0.04s


The best model so far is Model 3 (99.95% cross-validated accuracy).

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [16]:
class text_preprocessor(BaseEstimator, TransformerMixin):
  """Stateless scikit-learn transformer that cleans a pandas Series of text.

  Nothing is learned from the data: ``fit`` is a no-op that returns the
  transformer, and ``transform`` / ``fit_transform`` return the cleaned text.
  The cleaning logic lives on the class (``preprocess_txt``), so the
  transformer does not rely on a notebook-level function of the same name.
  """

  def __init__(self):
    return None

  def preprocess_txt(self, inp):
    """Return ``inp`` (a pandas Series) cast to str and cleaned.

    Tokens containing a digit are deleted, non-word characters become
    spaces, text is lower-cased, runs of two or more identical ASCII
    letters are deleted, and whitespace is collapsed and stripped.
    """
    inp = inp.astype(str)
    # Drops whole tokens containing a digit. No digit survives this pass, so
    # separate "remove numbers" / "remove dates" passes would never match.
    inp = inp.replace(r'\w*\d\w*', '', regex=True)
    inp = inp.replace(r'[^\w\s]', ' ', regex=True)  # Special characters -> space
    inp = inp.str.lower()
    # Removes the whole run of repeated letters (not just the extra copies).
    inp = inp.replace(r'([a-z])\1+', '', regex=True)
    inp = inp.replace(r' +', ' ', regex=True)
    # Strip last: the passes above can leave whitespace at either end.
    inp = inp.str.strip()
    return inp

  def fit(self, x, y=None):
    """No-op fit; returns ``self`` as the scikit-learn estimator contract requires."""
    return self

  def transform(self, x):
    """Return the cleaned version of ``x``."""
    return self.preprocess_txt(x)

  def fit_transform(self, x, y=0):
    """Return the cleaned version of ``x``; ``y`` is accepted and ignored."""
    return self.preprocess_txt(x)

In [17]:
class target_preprocessor(BaseEstimator, TransformerMixin):
  """Stateless transformer that applies ``preprocess_txt`` to the values it is given.

  Nothing is learned from the data, so ``fit`` only returns the transformer
  itself, as the scikit-learn estimator contract requires.
  """

  def __init__(self):
    return None

  def fit(self, x, y):
    """No-op fit; returns ``self``."""
    return self

  def transform(self, x):
    """Return ``preprocess_txt(x)``."""
    return preprocess_txt(x)

  def fit_transform(self, x, y=None):
    """Return ``preprocess_txt(x)``; the optional ``y`` is accepted and ignored."""
    return preprocess_txt(x)
  
# Text branch: clean the raw description, then turn it into token counts.
custom_transformer = Pipeline(steps=[
    ('text_process', text_preprocessor()),
    ('count_vectorize', CountVectorizer(stop_words = 'english'))
    ])

# Each column is selected with a plain string (not a list) so that the text
# vectorizers receive a 1-D column of documents.
preprocessing = ColumnTransformer(
    [
        ("count_vectorize_Vendor_Code", CountVectorizer(stop_words = 'english'), 'Vendor_Code'),
        ("count_vectorize_GL_Code", CountVectorizer(stop_words = 'english'), 'GL_Code'),
        ("text_preprocess", custom_transformer, 'Item_Description'),
    ],
    verbose_feature_names_out=False,
)
xgb = Pipeline(
    [
        ("preprocess", preprocessing),
        ("classifier", XGBClassifier(random_state=42)),
    ]
)

kf = StratifiedKFold(n_splits=3)

# One row per category stays out of the folds and is appended to every
# training fold, so each fit sees all categories.
train1 = train.drop_duplicates('Product_Category', keep='first')
train2 = train[~train.index.isin(train1.index)].reset_index()
accuracy = []
# Start from an empty list so the reported time covers only this pipeline's
# folds and not timings left over from an earlier cell.
time_check = []

for i, (train_index, test_index) in enumerate(kf.split(train2, train2.Product_Category_numeric)):
    inp = train2[train2.index.isin(train_index)]
    inp = pd.concat([inp, train1])

    xgb.fit(inp[['Item_Description', 'Vendor_Code', 'GL_Code']], inp.Product_Category_numeric)

    # Timed section: slicing, preprocessing, predicting and scoring the held-out fold.
    t = time.time()
    testx = train2[train2.index.isin(test_index)][['Item_Description', 'Vendor_Code', 'GL_Code']]
    testy = train2[train2.index.isin(test_index)].Product_Category_numeric
    acc = accuracy_score(testy, xgb.predict(testx))
    time_check.append(time.time() - t)
    accuracy.append(acc)

print('Model accuracy score is {:.2f}%'.format(np.mean(accuracy)*100))
print('Time taken for inference {:.2f}s'.format(np.mean(time_check)))

Model accuracy score is 99.95%
Time taken for inference 0.06s


In [18]:
# Refit the pipeline on the complete training partition.
feature_columns = ['Item_Description', 'Vendor_Code', 'GL_Code']
xgb.fit(train[feature_columns], train['Product_Category_numeric'])

In [None]:
# Persist the fitted pipeline and the label encoder with joblib.
# NOTE(review): the pipeline contains the notebook-defined text_preprocessor class, so the
# process that loads this file presumably needs that class importable — confirm in the consumer.
joblib.dump(xgb, '../models/champion_model.pkl') 
joblib.dump(le, '../models/label_encoder.pkl') 