# Importing the dataframe

In [None]:
import os
os.chdir('/content/drive/My Drive/Capstone')

In [None]:
import pandas as pd
data = pd.read_excel('fiscal_y1.xlsx')
data = data[['Item Name', 'Item Description', 'Commodity Title']] #only interested in these 3 columns

# Pre-processing

We will pre-process the data so that they can be inputs for the baseline models later.

Steps involved here:

    1. Clean the class labels

    2. Removing rows which have empty Item Name and Item Description

    3. Concatenating the Item Name and Item Description Columns so that we do not need to have 2 bags of words later.

    4. Text Cleaning

      - Remove Punctuation
      - Lowercase all the words
      - Tokenize the texts, meaning that each word will become a token
      - Removing stopwords. Stopwords are commonly used words such as 'the', 'she', 'he'.
      - Lemmatizing. Group words to a similar word. For example, 'good', 'better' and 'best' will be changed to 'good'.
      - Stemming. To convert the words to their root form. For example, 'eating' will be changed to 'eat'.
      - Removing special characters, such as '$', '%'
      - Removing numbers
      - Removing line breaks. They will appear as '\r' or '\n' in the text.
      - Removing single character. They are probably just typos.

Step 1: Cleaning of class labels

We noticed that there are some rows with spacing as well as \t and \n in the labels. We need to clean them so that the model will not treat the label with a spacing as a different label.

In [None]:
for index, row in data.iterrows():
  label = str(row['Commodity Title'])
  if '\t' in label:
    data.at[index, 'Commodity Title'] = str(row['Commodity Title']).replace('\t', '')
  if '\n' in label:
    data.at[index, 'Commodity Title'] = str(row['Commodity Title']).replace('\n', '')
  data.at[index, 'Commodity Title'] = str(row['Commodity Title']).strip(' ')

The code below describes steps 2 and 3.

In [None]:
texts = []
classes = []
for index, row in data.iterrows():
  classname = row['Commodity Title']
  if len(str(row['Item Name'])) == 0 and len(str(row['Item Description'])) == 0: #whole row is empty
    continue
  if pd.isna(row['Item Description']): #keeping only 1 of the columns if the other column is empty
    text = str(row['Item Name'])
  elif pd.isna(row['Item Name']):
    text = str(row['Item Description'])
  else:
    text = str(row['Item Name']) + " " + str(row['Item Description']) #concatenate both text columns to form just 1 column
  if not pd.isna(classname) and not pd.isna(text): #only want rows with non-empty classes and non-empty text
    texts.append(text)
    classes.append(classname)

df = pd.DataFrame()
df['Class'] = classes
df['Text'] = texts

df.head()

Step 4: Text Cleaning

We will write a function so that it is easier to automate the text cleaning later.

In [None]:
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')

def text_cleaning(dataframe):
  new_df = pd.DataFrame()
  texts = []
  classes = []
  tokenizer = RegexpTokenizer(r'\w+')
  stopwords_lst = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()
  stemmer = PorterStemmer()
  for index, row in dataframe.iterrows():
    text = row['Text']
    #remove punctuation
    text = ''.join([i for i in text if i not in string.punctuation])
    #lowercase 
    text = ''.join([i.lower() for i in text])
    #tokenize
    text = ' '.join(tokenizer.tokenize(text))
    #remove stopwords
    text = ' '.join([i for i in text.split() if i not in stopwords_lst])
    #lemmatize
    text = ' '.join([lemmatizer.lemmatize(i) for i in text.split()])
    #stemming
    text = ' '.join([stemmer.stem(i) for i in text.split()])
    #remove special characters
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    #remove numbers
    text = ' '.join([i for i in text.split() if i.isalpha()])
    #remove line breaks
    text = text.replace('\n', " ")
    text = text.replace('\r', " ")
    #remove single character
    text = ' '.join([w for w in text.split() if len(w)>1] )
    classes.append(row['Class'])
    texts.append(text)
  new_df['Class'] = classes
  new_df['Text'] = texts
  return new_df

In [None]:
cleaned_df = text_cleaning(df)

cleaned_df.head()

Next, we will split the data into train and test so that we can see how well the models perform out of sample. But unlike the conventional train-test-split, we will be using a stratified train test split instead. We have seen from our EDA that there is a huge imbalance of classes so we want to address this problem.

We will first separate those classes with the number of records below the threshold. They will be in the training set but won't appear in the testing set.These rows will be in the excess_df, while the bulk of the data will be in the new_df, as shown below.

In [None]:
# record the number of records for each class
unique_classes = cleaned_df['Class'].unique()
unc = {}
for i in unique_classes:
  unc[i] = cleaned_df[cleaned_df['Class'] == i].shape[0]
pd.Series(list(unc.values())).describe()

In [None]:
# remove the classes with < 20 rows
texts = []
classes = []
excess_texts = []
excess_classes = []
for index, row in cleaned_df.iterrows():
  curr_class = row['Class']
  if unc[curr_class] >= 20:
    texts.append(row['Text'])
    classes.append(curr_class)
  else:
    excess_texts.append(row['Text'])
    excess_classes.append(curr_class)

new_df = pd.DataFrame()
new_df['Class'] = classes
new_df['Text'] = texts
excess_df = pd.DataFrame()
excess_df['Class'] = excess_classes
excess_df['Text'] = excess_texts
print(new_df.shape)
print(excess_df.shape)

In [None]:
# vectorizing all texts with tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.concat([new_df, excess_df])
tfidf = TfidfVectorizer()
overall_mat = tfidf.fit_transform(df.Text)
overall_y = df.Class

From the overall matrix, we will split it into 2 parts. The excess matrix is for the rows in the excess df, while the main matrix is for the new df.

In [None]:
excess_classes = set(list(excess_df.Class.unique())) #retrieve the list of classes that are in excess_df
main_idxs = []
excess_idxs = []
idx = 0
for i in overall_y.values: #the idea is to assign the indexes to the corresponding group based on whether the value is in the excess_classes or not.
  if i in excess_classes:
    excess_idxs.append(idx)
  else:
    main_idxs.append(idx)
  idx += 1

main_mat = overall_mat[main_idxs]
main_y = overall_y.values[main_idxs]
excess_mat = overall_mat[excess_idxs]
excess_y = overall_y.values[excess_idxs]
print(main_mat.shape)
print(main_y.shape)
print(excess_mat.shape)
print(excess_y.shape)

Here, we did a stratified split only on the new_df.

In [None]:
# perform stratified split only on the new_df 
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 0)
for train_index, test_index in split.split(new_df, new_df['Class']):
  strat_train_set = train_index
  strat_test_set = test_index

Splitting the main matrix only into train and test sets. The excess matrix will be fitted into the models directly.

In [None]:
# convert them into required input shape
corpus_train_mat = main_mat[strat_train_set,]
corpus_test_mat = main_mat[strat_test_set,]
y_train = main_y[strat_train_set]
y_test = main_y[strat_test_set]

print(corpus_train_mat.shape)
print(corpus_test_mat.shape)
print(y_train.shape)
print(y_test.shape)

# Baseline Models

Now, we need to build baseline models to see how accurate the current labels are.

Models:

  1. Naive Bayes

  2. Linear SVM

  3. Logistic Regression

  4. LSTM

The steps for each model are the same for the first 3 models, the last model will require some more work because it is a neural network. We will fit the matrix into the model, then predict on the test set and check the evaluation metrics. We will be focusing on the f1 score and accuracy score.

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
nbclassifier = MultinomialNB()
nbclassifier.fit(excess_mat, excess_y)
nbclassifier.fit(corpus_train_mat, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
#predict class from test data 
predicted = nbclassifier.predict(corpus_test_mat)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy: " + str(accuracy_score(y_test, predicted)))
print("Precision: " + str(precision_score(y_test, predicted, average = 'weighted')))
print("Recall: " + str(recall_score(y_test, predicted, average = 'weighted')))
print("F1 Score: " + str(f1_score(y_test, predicted, average = 'weighted')))

Accuracy: 0.40567294879116694
Precision: 0.3940133815195956


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.40567294879116694
F1 Score: 0.33955390390274726


## Linear SVM

In [None]:
from sklearn.svm import SVC

text_clf = SVC(kernel='linear', probability = True)

In [None]:
#train model
text_clf.fit(excess_mat, excess_y)
text_clf.fit(corpus_train_mat, y_train)

In [None]:
#predict class from test data 
predicted = text_clf.predict(corpus_test_mat)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy: " + str(accuracy_score(y_test, predicted)))
print("Precision: " + str(precision_score(y_test, predicted, average = 'weighted')))
print("Recall: " + str(recall_score(y_test, predicted, average = 'weighted')))
print("F1 Score: " + str(f1_score(y_test, predicted, average = 'weighted')))

Accuracy: 0.6548638873024938
Precision: 0.6686171504585076
Recall: 0.6548638873024938


  _warn_prf(average, modifier, msg_start, len(result))


F1 Score: 0.6377390457188284


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='sag')
clf.fit(excess_mat, excess_y)
clf.fit(corpus_train_mat, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
#predict class from test data 
predicted = clf.predict(corpus_test_mat)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy: " + str(accuracy_score(y_test, predicted)))
print("Precision: " + str(precision_score(y_test, predicted, average = 'weighted')))
print("Recall: " + str(recall_score(y_test, predicted, average = 'weighted')))
print("F1 Score: " + str(f1_score(y_test, predicted, average = 'weighted')))

Accuracy: 0.6170077133932556
Precision: 0.6190028626563597


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.6170077133932556
F1 Score: 0.5903056618496567


## LSTM

This model requires more pre-processing work. We need to set the X sequences to a fixed length so the input matrices will be different from the input matrices used for the earlier models.

### Pre-processing

In [None]:
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential

Prepare the X and Y

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each row.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(cleaned_df['Text'].values)
word_index = tokenizer.word_index #this is important for evaluation of final test dataset

X = tokenizer.texts_to_sequences(cleaned_df['Text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
Y = pd.get_dummies(cleaned_df['Class']).values
print('Shape of label tensor:', Y.shape)

Use the main and excess indexes we have gotten earlier to split X and Y into the main inputs and the excess inputs

In [None]:
X_main = X[main_idxs]
X_excess = X[excess_idxs]
Y_main = Y[main_idxs]
Y_excess = Y[excess_idxs]
print(X_main.shape)
print(X_excess.shape)
print(Y_main.shape)
print(Y_excess.shape)

Like before, only X_main will be split into train and test set

In [None]:
corpus_train_mat = X_main[strat_train_set,] #can still use back the same indices from the earlier stratified train test split
corpus_test_mat = X_main[strat_test_set,]
y_train = Y_main[strat_train_set]
y_test = Y_main[strat_test_set]

### Build LSTM Model

In [None]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



Early stop the training if the validation loss did not improve in 10 epochs. We will also save the current training model if it has a lower validation loss than the previous saved one. 

In [None]:
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto')
cp = ModelCheckpoint('before_labels_lstm_model_threshold_20.hdf5',monitor='val_loss', save_best_only = True)

So for the training, we will first fit the excess data (below threshold) to the model first and let the model train on it for 10 epochs. Then, we will fit it to the bulk of the data with the test set.

In [None]:
#excess data train for 10 epochs

model.fit(X_excess, Y_excess, epochs = 10, batch_size = 32)

The validation accuracy is 61%.

In [None]:
epochs = 50
batch_size = 32

history = model.fit(corpus_train_mat, y_train, epochs=epochs, validation_data=(corpus_test_mat, y_test),
                    batch_size=batch_size,callbacks=[cp, es])

### Get F1 Score

Now, we need to get the F1 score. We can't get the F1 score from model training as the f1 score metric is not supported by keras. Hence, we need to get the model to predict on the test set again and calculate the f1 score from there. First, we need to load the best saved LSTM model.

In [None]:
from keras.models import load_model

# returns a compiled model identical to the previous one
model = load_model('before_labels_lstm_model_threshold_20.hdf5')



Get predictions for test set

In [None]:
predictions = model.predict(corpus_test_mat)

The model will return a list of probabilities for each row where each probability corresponds to the probability of that row belonging to this class. So we will take the class with the highest probability as the predicted class for that row. 

In [None]:
import numpy as np

predictions_ans = []
for i in predictions:
  predictions_ans.append(np.argmax(i))

Since we encoded the Y by dummy variables earlier, each row will only have a 1 and the rest are 0. The 1 for that column will tell us the correct class for this row.

In [None]:
correct_ans = []
for i in y_test:
  correct_ans.append(i.tolist().index(1))

Retrieving the F1 score. It is found to be 58%.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy: " + str(accuracy_score(correct_ans, predictions_ans)))
print("Precision: " + str(precision_score(correct_ans, predictions_ans, average = 'weighted')))
print("Recall: " + str(recall_score(correct_ans, predictions_ans, average = 'weighted')))
print("F1 Score: " + str(f1_score(correct_ans, predictions_ans, average = 'weighted')))

As a summary, here are the accuracies and F1 scores we obtained for our baseline models. 

1. Naive Bayes: accuracy- 40, f1 score- 33

2. Linear SVM: accuracy- 65, f1 score- 64

3. Logistic Regression: accuracy- 61, f1 score- 59

4. LSTM: accuracy- 61, f1 score- 58
