In [1]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import os
import re
from queue import PriorityQueue
from collections import defaultdict
import sys
import json
import pickle
from tqdm import tqdm

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

from service.lyrics_classifier import Classifier

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timicienio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
class DualPriorityQueue(PriorityQueue):
    def __init__(self, maxPQ=False):
        PriorityQueue.__init__(self)
        self.reverse = -1 if maxPQ else 1

    def put(self, priority, data):
        PriorityQueue.put(self, (self.reverse * priority, data))

    def get(self, *args, **kwargs):
        priority, data = PriorityQueue.get(self, *args, **kwargs)
        return self.reverse * priority, data

In [3]:
features = ['family', 'subfamily', 'ingredients', 'gender']

## Combine Data

In [4]:
feature = features[2]

directory = f'../data/{feature}'


# List to store all the DataFrames
dataframes = []

# Iterate over all files in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)

    # Check if the file is a CSV file
    if filename.endswith(".csv"):
        # Read the CSV file and append it to the list of DataFrames
        df = pd.read_csv(file_path)
        dataframes.append(df)

        feature_value = os.path.splitext(filename)[0]
        df[feature] = feature_value

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)


# Print the first few rows to verify
combined_df

Unnamed: 0,name,description,ingredients
0,Wallstreet,"A modern aroma, ideal for urban men and that h...",Lemon
1,L'Eau D'Issey Shade Of Sunrise 2019,"Day 1, 5:45 am. "" evokes a joyous, exciting an...",Lemon
2,Opus 1144,"Inspired by Gothicism (circa 1144), Opus 1144 ...",Lemon
3,Blu Di Roma Uomo,"Blu di Roma, a romantic, intense, and fresh fr...",Lemon
4,Rem L'Acqua,Celebrate 20 years of Rem with Rem L'Acqua! Im...,Lemon
...,...,...,...
132842,Al Abiq Oud,The addition of precious woods in the base ele...,Oud - Agarwood
132843,Inara Oud,Inara Oud was inspired by the woman whose radi...,Oud - Agarwood
132844,Ganga,"Inspired by the majestic rivers of India, Gang...",Oud - Agarwood
132845,Rosamunda,Laboratorio Olfattivo has taken up the challen...,Oud - Agarwood


## Drop Empty Rows

In [5]:
combined_df = combined_df.dropna()

## Split Data

In [6]:
train_origin, test_origin = train_test_split(
    combined_df, test_size=0.01, stratify=combined_df[feature]
)

In [7]:
train = train_origin.reset_index(drop=True)
test = test_origin.reset_index(drop=True)
train["index"] = train.index
test["index"] = test.index

## Oversampling

In [8]:
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=27)
train = oversampler.fit_resample(train, train[feature])[0]
train

Unnamed: 0,name,description,ingredients,index
0,I Am Extrait,"Her sensual, tender sillage charms and captiva...",Amber,0
1,Voile De Jasmin,Bvlgari Voile de Jasmin is a delicate ovocatio...,Musk,1
2,Overdose Aphrodisiaque,"Thirsty for independence, the anti-conformist ...",Incense - Olibanum,2
3,Palais Bourbon,"A fragrance of contrast and cooperation, Palai...",Benzoin,3
4,Black Onyx,Black Onyx is a refreshing fragrance shrouded ...,Oud - Agarwood,4
...,...,...,...,...
363565,Skye,A distinctive bouquet made from a stunning ble...,Ylang-ylang,58118
363566,Sincere,This scent is an enrobing expression of profou...,Ylang-ylang,95194
363567,Vol 870 Yul-Cdg,This perfume expresses the beginning of a beau...,Ylang-ylang,39032
363568,Eau De Lune,Laura Mercier's Eau de Lune offers a romantic ...,Ylang-ylang,21106


## Tokenize

In [9]:
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("english")

def text_split(text):
    text = re.split("[^a-zA-Z]+", text)
    text = [x for x in text if x]
    return text


def tokenize(text):
    # turn into lower case
    text = text.lower()
    # tokenize
    words = text_split(text)

    # words =  [''.join(filter(str.isalnum, word)) for word in words]
    words = [word for word in words if word != ""]

    # PorterStemmer algorithm
    words = [nltk.stem.PorterStemmer().stem(word) for word in words]

    # remove stopwords
    words = [word for word in words if not word in stopwords]
    return words

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timicienio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
train["tokenized_description"] = train["description"].apply(lambda x: tokenize(x))

In [11]:
pickle.dump(train, open(f"../dump/train_{feature}.pkl", "wb"))
pickle.dump(
    test,
    open(f"../dump/test_{feature}.pkl", "wb"),
)

In [12]:
train = pickle.load(
    open(f"../dump/train_{feature}.pkl", "rb")
)
test = pickle.load(
    open(f"../dump/test_{feature}.pkl", "rb")
)

## Get Frequencies

In [13]:
def get_vocabulary(tokens):
    return set([t for d in tokens for t in d])


def get_term_frequency(tokens):
    termFrequency = {}
    for t in tokens:
        termFrequency[t] = termFrequency.get(t, 0) + 1
    return termFrequency


def get_document_frequencies(termFrequencies):
    documentFrequencies = {}
    for documentTerms in termFrequencies:
        for term in documentTerms:
            documentFrequencies[term] = documentFrequencies.get(term, 0) + 1
    return documentFrequencies


def get_document_term_presence(documentTokens):
    return [set(tokens) for tokens in documentTokens]

In [14]:
vocabulary = get_vocabulary(train["tokenized_description"])
document_term_frequencies = list(
    map(get_term_frequency, train["tokenized_description"])
)
document_frequencies = get_document_frequencies(document_term_frequencies)
document_presence = get_document_term_presence(train["tokenized_description"])
collection_dictionary = [(t, f) for t, f in sorted(document_frequencies.items())]
len(vocabulary)

17641

## Classes

In [15]:
classes = list(set(train[feature].values))
class_frequencies = train[feature].value_counts()

num_documents = len(train[feature])
classes

['Bergamot',
 'Violet',
 'Geranium',
 'Pink Pepper',
 'Benzoin',
 'Grapefruit',
 'Mandarin',
 'Tonka Bean',
 'Lily Of The Valley',
 'Musk',
 'Vetiver',
 'Leather',
 'Amber',
 'Neroli',
 'Incense - Olibanum',
 'Sandalwood',
 'Patchouli',
 'Oud - Agarwood',
 'Lemon',
 'Ciste Labdanum',
 'Orange Blossom',
 'Cedarwood',
 'Jasmine',
 'Vanilla',
 'Rose',
 'Ylang-ylang',
 'Lavender',
 'Cardamom',
 'Woody Notes',
 'Iris - Orris']

## Feature Selection

In [16]:
# # Calculate
# def expected_freq(term):
#     document_frequency = len(
#         [
#             documentId
#             for documentId in train['index']
#             if term in document_presence[documentId]
#         ]
#     )
#     return [
#         document_frequency * f / num_documents for f in class_frequencies
#     ], document_frequency


# def observed_freq(term):
#     return [
#         len(train.loc[(train[feature] == class_name) & (term in train["tokenized_description"])])
#         for class_name in classes
#     ]


# def chi2(term):
#     es, document_frequency = expected_freq(term)
#     ns = observed_freq(term)
#     if document_frequency == 0:
#         return 0
#     value = sum([((n - e) ** 2) / e for (n, e) in zip(ns, es)])
#     return value


# chi2s = [(chi2(term), term) for term in tqdm(vocabulary)]

# chi2s_pq = DualPriorityQueue(maxPQ=True)
# [chi2s_pq.put(chi2, term) for (chi2, term) in chi2s]

# chi2_sorted_terms = []
# while not chi2s_pq.empty():
#     chi2_sorted_terms.append(chi2s_pq.get())

# json.dump(chi2_sorted_terms, open(f"../dump/chi2_sorted_terms_{feature}.json", "w+"))

In [17]:
# Load from dump
chi2_sorted_terms = json.load(open(f"../dump/chi2_sorted_terms_{features[0]}.json", "r"))

## Train

In [18]:
def calculate_class_p(class_name, selected_terms):
    class_documents = train.loc[(train[feature] == class_name)]
    prior = len(class_documents) / num_documents
    class_document_tokens = []

    for document in class_documents["tokenized_description"]:
        tokens = document
        for token in tokens:
            if token in selected_terms:
                class_document_tokens.append(token)
    class_term_frequencies = {}
    for token in class_document_tokens:
        class_term_frequencies[token] = class_term_frequencies.get(token, 0) + 1
    token_count_sum = len(class_document_tokens) + len(selected_terms)
    cond_prob = {
        term: ((class_term_frequencies.get(term, 0) + 1) / token_count_sum) # smoothing
        for term in selected_terms
    }
    return {"prior": prior, "cond_prob": cond_prob}

def calculate_classes_p(selected_terms):
  class_p = {
      class_name: calculate_class_p(class_name, selected_terms)
      for class_name in classes
  }

  return class_p

In [19]:
top_k_options = [100, 200, 400, 1000, 2000, 4000, 10000]
for i in top_k_options:
    selected_terms = set([term for _, term in chi2_sorted_terms[:i]])
    classes_p = calculate_classes_p(selected_terms)
    json.dump({"vocabulary": list(selected_terms), "p": classes_p}, open(f"../dump/class_p_{feature}_top_{i}.json", "w+"))

## Evaluation

In [20]:
from collections import Counter


def evaluate_classifier(test_documents, feature, multi_label=False, top_K=None):
    y_true = []
    y_pred = []

    classifier = Classifier(feature, multi_label, top_K)

    # Iterate through the test documents
    for document, true_label in test_documents:
        # Get the predicted class (or classes) for the document
        predicted_label = classifier.classification(document)

        # If multi_label is True and classifier returns multiple labels
        if multi_label:
            # If predicted_label is a list or set, handle as multi-label classification
            if isinstance(predicted_label, (list, set)):
                y_pred.append(predicted_label)  # Multiple predicted labels
                y_true.append([true_label])  # Single true label wrapped in a list
            else:
                y_pred.append([predicted_label])  # Single predicted label
                y_true.append([true_label])  # Single true label
        else:
            # For single class prediction
            y_pred.append([predicted_label])
            y_true.append([true_label])

    # Calculate evaluation metrics
    precision, recall, f1 = calculate_precision_recall_f1(y_true, y_pred)

    return precision, recall, f1



# Helper function to calculate precision, recall, and F1 for each class (weighted average)
def calculate_precision_recall_f1(y_true, y_pred):
    # Initialize counters for true positives, false positives, and false negatives
    true_positives = Counter()
    false_positives = Counter()
    false_negatives = Counter()
    label_counts = Counter()

    # Collect counts for precision and recall calculation
    for true_labels, pred_labels in zip(y_true, y_pred):
        for label in true_labels:
            label_counts[label] += 1
        for label in pred_labels:
            if label in true_labels:
                true_positives[label] += 1
            else:
                false_positives[label] += 1
        for label in true_labels:
            if label not in pred_labels:
                false_negatives[label] += 1

    # Calculate precision, recall, and F1 for each label
    precision = {}
    recall = {}
    f1 = {}

    for label in label_counts:
        tp = true_positives[label]
        fp = false_positives[label]
        fn = false_negatives[label]
        total = label_counts[label]

        # Calculate precision, recall, and F1 for this label
        precision[label] = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall[label] = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1[label] = (
            2 * (precision[label] * recall[label]) / (precision[label] + recall[label])
            if (precision[label] + recall[label]) > 0
            else 0
        )

    # Compute weighted averages based on the frequency of labels in y_true
    total_labels = sum(label_counts.values())

    weighted_precision = sum(
        precision[label] * label_counts[label] / total_labels for label in precision
    )
    weighted_recall = sum(
        recall[label] * label_counts[label] / total_labels for label in recall
    )
    weighted_f1 = sum(f1[label] * label_counts[label] / total_labels for label in f1)

    return weighted_precision, weighted_recall, weighted_f1

In [22]:
feature_is_multilabel = {
    'family': False,
    'subfamily': False,
    'ingredients': True,
    'gender': False
}

for feature in features:
  print(f"======= Results for {feature} =======")
  train = pickle.load(
    open(f"../dump/train_{feature}.pkl", "rb")
  )
  test = pickle.load(
      open(f"../dump/test_{feature}.pkl", "rb")
  )
  for k in top_k_options:
      print(f'=== Results for top {k} features === ')
      precision, recall, f1 = evaluate_classifier(
          zip(test["description"], test[feature]),
          feature,
          multi_label=feature_is_multilabel[feature],
          top_K=k,
      )
      print(f"Precision: {precision:.5}")
      print(f"Recall: {recall:.5}")
      print(f"F1-Score: {f1:.5}")

=== Results for top 100 features === 
Precision: 0.43379
Recall: 0.26236
F1-Score: 0.28845
=== Results for top 200 features === 
Precision: 0.43266
Recall: 0.28517
F1-Score: 0.30094
=== Results for top 400 features === 
Precision: 0.4539
Recall: 0.32319
F1-Score: 0.33502
=== Results for top 1000 features === 
Precision: 0.45776
Recall: 0.3384
F1-Score: 0.3528
=== Results for top 2000 features === 
Precision: 0.44994
Recall: 0.3384
F1-Score: 0.3553
=== Results for top 4000 features === 
Precision: 0.46361
Recall: 0.39544
F1-Score: 0.41061
=== Results for top 10000 features === 
Precision: 0.46661
Recall: 0.42205
F1-Score: 0.43086
=== Results for top 100 features === 
Precision: 0.42782
Recall: 0.14068
F1-Score: 0.16112
=== Results for top 200 features === 
Precision: 0.48716
Recall: 0.1597
F1-Score: 0.18405
=== Results for top 400 features === 
Precision: 0.38817
Recall: 0.1711
F1-Score: 0.18384
=== Results for top 1000 features === 
Precision: 0.37424
Recall: 0.21673
F1-Score: 0.23562
