In [2]:
import re
from pathlib import Path

import pandas as pd
import numpy as np

import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Download the NLTK data packages used by the tokenizing, POS-tagging and
# lemmatizing steps further down. The value of the last call is what the cell
# displays.
# NOTE(review): the package-to-function mapping below is presumed from the
# package names — confirm against the installed NLTK version.
nltk.download("punkt")  # presumably the tokenizer data behind word_tokenize
nltk.download("averaged_perceptron_tagger")  # presumably the model behind pos_tag
nltk.download("wordnet")  # WordNet corpus, presumably read by WordNetLemmatizer
nltk.download("omw-1.4")  # presumably a companion resource of the wordnet corpus

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tylernardone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tylernardone/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tylernardone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tylernardone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Path configuration: BASE_DIR is the parent of the resolved current working
# directory, and DATA_DIR is its "data" sub-folder. Nothing is read here.
BASE_DIR = Path(".").resolve().parent
DATA_DIR = BASE_DIR.joinpath("data")

In [7]:
# Load the reviews CSV stored under DATA_DIR/intermediate into a DataFrame.
df = pd.read_csv(
    DATA_DIR.joinpath("intermediate", "25072024_reviews_openrefine.csv")
)

In [8]:
# Module-level WordNetLemmatizer instance; preprocess_text_with_lemmatization
# reads this global on every call instead of building its own.
lemmatizer = WordNetLemmatizer()


def preprocess_text_with_lemmatization(text):
    """Turn one review text into a space-separated string of lemmas.

    The text is stripped of non-word characters, lowercased, tokenized and
    POS-tagged. Tokens whose tag starts with "V" (verbs) or "R" (adverbs) and
    tokens found in ENGLISH_STOP_WORDS are discarded; each remaining token is
    lemmatized with the WordNet part of speech returned by get_wordnet_tag,
    falling back to noun ("n") when that helper returns None.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN that
        pandas uses for an empty CSV cell, or None) is treated as an empty
        document.

    Returns
    -------
    str
        The lemmas joined by single spaces; "" when no token survives the
        filtering or when ``text`` is not a string.
    """
    # A missing review arrives as NaN/None; re.sub would raise TypeError on it
    # and abort the whole DataFrame.apply, so map it to an empty document.
    if not isinstance(text, str):
        return ""

    # Replace every non-word character with a space. The regex class counts
    # "_" as a word character, so underscores are kept.
    text = re.sub(r"\W", " ", text)
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())
    # POS tagging (performed on the already-lowercased tokens)
    pos_tags = pos_tag(tokens)

    # Remove adverbs, verbs, and stopwords from pos_tags
    pos_tags_filtered = [
        (word, pos)
        for word, pos in pos_tags
        if not (pos.startswith("V") or pos.startswith("R"))
        and word not in ENGLISH_STOP_WORDS
    ]

    # Lemmatize words based on POS tags
    lemmatized_words = [
        lemmatizer.lemmatize(
            word, get_wordnet_tag(pos) or "n"
        )  # Default to noun if no tag
        for word, pos in pos_tags_filtered
    ]
    return " ".join(lemmatized_words)


def get_wordnet_tag(nltk_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the first character of ``nltk_tag`` decides the result: "J" gives
    wordnet.ADJ, "V" gives wordnet.VERB, "N" gives wordnet.NOUN and "R" gives
    wordnet.ADV. Any other tag (including an empty string) gives None, which
    leaves the choice of a fallback to the caller.
    """
    initial = nltk_tag[:1]
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "N":
        return wordnet.NOUN
    if initial == "R":
        return wordnet.ADV
    return None

In [9]:
# Run the lemmatizing pre-processor over every blind assessment and keep the
# result next to the original text in a new "cleaned_assessment" column.
df["cleaned_assessment"] = df["blind_assessment"].map(
    preprocess_text_with_lemmatization
)

In [10]:
# Side-by-side look at the first ten cleaned texts and their source reviews.
df.loc[:, ["cleaned_assessment", "blind_assessment"]].head(10)

Unnamed: 0,cleaned_assessment,blind_assessment
0,rich intricate layered lemon zest cacao nib vi...,"Rich, intricate and layered. Lemon zest, roast..."
1,sweet tart crisply herbaceous chocolate green ...,"Gently sweet-tart, crisply herbaceous. Baking ..."
2,floral tropical leaning magnolia green banana ...,"Floral-toned, tropical-leaning. Magnolia, guav..."
3,floral complex flower lilac cacoa nib tangerin...,"Crisply floral, delicately lively. Complex flo..."
4,espresso complex dark chocolate molasses narci...,"Evaluated as espresso. Intrigungly complex, ba..."
5,tart crisply fruity dark chocolate magnolia ar...,"Tart-leaning, crisply fruity. Pomegranate, dar..."
6,gentle sweet coffee balance understated depth ...,"A gentle, sweet coffee whose balance and under..."
7,sweet savory chocolaty spicy chocolate plum ro...,"Sweet/savory, chocolaty. Spicy chocolate, plum..."
8,delicate bright juicy precious ripe lemon oran...,"Delicate, bright, juicy, precious. Ripe lemon,..."
9,lush overripe harrar fruit note salty bitter t...,"Lush, sweetly overripe Harrar fruit notes are ..."


In [11]:
# TF-IDF model over the cleaned reviews, with the vocabulary capped at 200
# terms (max_features=200).
vectorizer = TfidfVectorizer(max_features=200)
# Learn the vocabulary and weight every cleaned review in one pass; the result
# is converted with .toarray() further down, i.e. it is not a dense array yet.
tfidf_matrix = vectorizer.fit_transform(df["cleaned_assessment"])
# Terms kept by the vectorizer; used below as the column labels of the matrix.
keywords = vectorizer.get_feature_names_out()

In [12]:
# List the vocabulary terms the vectorizer kept.
print(keywords)

['acidity' 'acidy' 'almond' 'apple' 'apricot' 'aroma' 'aromatic'
 'aromatics' 'astringency' 'astringent' 'baker' 'balance' 'balanced'
 'banana' 'bergamot' 'berry' 'big' 'bit' 'bitter' 'bittersweet' 'black'
 'blackberry' 'blossom' 'blueberry' 'body' 'brandy' 'bright' 'brisk'
 'brittle' 'brown' 'buoyant' 'butter' 'buttery' 'cacao' 'caramel' 'carry'
 'cashew' 'cedar' 'center' 'character' 'cherry' 'chocolate' 'chocolaty'
 'cinnamon' 'citrus' 'citrusy' 'clean' 'cocoa' 'coffee' 'complex'
 'complexity' 'consolidates' 'continued' 'creamy' 'crisp' 'crisply' 'cup'
 'currant' 'cut' 'dark' 'date' 'deep' 'deeply' 'delicate' 'distinct'
 'dried' 'dry' 'earth' 'edge' 'espresso' 'fine' 'finish' 'fir' 'flavor'
 'floral' 'flower' 'freesia' 'fresh' 'fruit' 'fruity' 'fudge' 'gardenia'
 'gentle' 'grape' 'grapefruit' 'green' 'guava' 'hazelnut' 'heavy' 'herb'
 'high' 'hint' 'honey' 'honeysuckle' 'intense' 'jasmine' 'juicy' 'ken'
 'lavender' 'lead' 'lean' 'lemon' 'light' 'like' 'lilac' 'lime' 'long'
 'low' 'lu

In [13]:
# Densify the TF-IDF matrix into a DataFrame with one column per keyword.
# The frame reuses df's index: the column-wise pd.concat with df further down
# pairs rows by index label, so a df whose index is not the default 0..n-1
# range would otherwise be misaligned silently (NaN-padded rows).
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=keywords, index=df.index)

In [14]:
# Display the TF-IDF feature frame (one row per review, one column per keyword).
tfidf_df

Unnamed: 0,acidity,acidy,almond,apple,apricot,aroma,aromatic,aromatics,astringency,astringent,...,velvety,verbena,vibrant,violet,viscous,walnut,wine,wisteria,wood,zest
0,0.125244,0.0,0.00000,0.0,0.000000,0.103540,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.454904,0.0,0.0,0.000000,0.0,0.0,0.267165
1,0.072179,0.0,0.00000,0.0,0.000000,0.059672,0.0,0.0,0.0,0.0,...,0.000000,0.470808,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
2,0.099742,0.0,0.00000,0.0,0.000000,0.082458,0.0,0.0,0.0,0.0,...,0.264907,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
3,0.094329,0.0,0.00000,0.0,0.000000,0.077983,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
4,0.000000,0.0,0.00000,0.0,0.000000,0.069702,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7558,0.074009,0.0,0.00000,0.0,0.000000,0.061184,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.207403,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
7559,0.105952,0.0,0.00000,0.0,0.277968,0.087592,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.226014
7560,0.091892,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.257517,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
7561,0.075798,0.0,0.00000,0.0,0.000000,0.062663,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.484703,0.0,0.0,0.000000


In [16]:
# Attach the TF-IDF keyword columns to the right of the review columns.
# NOTE(review): df is rebound from itself, so re-running this cell without
# re-running the cells above it appends the keyword columns a second time.
df = pd.concat(objs=[df, tfidf_df], axis="columns")

In [17]:
# Display the combined frame: the review columns followed by the TF-IDF columns.
df

Unnamed: 0,rating,roaster,title,blind_assessment,notes,bottom_line,roaster_location,located in the administrative territorial entity2,territorial_entity_2,country,...,velvety,verbena,vibrant,violet,viscous,walnut,wine,wisteria,wood,zest
0,92,Red Rooster Coffee Roaster,Ethiopia Sidama Shoye,"Rich, intricate and layered. Lemon zest, roast...",Produced by family-owned farms that are part o...,"An elegant washed Sidamo cup, both deeply swee...",Floyd,Floyd County,Virginia,United States of America,...,0.000000,0.000000,0.000000,0.454904,0.0,0.0,0.000000,0.0,0.0,0.267165
1,92,El Gran Cafe,Finca Santa Elisa Geisha,"Gently sweet-tart, crisply herbaceous. Baking ...",Produced by Finca Santa Elisa entirely of the ...,"A confident, pretty washed-process Guatemala G...",Antigua Guatemala,Sacatepéquez Department,,Guatemala,...,0.000000,0.470808,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
2,93,Tipico Coffee,Costa Rica Sin Limites Gesha,"Floral-toned, tropical-leaning. Magnolia, guav...",Produced by Jamie Cardenas of Finca Sin Limite...,A delicate honey-processed Costa Rica Gesha th...,Buffalo,Erie County,New York,United States of America,...,0.264907,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
3,91,Roast House,Ride the Edge,"Crisply floral, delicately lively. Complex flo...",The coffees in this blend are certified organi...,,Spokane,Spokane County,Washington,United States of America,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
4,92,Level Ground Trading,Direct Fair Trade Espresso,"Evaluated as espresso. Intrigungly complex, ba...",Coffees in this blend are all fully wet-proces...,A solid espresso blend equally pleasing as a s...,Victoria,Capital Regional District,,Canada,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7558,94,JBC Coffee Roasters,Tano Batak Sumatra,"Rich-toned, deeply and sweetly earthy. Chocola...",This coffee was grown by indigenous Batak peop...,A multi-layered Sumatra cup with berry and tro...,Madison,Dane County,Wisconsin,United States of America,...,0.000000,0.000000,0.207403,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
7559,96,Big Shoulders Coffee,Panama Hacienda La Esmeralda Gesha,"Complex, floral- and citrus-toned. Lilac, coco...",Coffee from trees of the botanical variety Gei...,A classic washed Geisha from the celebrated Ha...,Chicago,Cook County,Illinois,United States of America,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.226014
7560,92,Lexington Coffee Roasters,Papua New Guinea Kimel,"Balanced, engaging depth, quiet complexity. Ra...",Kimel Plantation is owned and operated by the ...,,Lexington,Virginia,United States of America,United States of America,...,0.000000,0.000000,0.257517,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
7561,87,Green Mountain Coffee,Newman’s Own Organics Special Decaf (K-Cup),(As brewed in a Keurig B60 single-serve brewin...,Sales of this coffee help support a variety of...,,Waterbury,Washington County,Vermont,United States of America,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.484703,0.0,0.0,0.000000
