In [61]:
import pandas as pd
import numpy as np
from pathlib import Path
import re


import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS



# Fetch the NLTK resources this notebook relies on (tokenizer models, POS
# tagger model, WordNet data and its multilingual companion package).
# quiet=True keeps the per-package "[nltk_data] ..." progress log, which
# embeds the local nltk_data path, out of the saved notebook output.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
NLTK_PACKAGES = ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4')
for nltk_package in NLTK_PACKAGES:
    nltk.download(nltk_package, quiet=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tylernardone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tylernardone/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tylernardone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tylernardone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [62]:
# Locate the input data: the project root is taken to be the parent of the
# current working directory, and the data lives in its "data" subfolder.
BASE_DIR = Path(".").resolve().parent
DATA_DIR = BASE_DIR.joinpath("data")

In [63]:
# Load the reviews CSV from the "intermediate" folder under DATA_DIR.
REVIEWS_CSV = DATA_DIR / "intermediate" / "25072024_reviews_openrefine.csv"
df = pd.read_csv(REVIEWS_CSV)

In [110]:
# Module-level WordNet lemmatizer instance, used by
# preprocess_text_with_lemmatization below.
lemmatizer = WordNetLemmatizer()


def preprocess_text_with_lemmatization(text):
    """Normalize one review into a space-joined string of lemmas.

    Steps: replace every non-word character with a space, lowercase and
    tokenize, POS-tag, drop verbs (tags starting with 'V'), adverbs (tags
    starting with 'R') and English stop words, then lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. ``None`` or a float
        ``NaN`` from a missing cell) is treated as an empty review.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces; ``''`` when ``text``
        is not a string.
    """
    # Guard first: re.sub raises TypeError on non-string input, which would
    # abort a whole Series.apply over a column with missing values.
    if not isinstance(text, str):
        return ''

    # \W matches anything that is not a word character, so letters, digits
    # and underscores are kept; everything else becomes a space.
    text = re.sub(r'\W', ' ', text)
    # Tokenize the lowercased text, then POS-tag the tokens.
    tokens = word_tokenize(text.lower())
    pos_tags = pos_tag(tokens)

    # Keep a token only if it is neither a verb/adverb nor a stop word.
    pos_tags_filtered = [
        (word, pos)
        for word, pos in pos_tags
        if not pos.startswith(('V', 'R')) and word not in ENGLISH_STOP_WORDS
    ]

    # Lemmatize with the WordNet POS mapped from the tag; fall back to noun
    # ('n') when the tag has no WordNet equivalent.
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_tag(pos) or 'n')
        for word, pos in pos_tags_filtered
    ]
    return ' '.join(lemmatized_words)


def get_wordnet_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    The tag's leading letter decides the result: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Returns None for any other tag.
    """
    # (tag prefix, wordnet attribute name), checked in this order; the
    # attribute is only looked up once a prefix actually matches.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

In [111]:
# Run the lemmatizing preprocessor over every blind assessment and store the
# result alongside the original text.
raw_assessments = df['blind_assessment']
df['cleaned_assessment'] = raw_assessments.apply(preprocess_text_with_lemmatization)

In [112]:
# Side-by-side preview of the cleaned text against the original text.
preview_columns = ['cleaned_assessment', 'blind_assessment']
df[preview_columns].head(10)

Unnamed: 0,cleaned_assessment,blind_assessment
0,rich intricate layered lemon zest cacao nib vi...,"Rich, intricate and layered. Lemon zest, roast..."
1,sweet tart crisply herbaceous chocolate green ...,"Gently sweet-tart, crisply herbaceous. Baking ..."
2,floral tropical leaning magnolia green banana ...,"Floral-toned, tropical-leaning. Magnolia, guav..."
3,floral complex flower lilac cacoa nib tangerin...,"Crisply floral, delicately lively. Complex flo..."
4,espresso complex dark chocolate molasses narci...,"Evaluated as espresso. Intrigungly complex, ba..."
5,tart crisply fruity dark chocolate magnolia ar...,"Tart-leaning, crisply fruity. Pomegranate, dar..."
6,gentle sweet coffee balance understated depth ...,"A gentle, sweet coffee whose balance and under..."
7,sweet savory chocolaty spicy chocolate plum ro...,"Sweet/savory, chocolaty. Spicy chocolate, plum..."
8,delicate bright juicy precious ripe lemon oran...,"Delicate, bright, juicy, precious. Ripe lemon,..."
9,lush overripe harrar fruit note salty bitter t...,"Lush, sweetly overripe Harrar fruit notes are ..."


In [115]:
# Fit TF-IDF on the cleaned assessments, capping the vocabulary size.
MAX_FEATURES = 200
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
tfidf_matrix = vectorizer.fit_transform(df['cleaned_assessment'])
# Vocabulary terms, in the same order as the columns of tfidf_matrix.
keywords = vectorizer.get_feature_names_out()

In [116]:
# Show the vocabulary terms the vectorizer kept.
print(keywords)

['acidity' 'acidy' 'almond' 'apple' 'apricot' 'aroma' 'aromatic'
 'aromatics' 'astringency' 'astringent' 'baker' 'balance' 'balanced'
 'banana' 'bergamot' 'berry' 'big' 'bit' 'bitter' 'bittersweet' 'black'
 'blackberry' 'blossom' 'blueberry' 'body' 'brandy' 'bright' 'brisk'
 'brittle' 'brown' 'buoyant' 'butter' 'buttery' 'cacao' 'caramel' 'carry'
 'cashew' 'cedar' 'center' 'character' 'cherry' 'chocolate' 'chocolaty'
 'cinnamon' 'citrus' 'citrusy' 'clean' 'cocoa' 'coffee' 'complex'
 'complexity' 'consolidates' 'continued' 'creamy' 'crisp' 'crisply' 'cup'
 'currant' 'cut' 'dark' 'date' 'deep' 'deeply' 'delicate' 'distinct'
 'dried' 'dry' 'earth' 'edge' 'espresso' 'fine' 'finish' 'fir' 'flavor'
 'floral' 'flower' 'freesia' 'fresh' 'fruit' 'fruity' 'fudge' 'gardenia'
 'gentle' 'grape' 'grapefruit' 'green' 'guava' 'hazelnut' 'heavy' 'herb'
 'high' 'hint' 'honey' 'honeysuckle' 'intense' 'jasmine' 'juicy' 'ken'
 'lavender' 'lead' 'lean' 'lemon' 'light' 'like' 'lilac' 'lime' 'long'
 'low' 'lu

In [59]:
# Dense DataFrame view of the TF-IDF matrix, with one column per keyword.
# NOTE(review): this cell's saved execution count (59) is lower than the
# vectorizer cell's (115), so the saved table below may be stale — re-run
# the notebook top to bottom to refresh it.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=keywords,
)


In [60]:
# Display the TF-IDF frame (the saved output below is a truncated view).
tfidf_df

Unnamed: 0,acidity,almond,apple,apricot,aroma,aromatic,bake,baker,balance,balanced,...,toned,turn,undertone,vanilla,velvety,verbena,vibrant,wine,wood,zest
0,0.117075,0.000000,0.0,0.000000,0.095316,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.249149
1,0.072021,0.000000,0.0,0.000000,0.058635,0.0,0.197761,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.468924,0.000000,0.000000,0.0,0.000000
2,0.125894,0.000000,0.0,0.000000,0.102495,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.331839,0.000000,0.000000,0.000000,0.0,0.000000
3,0.088734,0.000000,0.0,0.000000,0.072242,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.000000,0.000000,0.0,0.000000,0.064100,0.0,0.000000,0.0,0.147886,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7558,0.087311,0.000000,0.0,0.000000,0.071083,0.0,0.000000,0.0,0.163996,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.244272,0.000000,0.0,0.000000
7559,0.103188,0.000000,0.0,0.264733,0.084010,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.219596
7560,0.102313,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.192176,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.286245,0.000000,0.0,0.000000
7561,0.086661,0.000000,0.0,0.000000,0.070554,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.259586,0.000000,0.000000,0.000000,0.544986,0.0,0.000000


In [29]:
# Sanity check: lemmatize "running" as a verb (saved output is 'run').
lemmatizer.lemmatize("running", pos="v")

'run'