In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv("wine-raitngs.csv")
df.head(3)

Unnamed: 0,name,region,variety,rating,notes
0,1000 Stories Bourbon Barrel Aged Batch Blue Ca...,"Mendocino, California",Red Wine,91,"This is a very special, limited release of 100..."
1,1000 Stories Bourbon Barrel Aged Gold Rush Red...,California,Red Wine,89,The California Gold Rush was a period of coura...
2,1000 Stories Bourbon Barrel Aged Gold Rush Red...,California,Red Wine,90,The California Gold Rush was a period of coura...


In [3]:
df.columns

Index(['name', 'region', 'variety', 'rating', 'notes'], dtype='object')

In [4]:
df.shape

(32980, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32980 entries, 0 to 32979
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     32980 non-null  object
 1   region   32977 non-null  object
 2   variety  32621 non-null  object
 3   rating   32980 non-null  int64 
 4   notes    32980 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


###  **Section A: Data Cleaning**

**1. Identify and remove any duplicate rows and missing values from the global wine quality dataset. <br>Store the cleaned dataset in a new dataframe named non_dm_data.**

In [6]:
# drop duplicates
df.drop_duplicates(inplace= True)
# drop missing values
df.dropna(inplace = True)
# assign df to a new dataframe
non_dm_data = df
non_dm_data.shape

(32402, 5)

**2.	The "Region" column contains both country and city names. <br> Since you need to analyze wine quality by country, ensure that only country names are listed. <br>Replace any cities without a corresponding country name with the relevant country.**

In [7]:
# pick the last string as this is the country name
non_dm_data["region"] = non_dm_data["region"].str.split().str[-1]
non_dm_data.region.unique()

array(['California', 'Spain', 'Washington', 'Chile', 'Australia',
       'Zealand', 'Germany', 'Italy', 'France', 'Portugal', 'Oregon',
       'Argentina', 'Greece', 'Africa', 'Austria', 'U.S.', 'China',
       'Israel', 'Uruguay', 'England', 'Mexico', 'Lebanon', 'Hungary',
       'Switzerland', 'Canada', 'Slovenia', 'Turkey'], dtype=object)

In [8]:
# some of the rows have just cities, replace this with corresponding country
non_dm_data.replace({"California": "U.S.", 
                                "Washington": "U.S.", 
                                "Oregon": "U.S.",
                                "Africa": "South Africa",
                                "Zealand": "New Zealand"}, inplace = True)

In [9]:
non_dm_data.region.sort_values().unique()

array(['Argentina', 'Australia', 'Austria', 'Canada', 'Chile', 'China',
       'England', 'France', 'Germany', 'Greece', 'Hungary', 'Israel',
       'Italy', 'Lebanon', 'Mexico', 'New Zealand', 'Portugal',
       'Slovenia', 'South Africa', 'Spain', 'Switzerland', 'Turkey',
       'U.S.', 'Uruguay'], dtype=object)

**3. Create a table showing the percentage distribution of wine varieties, rounded to one decimal place.**

In [10]:
(non_dm_data["variety"].value_counts(normalize= True) * 100).round(1)

variety
Red Wine                 72.0
White Wine               23.0
Sparkling & Champagne     2.6
Pink and Rosé             1.5
Collectible               0.7
Green Wine                0.1
Boutique                  0.1
Screw Cap                 0.0
Name: proportion, dtype: float64

**4. Based on wine ratings, classify the wines into the following categories:**
- 'Good' for ratings between 85-89
- 'Outstanding' for ratings between 90-94
- 'Exceptional' for ratings between 95-100 <br>
**Create a new column named "wine_quality" to store these categories.**


In [11]:
conditions = [non_dm_data.rating >= 95, (non_dm_data.rating >= 90) & (non_dm_data.rating <= 94)]

choices = ["Exceptional", "Outstanding"]

non_dm_data["wine_quality"] = np.select(conditions, choices, default = "Good")

### **Section B: Text Preprocessing**

**Scenario**: You are a data scientist working with text data containing noise, making it difficult to analyze. <br>The "notes" column in your dataset has inconsistencies such as mixed cases, punctuation, numbers, and irrelevant words. <br>Using your NLP expertise, you need to clean and preprocess the text. <br>
**Tasks:**

**1.	Convert all text to lowercase for uniformity.**

In [12]:
non_dm_data["notes"] = non_dm_data["notes"].str.lower()

**2.	Remove numbers that do not contribute to meaningful analysis.**

In [13]:
non_dm_data["notes"] = non_dm_data["notes"].str.replace(r'\d+', " ", regex = True)
non_dm_data["notes"]

0        this is a very special, limited release of   s...
1        the california gold rush was a period of coura...
2        the california gold rush was a period of coura...
3        the wine has a deep, rich purple color. an int...
4        batch #  is the first release of the   vintage...
                               ...                        
32775    now   years old, alec's younger brother ethan ...
32776    born in  , everything about ethan and his new ...
32777    positioned between brothers, alec and mason, e...
32778             blend:  % cabernet sauvignon,  % merlot 
32779              blend:  % cabernet sauvignon,  % merlot
Name: notes, Length: 32402, dtype: object

**3.	Eliminate punctuation marks to simplify the text.**

In [14]:
non_dm_data["notes"] = non_dm_data["notes"].str.translate(str.maketrans("", "", string.punctuation))
non_dm_data["notes"]

0        this is a very special limited release of   st...
1        the california gold rush was a period of coura...
2        the california gold rush was a period of coura...
3        the wine has a deep rich purple color an inten...
4        batch   is the first release of the   vintage ...
                               ...                        
32775    now   years old alecs younger brother ethan sh...
32776    born in   everything about ethan and his new  ...
32777    positioned between brothers alec and mason eth...
32778                 blend   cabernet sauvignon   merlot 
32779                  blend   cabernet sauvignon   merlot
Name: notes, Length: 32402, dtype: object

**4.	Lemmatize the text using POS tagging to reduce words to their root forms while preserving context.**

In [15]:
# function to convert POS tag to WordNet POS tag
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN     #default to noun if no match

In [16]:
# function to lemmatize text
def lemmatize_text(text):
    # tokenize the text
    tokens = nltk.word_tokenize(text)  
    # Get POS tags for the tokens
    pos_tags = nltk.pos_tag(tokens)
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Lemmatize each word with its POS tag
    lemmatized_words = []

    for word, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)
        lemmatized_word = lemmatizer.lemmatize(word, pos = wordnet_pos)
        lemmatized_words.append(lemmatized_word)
    # return the lemmatized words joined as a string
    return " ".join(lemmatized_words)

In [17]:
# apply lemmatization to the notes
non_dm_data["notes"] = non_dm_data["notes"].apply(lemmatize_text)
non_dm_data["notes"]

0        this be a very special limited release of stor...
1        the california gold rush be a period of courag...
2        the california gold rush be a period of courag...
3        the wine have a deep rich purple color an inte...
4        batch be the first release of the vintage and ...
                               ...                        
32775    now year old alecs young brother ethan show su...
32776    bear in everything about ethan and his new syr...
32777    position between brother alec and mason ethan ...
32778                      blend cabernet sauvignon merlot
32779                      blend cabernet sauvignon merlot
Name: notes, Length: 32402, dtype: object

**5.	Remove stopwords (e.g., "the", "and") to focus on relevant terms.** <br>
Create a new column  "description_token"  to store the cleaned text as tokens.

In [18]:
# define the stop words set
stop_words = set(stopwords.words("english"))

In [19]:
# function to remove stopwords
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])

In [20]:
# apply function to notes column
non_dm_data["description_token"] = non_dm_data["notes"].apply(remove_stopwords)
non_dm_data["description_token"].head()

0    special limited release story bourbon barrelag...
1    california gold rush period courage bravado cu...
2    california gold rush period courage bravado cu...
3    wine deep rich purple color intense raspberry ...
4    batch first release vintage againmendocino zin...
Name: description_token, dtype: object

### **Section C: Analysis**
Use the cleaned data from section A and B to answer the following questions.

**1.	Identify the top 5 words most strongly associated with wine quality (good, outstanding and exceptional). <br>For each wine quality group, display top 5 words.**

In [21]:
non_dm_data.columns

Index(['name', 'region', 'variety', 'rating', 'notes', 'wine_quality',
       'description_token'],
      dtype='object')

In [22]:
# group data by wine quality and join all description token in each category
grouped_description = non_dm_data.groupby("wine_quality")["description_token"].apply(lambda x: " ".join(x)).reset_index()

# initialize TfidfVectorizer
tfidf = TfidfVectorizer()

# apply TF-IDF to the decription_token column
tfidf_matrix = tfidf.fit_transform(grouped_description["description_token"])

# convert TF-IDF matrix to a dataframe
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out(), index = grouped_description["wine_quality"])

# get the top 5 moost associated words for each quality
top_n = 5
top_words = {}

for wine_quality in tfidf_df.index:
    # get the top N words with the highest TF-IDF score for each quality
    sorted_words = tfidf_df.loc[wine_quality].sort_values(ascending = False).head(top_n)
    top_words[wine_quality] = sorted_words.index.tolist()

top_words

{'Exceptional': ['wine', 'fruit', 'tannin', 'black', 'palate'],
 'Good': ['wine', 'fruit', 'flavor', 'aroma', 'finish'],
 'Outstanding': ['wine', 'fruit', 'aroma', 'finish', 'palate']}

**2.	Display the top 5 regions producing the highest number of Exceptional quality wines.**

In [23]:
exceptional = non_dm_data[non_dm_data["wine_quality"] == "Exceptional"]
exceptional.groupby("region")["region"].count().nlargest(5)

region
France       994
U.S.         882
Italy        269
Australia    124
Spain        108
Name: region, dtype: int64

**3.	Identify and display the regions with the highest diversity of wine varieties.**

In [24]:
non_dm_data.groupby("region")["variety"].count().nlargest(5)

region
U.S.         13394
France        7864
Italy         4067
Spain         1778
Australia     1652
Name: variety, dtype: int64