In [203]:
# Libraries
import pandas as pd
import os
from ast import literal_eval

In [204]:
# Function for unnesting list-valued columns

def unnest_dataset(df:pd.DataFrame, target_col:str, id_col:str = 'WineID') -> pd.DataFrame:
  '''
  Unnests a list-valued column into a separate long-format frame.

  Parameters
  ----------
  df : source frame containing both `target_col` and `id_col`
  target_col : name of the column whose list-like cells are expanded
  id_col : name of the identifier column repeated for every expanded item

  Returns
  -------
  A new frame with the columns [target_col, id_col] and one row per list
  element. The original row index is repeated for every element that came
  from the same source row; `df` itself is not modified.
  '''

  new_data = df[[target_col, id_col]]
  # explode() expects the column label, not the frame itself
  new_data = new_data.explode(target_col)
  return new_data


# Function for creating acronyms

def create_acronym(name:str) -> str:
    """
    Creates a short tag from the given name.

    The name is split on whitespace; for every word the first and the last
    character are concatenated, and the resulting two-character pieces are
    joined with underscores (e.g. 'Chateau Margaux' -> 'Cu_Mx').
    A one-character word contributes that character twice; an empty or
    whitespace-only name yields an empty string.
    """
    words = name.split()
    acronym = '_'.join(word[0] + word[-1] for word in words)
    return acronym

Step 1

In [205]:
# Reading data

# XWines_Slim_1K_wines.csv
# Scalar columns get explicit dtypes; the Grapes / Harmonize / Vintages cells
# are parsed with literal_eval (presumably they hold Python list literals --
# Harmonize is exploded as a list column further down).
wines = pd.read_csv('XWines_Slim_1K_wines.csv',

                      dtype = {'WineID': int, 'WineName': str,
                               'Type': str, 'Elaborate':str,
                               'ABV': float, 'Body': str,
                               'Acidity':str, 'Code': str,
                               'Country': str, 'RegionID': int,
                               'RegionName': str, 'WineryID': int,
                               'WineryName': str, 'Website': str},

                      converters = {'Grapes': literal_eval,
                                    'Harmonize':literal_eval,
                                    'Vintages': literal_eval})

# XWines_Slim_150K_ratings dataset ('Date' is parsed into datetimes)
ratings = pd.read_csv('XWines_Slim_150K_ratings.csv', sep = ',', parse_dates=['Date'])


Step 2

In [206]:
# Edit data type
ratings['WineID'] = ratings['WineID'].astype(int)

# Redundant columns to delete
redund_cols = ['Code', 'Grapes', 'WineryID', 'Website', 'Vintages']

# Create cleaned_wines frame.
# errors='ignore' skips any listed column that is missing, so the drop cannot
# raise KeyError on a partially matching frame and cleaned_wines is always
# defined for the following cells.
cleaned_wines = wines.drop(columns=redund_cols, errors='ignore')

Step 3

In [207]:
# Regions frame: region id and name alongside the wine id
regions_db = cleaned_wines.loc[:, ['RegionID', 'RegionName', 'WineID']]

# Rating frame: rating id, wine id and the rating value
rating_db = ratings.loc[:, ['RatingID', 'WineID', 'Rating']]

# Harmonize frame: every element of a wine's Harmonize list gets its own row
harmonize_db = cleaned_wines.loc[:, ['WineID', 'Harmonize']].explode('Harmonize')

# Sequential HarmonizeID in row order, numbering starts at 1000001
harmonize_db['HarmonizeID'] = list(range(1000001, 1000001 + len(harmonize_db)))

Step 4

In [208]:
# Calc mean rating per wine (columns: WineID, Rating), rounded to 2 decimals
mean_rating = rating_db.groupby(['WineID']).agg({'Rating': 'mean'}) \
                     .reset_index().round(2)

# Merge wine & ratings.
# The lookup goes through WineID: assigning mean_rating['Rating'] directly
# would align on the positional row index of the two frames instead of on the
# wine, attaching ratings to the wrong rows whenever the orders differ.
# Wines without any rating get NaN.
cleaned_wines['Rating'] = cleaned_wines['WineID'].map(
    mean_rating.set_index('WineID')['Rating'])

Step 5

In [209]:
# Create frame - counting names.
# rename_axis + reset_index(name=...) always produces the columns
# 'WineName' and 'count', independent of how value_counts() labels its
# result (the labels changed in pandas 2.0, where renaming 'WineName' to
# 'count' would leave two 'count' columns).
wine_name_freq = cleaned_wines['WineName'].value_counts() \
                 .rename_axis('WineName').reset_index(name='count')

# Filter duplicated names
dupl_names = wine_name_freq[wine_name_freq['count'] > 1]

# Select duplicated names in main data-set
dupl_names_frame = cleaned_wines[cleaned_wines['WineName'].isin(dupl_names['WineName'])]

# Combine every wine name with the acronym of its winery name.
# The acronym is computed on the fly, so no temporary column has to be added
# to and removed from cleaned_wines.
# NOTE: this overwrites WineName from itself -- re-running the cell appends
# the acronym a second time.
cleaned_wines['WineName'] = (cleaned_wines['WineName'] + ' '
                             + cleaned_wines['WineryName'].apply(create_acronym))

Step 6

In [210]:
# Creating frame for wine ratings: cleaned_wines without the region and
# winery columns (a list keeps the dropped labels in a fixed order)
wine_ratings_db = cleaned_wines.drop(columns = ['RegionID', 'RegionName',
                                               'WineryName'])

In [211]:
# Additional for Tableau – write floats with a decimal comma instead of a point
wine_ratings_db['ABV']= wine_ratings_db['ABV'].astype("string")
wine_ratings_db['Rating'] = wine_ratings_db['Rating'].astype('string')

# regex=False: the '.' has to be matched literally. As a regular expression
# '.' matches any character, which would turn every character into a comma.
wine_ratings_db['ABV'] = wine_ratings_db['ABV'].str.replace('.', ',', regex=False)
wine_ratings_db['Rating'] = wine_ratings_db['Rating'].str.replace('.', ',', regex=False)