# Lab5 dataset library

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

  from pkg_resources import load_entry_point




## Load dataset from local csv

In [2]:
from datasets import load_dataset

# Relative path: the CSV is expected next to the notebook's working directory.
data_file = "wine-dataset.csv"
# The generic "csv" loader is used; the fields in this file are comma-separated.
wine_dataset = load_dataset(path="csv", data_files=data_file, delimiter=",")
wine_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery'],
        num_rows: 86148
    })
})

### rename_column
Rename the first column, `Unnamed: 0`, to `wine_id`.

In [3]:
# Give the CSV's first column ("Unnamed: 0") a meaningful name.
wine_dataset = wine_dataset.rename_column("Unnamed: 0", "wine_id")
wine_dataset

DatasetDict({
    train: Dataset({
        features: ['wine_id', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery'],
        num_rows: 86148
    })
})

### remove_columns
Remove the unused columns that come after `price` (`province` through `winery`).

In [4]:
# Drop every column that follows "price"; the rest of the notebook does not use them.
wine_dataset = wine_dataset.remove_columns(
    [
        "province",
        "region_1",
        "region_2",
        "taster_name",
        "taster_twitter_handle",
        "title",
        "variety",
        "winery",
    ]
)

### use map function 
Use the `map` function to convert the `country` and `description` columns to lowercase.

In [5]:
def to_lowercase(example):
    """Lowercase the ``description`` and ``country`` fields of one example.

    Written to be passed to ``map``, which hands over one row as a dict.

    Args:
        example: Mapping that contains the keys ``"description"`` and
            ``"country"``.

    Returns:
        The same ``example`` object, updated in place.

    Raises:
        KeyError: If either key is absent from ``example``.
    """
    for field in ("description", "country"):
        value = example[field]
        # Empty CSV cells arrive as None; leave them as-is rather than
        # failing with AttributeError on ``None.lower()``.
        if value is not None:
            example[field] = value.lower()
    return example
# Apply the lowercasing function to every example of every split.
wine_dataset = wine_dataset.map(function=to_lowercase)

In [6]:
# Hold out 20% of the rows as a test set. The fixed seed makes the split, and
# every output that depends on it, reproducible after a kernel restart.
split_dataset = wine_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['wine_id', 'country', 'description', 'designation', 'points', 'price'],
    num_rows: 68918
})
Dataset({
    features: ['wine_id', 'country', 'description', 'designation', 'points', 'price'],
    num_rows: 17230
})


In [7]:
# Shuffle with a fixed seed, keep the first 10 rows, and show two of their columns.
wine_sample = train_dataset.shuffle(seed=42).select(range(10))
print(wine_sample["wine_id"], wine_sample["country"], sep="\n")

[7049, 11011, 106846, 22388, 88161, 10774, 70560, 129315, 25980, 7577]
['portugal', 'us', 'us', 'croatia', 'south africa', 'us', 'portugal', 'us', 'us', 'chile']


### Use filter function
Use the `filter` function with two lambda functions to build two subsets of the training set:
1. wines whose country is `italy`
2. wines with more than 90 points

In [8]:
# Subset 1: wines from Italy. Country names were lowercased earlier, hence "italy".
italian_wines = train_dataset.filter(lambda row: row["country"] == "italy")
print("\nFiltered Italian wines sample:")
print(italian_wines[:3])

# Subset 2: wines scoring above 90 points, highest score first.
highly_rated_wines = train_dataset.filter(lambda row: row["points"] > 90).sort(
    "points", reverse=True
)
print("\nHighly rated wines sample:")
print(highly_rated_wines[:3])

Filter:   0%|          | 0/68918 [00:00<?, ? examples/s]


Filtered Italian wines sample:
{'wine_id': [57581, 108811, 62886], 'country': ['italy', 'italy', 'italy'], 'description': ['this concentrated wine offers aromas of tobacco leaf, stewed plum, grilled porcini mushroom and sage. the lush palate delivers blackberry extract layered with notes of chocolate and vanilla alongside velvety, embracing tannins.', 'savory spice and candied fruit notes emerge on the bouquet of this sophisticated wine. it shows tight, silky tannins and delivers a brightly intense finish with notes of dried ginger, cedar and licorice.', 'serpico is a beautiful but also modern expression of aglianico (the grapes come from ancient vines in the taurasi area) that opens with dark notes of blackberry, crushed stone, chocolate, rum cake, licorice and cola. the mouthfeel is smooth, firm and very rich.'], 'designation': ['Bosan Riserva', "Rocche dell'Annunziata", 'Serpico'], 'points': [93, 92, 92], 'price': [115.0, 75.0, 80.0]}


Filter:   0%|          | 0/68918 [00:00<?, ? examples/s]


Highly rated wines sample:
{'wine_id': [7335, 36528, 42197], 'country': ['italy', 'france', 'portugal'], 'description': ["thick as molasses and dark as caramelized brown sugar, the wine oozes out of the bottle releasing concentrated aromas of butterscotch, toffee, honey, licorice, coffee, resin and maple syrup. grapes from the montepulciano area are dried in a ventilated room and the wine ages over 10 years in tiny oak barrels resulting in thick, dark concentration. don't pair it with food: vin santo as good as this deserves to be enjoyed alone, as a so-called “meditation wine.”", 'this is a fabulous wine from the greatest champagne vintage so far this century. still young, it blends almost equal amounts of pinot noir and chardonnay fleshed out with pinot meunier. the minerality and the rich apple and green fruits are balanced, and acidity cuts into the wine with a pure, perfumed line of freshness. the wine could be drunk now, but it will age well into the the 2020s.', "this is the la

## Dataset library with pandas

In [9]:
# Switch the output format in place so indexing the dataset yields pandas objects.
train_dataset.set_format(type="pandas")

In [10]:
# With the pandas format active, this slice displays the first three rows as a table.
train_dataset[:3]

Unnamed: 0,wine_id,country,description,designation,points,price
0,37157,us,golden yellow coloring and deeply perfumed not...,Skin Ferment,85,27.0
1,57581,italy,this concentrated wine offers aromas of tobacc...,Bosan Riserva,93,115.0
2,108467,us,"with each new vintage of cuvée elena, the blen...",Cuvée Elena,94,40.0


In [11]:
# Full slice: pull the whole training split into a single pandas DataFrame.
train_df = train_dataset[:]

In [12]:
# Count how many wines received each score, most common score first.
# rename_axis + reset_index(name=...) names both columns explicitly, so the
# result is ["points", "frequency"] regardless of how the installed pandas
# version labels the output of value_counts().
frequencies = (
    train_df["points"]
    .value_counts()
    .rename_axis("points")
    .reset_index(name="frequency")
)
frequencies.head()

Unnamed: 0,points,frequency
0,88,8802
1,90,8490
2,87,8406
3,91,6634
4,89,6542


In [13]:
# Undo the earlier set_format("pandas") call so the dataset returns its default output again.
train_dataset.reset_format()

## Push your own dataset to the Hugging Face Hub

In [14]:
from huggingface_hub import notebook_login

# Shows a login widget in the notebook; authenticate here before pushing to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:

from datasets import DatasetDict

# Bundle the two splits under their conventional names.
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Upload both splits to the Hub.
# NOTE: the repository id below is a placeholder; replace it with
# "<your account>/wine_review" before running.
dataset_dict.push_to_hub('your_hungginface account/wine_review')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/69 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/586 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Wellyowo/wine_review/commit/d969f5fdf3846db6b6b722bf9bb4952bea454e7c', commit_message='Upload dataset', commit_description='', oid='d969f5fdf3846db6b6b722bf9bb4952bea454e7c', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
# Load the dataset back from the Hub to confirm the upload.
# NOTE: placeholder repository id; it must match the id used when pushing.
remote_dataset = load_dataset(path='your_hungginface account/wine_review')
remote_dataset

Downloading readme:   0%|          | 0.00/586 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.01M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/68918 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17230 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['wine_id', 'country', 'description', 'designation', 'points', 'price'],
        num_rows: 68918
    })
    test: Dataset({
        features: ['wine_id', 'country', 'description', 'designation', 'points', 'price'],
        num_rows: 17230
    })
})