In [1]:
import pandas as pd
from glob import glob
import numpy as np

# Load all files for this assignment

In [2]:
# glob() returns matches in arbitrary, OS-dependent order; sort them so the
# positional lookups used later in the notebook stay stable across machines.
files = sorted(glob("ex5/*.csv"))
files

['ex5\\billboard.csv',
 'ex5\\customers.csv',
 'ex5\\orders.csv',
 'ex5\\ramen-ratings.csv',
 'ex5\\winemag-data-130k-v2.csv']

# Exercise 1

#### Step 1

In [3]:
# Select the wine-review CSV by file name instead of by position, because the
# order of the glob results is not guaranteed.
wine_csv = next(path for path in files if path.endswith("winemag-data-130k-v2.csv"))
# index_col=0 uses the first CSV column as the row index.
df = pd.read_csv(wine_csv, index_col=0)
df.head(2)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos


##### Load only Prosecco reviews

In [4]:
# Keep only the reviews whose variety is exactly 'Prosecco'.
is_prosecco = df['variety'].eq('Prosecco')
prosseco_reviews = df.loc[is_prosecco]
prosseco_reviews.head(2)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
315,Italy,"Honey, almond blossom and citrus zest are foll...",Extra Dry,86,15.0,Veneto,Prosecco di Valdobbiadene,,,,Bellussi NV Extra Dry (Prosecco di Valdobbiad...,Prosecco,Bellussi
319,Italy,Paladin produces a luminous and fresh Prosecco...,Millesimato Brut,86,20.0,Veneto,Veneto,,,,Paladin 2007 Millesimato Brut Prosecco (Veneto),Prosecco,Paladin


#### Step 2

##### Data frame with points greater than 89

In [5]:
# Columns kept when building the filtered Prosecco data frames.
columns = [
    'title',
    'price',
    'points',
]

In [6]:
# Prosecco reviews scoring above 89 points, restricted to the selected columns.
scored_above_89 = prosseco_reviews['points'].gt(89)
good_prosecco = prosseco_reviews.loc[scored_above_89, columns]
good_prosecco.head(4)

Unnamed: 0,title,price,points
9934,Sorelle Bronca NV Extra Dry Particella 68 (Pr...,,90
40904,Bortolomiol 2008 Cartizze Dry (Prosecco Super...,30.0,90
40916,Nino Franco 2007 Rive di San Floriano Brut (P...,30.0,90
47311,Ruggeri & C. 2007 Giustino B. Extra Dry (Pros...,36.0,91


##### Data frame where points are less than 85

In [7]:
# Prosecco reviews scoring below 85 points, restricted to the selected columns.
scored_below_85 = prosseco_reviews['points'].lt(85)
bad_prosecco = prosseco_reviews.loc[scored_below_85, columns]
bad_prosecco.head(4)

Unnamed: 0,title,price,points
7159,Tosti NV Prosecco (Italy),15.0,83
15600,Le Vigne di Alice 2007 Millesimato Doro Brut ...,,81
36713,Cantina San Martino NV Pittaro Extra Dry (Pro...,15.0,84
39925,Lisabella NV Gran Resèe Prosecco (Colli Trevig...,12.0,84


#### Step 3
Add title length

In [8]:
# Add the character count of each title as a new 'title_length' column.
# The vectorised .str.len() accessor replaces apply(len): it is the idiomatic
# pandas form and yields NaN for a missing title instead of raising TypeError.
good_prosecco = good_prosecco.assign(
    title_length=good_prosecco['title'].str.len()
)
good_prosecco.head(4)

Unnamed: 0,title,price,points,title_length
9934,Sorelle Bronca NV Extra Dry Particella 68 (Pr...,,90,70
40904,Bortolomiol 2008 Cartizze Dry (Prosecco Super...,30.0,90,63
40916,Nino Franco 2007 Rive di San Floriano Brut (P...,30.0,90,71
47311,Ruggeri & C. 2007 Giustino B. Extra Dry (Pros...,36.0,91,68


In [9]:
# Mean title length (in characters) across the high-scoring Prosecco reviews.
good_title_lengths = good_prosecco['title_length']
good_char_mean = good_title_lengths.mean()
print(f"Average title length for good wine is {good_char_mean}")

Average title length for good wine is 57.588235294117645


In [10]:
# Add the character count of each title as a new 'title_length' column.
# The vectorised .str.len() accessor replaces apply(len): it is the idiomatic
# pandas form and yields NaN for a missing title instead of raising TypeError.
bad_prosecco = bad_prosecco.assign(
    title_length=bad_prosecco['title'].str.len()
)
bad_prosecco.head(4)

Unnamed: 0,title,price,points,title_length
7159,Tosti NV Prosecco (Italy),15.0,83,25
15600,Le Vigne di Alice 2007 Millesimato Doro Brut ...,,81,73
36713,Cantina San Martino NV Pittaro Extra Dry (Pro...,15.0,84,63
39925,Lisabella NV Gran Resèe Prosecco (Colli Trevig...,12.0,84,51


In [11]:
# Mean title length (in characters) across the low-scoring Prosecco reviews.
bad_char_mean = bad_prosecco['title_length'].mean()
# The message previously said "good wine" (copy-paste slip); this value is
# computed from the bad-wine frame, so label it accordingly.
print(f"Average title length for bad wine is {bad_char_mean}")

Average title length for good wine is 53.73913043478261


# Exercise 2

#### Step 1

In [12]:
# Select the ramen-ratings CSV by file name instead of by position, because
# the order of the glob results is not guaranteed.
ramen_csv = next(path for path in files if path.endswith("ramen-ratings.csv"))
df2 = pd.read_csv(ramen_csv)
df2.head(2)

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0,


##### Inspect the data frame's missing values and data types

In [13]:
# Count the missing entries in each column.
df2.isna().sum()

Review #       0
Brand          0
Variety        0
Style          2
Country        0
Stars          0
Top Ten     2539
dtype: int64

In [14]:
# Show the dtype pandas assigned to each column.
df2.dtypes

Review #     int64
Brand       object
Variety     object
Style       object
Country     object
Stars       object
Top Ten     object
dtype: object

In [15]:
def _raises_value_error_as_float(value):
    """Return True when ``float(value)`` raises ValueError, otherwise False."""
    try:
        float(value)
    except ValueError:
        return True
    return False


# Collect every 'Stars' entry that cannot be parsed as a float.
stars = list(df2['Stars'])
container = []
for s in stars:
    if _raises_value_error_as_float(s):
        container.append(s)
print(f"uncasting values: {container}")

uncasting values: ['Unrated', 'Unrated', 'Unrated']


In [16]:
# Drop the rows whose 'Stars' entry is the literal string 'Unrated'.
df2 = df2.loc[df2['Stars'].ne('Unrated')]

In [17]:
# Convert the 'Stars' column to float64, then display the column dtypes.
df2 = df2.assign(Stars=df2['Stars'].astype('float64'))
df2.dtypes

Review #      int64
Brand        object
Variety      object
Style        object
Country      object
Stars       float64
Top Ten      object
dtype: object

In [18]:
def q10(y):
    """Return the 0.1 quantile (10th percentile) of ``y``.

    Computed with ``np.quantile`` using its default interpolation method.
    """
    lower_decile = np.quantile(y, 0.1)
    return lower_decile


def q90(y):
    """Return the 0.9 quantile (90th percentile) of ``y``.

    Computed with ``np.quantile`` using its default interpolation method.
    """
    upper_decile = np.quantile(y, 0.9)
    return upper_decile

In [21]:
# Per-country mean of the star ratings, plus the q10 and q90 aggregates.
stars_by_country = df2.groupby('Country')['Stars']
df2_group_by_country_mean_qunatiles = stars_by_country.agg(['mean', q10, q90])
df2_group_by_country_mean_qunatiles

Unnamed: 0_level_0,mean,q10,q90
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,3.138636,2.025,4.0
Bangladesh,3.714286,3.25,4.0
Brazil,4.35,4.0,4.8
Cambodia,4.2,3.5,5.0
Canada,2.243902,0.25,3.5
China,3.421893,1.75,4.5
Colombia,3.291667,2.875,3.625
Dubai,3.583333,3.35,3.75
Estonia,3.5,3.3,3.7
Fiji,3.875,3.475,4.175
