## Useful properties and methods

In [11]:
import pandas as pd
import numpy as np

## Create DF:

In next examples we'll be using the `drinks` dataset.

You can get the 'drinks.csv' file from https://github.com/geekcourses/JupyterNotebooksExamples/blob/master/datasets/various/drinks.csv

In [12]:
data = pd.read_csv("./datasets/drinks.csv")
data.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF


In [15]:
data['country'].value_counts()["Albania"]

np.int64(1)

In [17]:
data = {
        'name': ['Maria', 'Petyr', 'Ivan'],
        'age': [23, 21, 34],
        'gender': ['female', 'male', 'male']
    }

index = ['a', 'b', 'c']

df = pd.DataFrame(data, index=index)

print( df['age'].max() )


34


## get indexes and columns

In [4]:
# get rows index
index_labels = data.index

print(index_labels)


RangeIndex(start=0, stop=193, step=1)


In [6]:
# get columns labels

print(data.columns)
# print(list(column_labels))

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'continent'],
      dtype='object')


## DF shape


df.shape in pandas returns a tuple indicating the number of rows and columns in the DataFrame df, formatted as (rows, columns)

In [7]:
data.shape

(193, 6)

## DF info

df.info() in pandas provides a concise summary of a DataFrame df, including the index dtype and column dtypes, non-null counts, and memory usage. It's useful for getting a quick overview of the DataFrame's structure and data types.

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     170 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


## DF describe

df.describe() generates descriptive statistics that summarize the central tendency, dispersion, and shape of a dataset’s distribution, excluding NaN values. It's applicable to numeric columns by default but can also describe object columns or all columns if specified. It provides information such as mean, standard deviation, min, max, and quartiles.

In [10]:
data.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


note that all parameters in describe, are also defined as a separate functions:

In [9]:
data['wine_servings'].mean()

np.float64(49.45077720207254)

## Get only numerical columns:

if you need to get only coumns with numeric data you can use `pd.np.number` as a shorthand for selecting numeric dtypes (e.g., integer, float) supported by pandas.

In [13]:
data_numeric = data.select_dtypes(include=np.int64)
data_numeric.head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0,0,0
1,89,132,54
2,25,0,14
3,245,138,312
4,217,57,45


In [14]:
import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({
    'a': [1, 2] * 3,
    'b': [True, False] * 3,
    'c': [1.0, 2.0] * 3
})

# Select boolean columns
bool_columns = df.select_dtypes(include='bool')
print(bool_columns)
# Select float64 columns
float_columns = df.select_dtypes(include='float64')
print(float_columns)
print(type(float_columns))
# Exclude int64 columns
non_int_columns = df.select_dtypes(exclude=['int64'])
print(non_int_columns)
print(type(non_int_columns))


       b
0   True
1  False
2   True
3  False
4   True
5  False
     c
0  1.0
1  2.0
2  1.0
3  2.0
4  1.0
5  2.0
<class 'pandas.core.frame.DataFrame'>
       b    c
0   True  1.0
1  False  2.0
2   True  1.0
3  False  2.0
4   True  1.0
5  False  2.0
<class 'pandas.core.frame.DataFrame'>


### Statistic notes: mean vs DF.median

The mean (average) of a data set is found by adding all numbers in the data set and then dividing by the number of values in the set. 

The median is the middle value when a <span style="color:red">data set is ordered</span> from least to greatest. If the number of observations is even, then the median is the simple average of the middle two numbers. 

In [15]:
test_df = pd.DataFrame({
	"userName":['Pesho', 'Maria','Ivan'],
	"age": [20, 25, 60]
})
test_df

Unnamed: 0,userName,age
0,Pesho,20
1,Maria,25
2,Ivan,60


In [16]:
mean_age = test_df.age.mean()
median_age = test_df.age.median()

print(mean_age, median_age)

35.0 25.0


## Selecting by max/min values in DF

In [17]:
# Get maximum values of every column
data_numeric.max()

beer_servings      376
spirit_servings    438
wine_servings      370
dtype: int64

In [18]:
# find max values for wine, beer and pure_alcohol
max_wine_servings = data.wine_servings.max()
print(f'max_wine_servings: {max_wine_servings}')

max_pure_alcohol = data.total_litres_of_pure_alcohol.max()
print(f'max_pure_alcohol: {max_pure_alcohol}')

max_beer_servings = data.beer_servings.max()
print(f'max_beer_servings: {max_beer_servings}')

max_wine_servings: 370
max_pure_alcohol: 14.4
max_beer_servings: 376


### Get max values of row(s)

must use df.max(axis=1)

In [21]:
data_numeric

Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0,0,0
1,89,132,54
2,25,0,14
3,245,138,312
4,217,57,45
...,...,...,...
188,333,100,3
189,111,2,1
190,6,0,0
191,32,19,4


In [22]:
data_numeric.max(axis=1).head(5)
data_numeric.max(axis=0).head(5)

beer_servings      376
spirit_servings    438
wine_servings      370
dtype: int64

In [23]:
# find max value from alcohol servings columns in row for Bulgaria
alcohol_servings_columns = ['beer_servings','spirit_servings','wine_servings']

bulgaria_data = data.loc[data.country == 'Bulgaria', alcohol_servings_columns]
# print(bulgaria_data)
bulgaria_data.max(axis=1)

25    252
dtype: int64

### Get label/coulmn of max value over row/column: df.idxmax()

Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.idxmax.html

#### Exmple: find max value from alcohol servings columns for Bulgaria

In [25]:
bulgaria_data

Unnamed: 0,beer_servings,spirit_servings,wine_servings
25,231,252,94


In [26]:
bulgaria_data 
# print(bulgaria_data)
bulgaria_data.idxmax(axis=1)

25    spirit_servings
dtype: object

### Example: find country with max wine servings

In [27]:
wine_max_idx = data.wine_servings.idxmax()
print(wine_max_idx)
data.iloc[wine_max_idx,:]

61


country                         France
beer_servings                      127
spirit_servings                    151
wine_servings                      370
total_litres_of_pure_alcohol      11.8
continent                           EU
Name: 61, dtype: object

## Grouping

Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

### Example: find the max wine_servings per each continent

In [30]:
data.groupby('continent').wine_servings.sum()

continent
AF     862
AS     399
EU    6400
OC     570
SA     749
Name: wine_servings, dtype: int64

In [17]:
data.groupby('continent').wine_servings.max()

continent
AF    233
AS    123
EU    370
OC    212
SA    221
Name: wine_servings, dtype: int64

## Detect missing values

In [31]:
data.isnull()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
188,False,False,False,False,False,False
189,False,False,False,False,False,False
190,False,False,False,False,False,False
191,False,False,False,False,False,False


In [34]:
# check weather any missing values exists:
print(data.isnull().values.any())

# show the sum of missing values per column:
print(data.isnull().sum())


True
country                          0
beer_servings                    0
spirit_servings                  0
wine_servings                    0
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64


data[data[['beer_servings', 'wine_servings']] == 0]## Tasks

1. Select country names of non european countries which have wine servings above the mean
2. Select country names for countries with 0 wine and beer servings
3. Select country name with max wine_servings in SA
<!-- 4. Stack Overflow Annual Developer Survey Insights
   1. Get familiar with Survay Data Schema and load the dataset for 2022 year
   2. Find the mean and median for Salary -->

In [35]:
data 

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
...,...,...,...,...,...,...
188,Venezuela,333,100,3,7.7,SA
189,Vietnam,111,2,1,2.0,AS
190,Yemen,6,0,0,0.1,AS
191,Zambia,32,19,4,2.5,AF


In [53]:
non_eu_data = data[data['continent'] != 'EU']
non_eu_data
wine_servings_mean = non_eu_data.wine_servings.mean()
print(wine_servings_mean)
non_eu_data[non_eu_data.wine_servings > wine_servings_mean].sort_values(by='wine_servings')

21.243243243243242


Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
129,Palau,306,63,23,6.9,OC
146,Samoa,105,18,24,2.6,OC
50,Dominica,52,286,26,6.6,
68,Grenada,199,438,28,11.9,
94,Lebanon,20,55,31,1.9,AS
143,St. Kitts & Nevis,194,205,32,7.7,
177,Turkmenistan,19,71,32,2.2,AS
22,Botswana,173,35,35,5.4,AF
14,Barbados,143,173,36,6.3,
4,Angola,217,57,45,5.9,AF


In [54]:
data

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
...,...,...,...,...,...,...
188,Venezuela,333,100,3,7.7,SA
189,Vietnam,111,2,1,2.0,AS
190,Yemen,6,0,0,0.1,AS
191,Zambia,32,19,4,2.5,AF


In [73]:
# data[['beer_servings', 'wine_servings']] == 0
data[(data[['beer_servings', 'wine_servings']] == 0).all(axis=1)] 

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
13,Bangladesh,0,0,0,0.0,AS
46,North Korea,0,0,0,0.0,AS
79,Iran,0,0,0,0.0,AS
90,Kuwait,0,0,0,0.0,AS
97,Libya,0,0,0,0.0,AF
103,Maldives,0,0,0,0.0,AS
106,Marshall Islands,0,0,0,0.0,OC
107,Mauritania,0,0,0,0.0,AF
111,Monaco,0,0,0,0.0,EU


In [83]:
data[(data.wine_servings == data[data['continent'] == 'SA'].wine_servings.max()) & (data.continent == 'SA')]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
6,Argentina,193,25,221,8.3,SA


In [9]:
def add(a, b):
    print(a + b, end=";")
res = add(1,2)
print(res)



3;None
