In [82]:
%matplotlib inline
import matplotlib.pyplot as plt
from scipy import stats

In [7]:
import pandas as pd
import numpy as np

drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF


In [45]:
# 결측데이터 처리하기 : 기타 대륙으로 통합
drinks.isnull().sum()
drinks['continent']=drinks['continent'].astype(str)
drinks['continent'].replace('nan','Others',regex=True).head(10)

0        AS
1        EU
2        AF
3        EU
4        AF
5    Others
6        SA
7        EU
8        OC
9        EU
Name: continent, dtype: object

In [47]:
# 전체 평균보다 많은 알코올을 섭취하는 대륙 구하기
avg = drinks['total_litres_of_pure_alcohol'].mean()
continent_mean = drinks.groupby('continent').total_litres_of_pure_alcohol.mean()
print(continent_mean)
continent_over = continent_mean[continent_mean>avg]
continent_over

continent
AF     3.007547
AS     2.170455
EU     8.617778
OC     3.381250
SA     6.308333
nan    5.995652
Name: total_litres_of_pure_alcohol, dtype: float64


continent
EU     8.617778
SA     6.308333
nan    5.995652
Name: total_litres_of_pure_alcohol, dtype: float64

In [44]:
# 평균 beer_servings이 가장 높은 대륙 구하기
beer_continent = drinks.groupby('continent').beer_servings.mean().idxmax()
continent_avg_bs = drinks.groupby('continent').beer_servings.mean().sort_values(ascending=False)[0]

print(beer_continent,' : ',continent_avg_bs)

EU  :  193.77777777777777


In [48]:
# 대륙별 spirit_servings의 평균, 최소, 최대, 합계 구하기
result = drinks.groupby('continent').spirit_servings.agg(['mean', 'max', 'min', 'sum'])
result

Unnamed: 0_level_0,mean,max,min,sum
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AF,16.339623,152,0,866
AS,60.840909,326,0,2677
EU,132.555556,373,0,5965
OC,58.4375,254,0,935
SA,114.75,302,25,1377
,165.73913,438,68,3812


In [111]:
# 술 소비량 대비 알콜 비율에 대한 칼럼 만들어서 병합하기 -> 독하게 술을 마시는 나라
drinks['alcohol_rate'] = drinks['total_litres_of_pure_alcohol'] / (drinks['beer_servings']+drinks['spirit_servings']+drinks['wine_servings'])
drinks['alcohol_rate'] = drinks['alcohol_rate'].fillna(0)
drinks['alcohol_rank'] = drinks['alcohol_rate'].rank(ascending=False)
drinks['alcohol_rank'] = drinks['alcohol_rank'].apply(np.floor)
drinks.head()
# drinks['alcohol_rate'].sort_values(ascending=False)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,alcohol_rate,alcohol_rank
0,Afghanistan,0,0,0,0.0,AS,0.0,187.0
1,Albania,89,132,54,4.9,EU,0.017818,120.0
2,Algeria,25,0,14,0.7,AF,0.017949,77.0
3,Andorra,245,138,312,12.4,EU,0.017842,114.0
4,Angola,217,57,45,5.9,AF,0.018495,49.0


In [119]:
# 전체 순위 중 한국의 순위 구하기
drinks.loc[drinks['country'] == 'South Korea'].alcohol_rank
drinks['alcohol_rank'].iloc[0]

187.0

In [121]:
country_rank = drinks[['country','alcohol_rank']]
country_rank = country_rank.sort_values(by=['alcohol_rank'],ascending=True)
country_rank.head(16)

Unnamed: 0,country,alcohol_rank
63,Gambia,1.0
153,Sierra Leone,2.0
124,Nigeria,3.0
179,Uganda,4.0
142,Rwanda,5.0
183,Tanzania,6.0
26,Burkina Faso,7.0
33,Central African Republic,8.0
28,Cote d'Ivoire,9.0
104,Mali,10.0


In [122]:
# 아시아와 유럽간의 술 소비량 대비 알콜 비율의 차이 검정하기

asia = drinks.loc[drinks['continent']=='AS']
europe = drinks.loc[drinks['continent']=='EU']

tTestResult = stats.ttest_ind(asia['alcohol_rate'],europe['alcohol_rate'])
tTestResult

Ttest_indResult(statistic=-0.7364127575786211, pvalue=0.46346062556694645)

In [124]:
tTestResultDiffVar = stats.ttest_ind(asia['alcohol_rate'], europe['alcohol_rate'], equal_var=False)
tTestResultDiffVar

Ttest_indResult(statistic=-0.7304873446825627, pvalue=0.4680909670453398)