In [1]:
# testGroupby.ipynb
import pandas as pd

df = pd.read_csv('c:\\work2\\gapminder.tsv', sep='\t')

In [2]:
avg_life_exp_by_year = df.groupby("year").lifeExp.mean()
print(avg_life_exp_by_year)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [3]:
years = df.year.unique()
print(years)

[1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007]


In [4]:
#분할(필터링)
y1952 = df.loc[df.year == 1982, :]
print(y1952.head())

        country continent  year  lifeExp       pop    gdpPercap
6   Afghanistan      Asia  1982   39.854  12881816   978.011439
18      Albania    Europe  1982   70.420   2780097  3630.880722
30      Algeria    Africa  1982   61.368  20033753  5745.160213
42       Angola    Africa  1982   39.942   7016384  2756.953672
54    Argentina  Americas  1982   69.942  29341374  8997.897412


In [5]:
y1952_mean = y1952.lifeExp.mean()
print(y1952_mean)

61.53319718309859


In [7]:
y1957 = df.loc[df.year == 1957, :]
y1957_mean = y1957.lifeExp.mean()
print(y1957_mean)

51.50740112676056


In [8]:
y1962 = df.loc[df.year == 1962, :]
y1962_mean = y1962.lifeExp.mean()
print(y1962_mean)

53.609249014084504


In [9]:
y2007 = df.loc[df.year == 2007, :]
y2007_mean = y2007.lifeExp.mean()
print(y2007_mean)

67.00742253521126


In [10]:
#결합하는 단계
df2 = pd.DataFrame({'year':[1952,1957,1962,2007],'myMean':[y1952_mean,y1957_mean,y1962_mean,y2007_mean]})
print(df2)

   year     myMean
0  1952  61.533197
1  1957  51.507401
2  1962  53.609249
3  2007  67.007423


In [12]:
#직접 집계 함수를 정의
def my_mean(values):
    n = len(values)
    sum = 0
    for value in values:
        sum += value
    return sum / n

In [13]:
agg_my_mean = df.groupby('year').lifeExp.agg(my_mean)
print(agg_my_mean)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [14]:
#이번에는 2개의 인자값을 받아 처리하는 사용자정의함수를 만들어본다.
#다음은 첫번째 인자로 받은 열의 평균값을 구하여 두번째 인자로 받은 값과의
#차이를 계산한 다음 반환하는 함수이다.
def my_mean_diff(values, diff_value):
    n = len(values)
    sum = 0
    for value in values:
        sum += value 
    mean = sum / n 
    return mean - diff_value

In [15]:
#전체 평균
global_mean = df.lifeExp.mean()
print(global_mean)

59.474439366197174


In [16]:
agg_mean_diff = df.groupby('year').lifeExp.agg(my_mean_diff, diff_value = global_mean)
print(agg_mean_diff)

year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64


In [17]:
import seaborn as sns
import numpy as np

np.random.seed(42)
tips_10 = sns.load_dataset('tips').sample(10)
tips_10.loc[np.random.permutation(tips_10.index)[:4], 'total_bill'] = np.NaN
print(tips_10)

     total_bill   tip     sex smoker   day    time  size
24        19.82  3.18    Male     No   Sat  Dinner     2
6          8.77  2.00    Male     No   Sun  Dinner     2
153         NaN  2.00    Male     No   Sun  Dinner     4
211         NaN  5.16    Male    Yes   Sat  Dinner     4
198         NaN  2.00  Female    Yes  Thur   Lunch     2
176         NaN  2.00    Male    Yes   Sun  Dinner     2
192       28.44  2.56    Male    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
9         14.78  3.23    Male     No   Sun  Dinner     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2


In [18]:
count_sex = tips_10.groupby('sex').count()
print(count_sex)

        total_bill  tip  smoker  day  time  size
sex                                             
Male             4    7       7    7     7     7
Female           2    3       3    3     3     3


In [22]:
def fill_na_mean(x):
    avg = x.mean()
    return x.fillna(avg)

In [24]:
total_bill_group_mean = tips_10.groupby('sex').total_bill.transform(fill_na_mean)
tips_10['fill_total_bill'] = total_bill_group_mean
print(tips_10)

     total_bill   tip     sex smoker   day    time  size  fill_total_bill
24        19.82  3.18    Male     No   Sat  Dinner     2          19.8200
6          8.77  2.00    Male     No   Sun  Dinner     2           8.7700
153         NaN  2.00    Male     No   Sun  Dinner     4          17.9525
211         NaN  5.16    Male    Yes   Sat  Dinner     4          17.9525
198         NaN  2.00  Female    Yes  Thur   Lunch     2          13.9300
176         NaN  2.00    Male    Yes   Sun  Dinner     2          17.9525
192       28.44  2.56    Male    Yes  Thur   Lunch     2          28.4400
124       12.48  2.52  Female     No  Thur   Lunch     2          12.4800
9         14.78  3.23    Male     No   Sun  Dinner     2          14.7800
101       15.38  3.00  Female    Yes   Fri  Dinner     2          15.3800


In [25]:
tips = sns.load_dataset('tips')
print(tips.shape)

(244, 7)


In [26]:
print(tips['size'].value_counts())

2    156
3     38
4     37
5      5
1      4
6      4
Name: size, dtype: int64


In [30]:
tips_filtered = tips.groupby('size').filter(lambda x:x['size'].count() >= 30)
print(tips_filtered.shape)
print(tips_filtered['size'].value_counts())

(231, 7)
2    156
3     38
4     37
Name: size, dtype: int64


In [32]:
#일반함수로 정의
def myFilter(x):
    return x['size'].count() >= 30

tips_filtered = tips.groupby('size').filter(myFilter)
print(tips_filtered.shape)
print(tips_filtered['size'].value_counts())

(231, 7)
2    156
3     38
4     37
Name: size, dtype: int64
