In [2]:
import pandas as pd
import numpy as np
import os

df = pd.read_pickle(os.path.join('_datasets', 'data_frame.pickle'))

In [3]:
# ITERATION
small_df = df.iloc[49980:50019, :].copy()
grouped = small_df.groupby('artist')
type(grouped)

pandas.core.groupby.DataFrameGroupBy

In [4]:
for name, group_df in grouped:
    print(name)
    print(group_df)
    break

Frost, Sir Terry
                artist            title               medium  year  \
id                                                                   
4704  Frost, Sir Terry        Blue Moon  Lithograph on paper  1952   
4705  Frost, Sir Terry      Boat Shapes     Linocut on paper  1952   
4706  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954   
4707  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954   
4708  Frost, Sir Terry            Leeds    Drypoint on paper  1956   
4709  Frost, Sir Terry  Camping, Anduze     Etching on paper  1979   
4710  Frost, Sir Terry     Umea, Sweden     Etching on paper  1979   
4711  Frost, Sir Terry    Self-Portrait     Etching on paper  1980   

      acquisitionYear width height units  
id                                        
4704           1983.0   355    273    mm  
4705           1983.0   132    143    mm  
4706           1983.0   131    155    mm  
4707           1983.0   193    267    mm  
4708           1983.0   

In [5]:
# Aggregate
# Mins
for name, group_df in small_df.groupby('artist'): 
    min_year = group_df['acquisitionYear'].min()
    print("{}: {}".format(name, min_year))

Frost, Sir Terry: 1983.0
Phillips, Esq Tom: 1983.0
Wols: 1983.0


In [6]:
# Transform
# Equivalent of editing by hand:
# Make a case when there is no data to infer
# small_df.loc[[11838, 16441], 'medium'] = np.nan
def fill_values(series):
    values_counted = series.value_counts()
    if values_counted.empty:
        return series
    most_frequent = values_counted.index[0]
    new_medium = series.fillna(most_frequent)
    return new_medium

In [7]:
def transform_df(source_df):
    group_dfs = []  
    for name, group_df in source_df.groupby('artist'):
        filled_df = group_df.copy()
        filled_df.loc[:, 'medium'] = fill_values(group_df['medium'])
        group_dfs.append(filled_df)
    
    new_df = pd.concat(group_dfs)
    return new_df

In [8]:
# Now check the result
filled_df = transform_df(small_df)

In [9]:
# BUILT-INS
# Transform
grouped_mediums = small_df.groupby('artist')['medium']
small_df.loc[:, 'medium'] = grouped_mediums.transform(fill_values)


In [11]:
# Min
df.groupby('artist').agg(np.min)


Unnamed: 0_level_0,title,acquisitionYear
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
?British School,"Portrait of a Gentleman, probably of the West ...",1927.0
"Abakanowicz, Magdalena",Abakan Orange,2009.0
"Abbey, Edwin Austin",Illustration to ‘Judith Shakespeare’,1924.0
"Abbott, Berenice",Dinty Moore Antiques,2010.0
"Abbott, Lemuel Francis","Henry Byne, of Carshalton",1885.0
"Abrahams, Ivor",A Dream Within a Dream,1975.0
Absalon,Assassinations,1997.0
"Abts, Tomma",Noeme,2006.0
"Acconci, Vito",3 Flags for 1 Space and 6 Regions,1982.0
"Ackling, Roger",Five Sunsets in One Hour,1983.0


In [12]:
df.groupby('artist').min()

Unnamed: 0_level_0,title,acquisitionYear
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
?British School,"Portrait of a Gentleman, probably of the West ...",1927.0
"Abakanowicz, Magdalena",Abakan Orange,2009.0
"Abbey, Edwin Austin",Illustration to ‘Judith Shakespeare’,1924.0
"Abbott, Berenice",Dinty Moore Antiques,2010.0
"Abbott, Lemuel Francis","Henry Byne, of Carshalton",1885.0
"Abrahams, Ivor",A Dream Within a Dream,1975.0
Absalon,Assassinations,1997.0
"Abts, Tomma",Noeme,2006.0
"Acconci, Vito",3 Flags for 1 Space and 6 Regions,1982.0
"Ackling, Roger",Five Sunsets in One Hour,1983.0


In [15]:
# Filter
grouped_titles = df.groupby('title')
title_counts = grouped_titles.size().sort_values(ascending=False)

In [16]:
condition = lambda x: len(x.index) > 1
dup_titles_df = grouped_titles.filter(condition)
dup_titles_df.sort_values('title', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
