## One Hundred Pandas Problems
From: https://github.com/ajcr/100-pandas-puzzles/blob/master/100-pandas-puzzles-with-solutions.ipynb

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'0.24.2'

In [3]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 3.7.3.final.0
python-bits: 64
OS: Linux
OS-release: 5.0.0-37-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.24.2
pytest: 4.3.1
pip: 19.0.3
setuptools: 40.8.0
Cython: 0.29.6
numpy: 1.16.2
scipy: 1.2.1
pyarrow: None
xarray: None
IPython: 7.4.0
sphinx: 1.8.5
patsy: 0.5.1
dateutil: 2.8.0
pytz: 2018.9
blosc: None
bottleneck: 1.2.1
tables: 3.5.1
numexpr: 2.6.9
feather: None
matplotlib: 3.0.3
openpyxl: 2.6.1
xlrd: 1.2.0
xlwt: 1.3.0
xlsxwriter: 1.1.5
lxml.etree: 4.3.2
bs4: 4.7.1
html5lib: 1.0.1
sqlalchemy: 1.3.1
pymysql: None
psycopg2: 2.7.6.1 (dt dec pq3 ext lo64)
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
gcsfs: None


In [4]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [5]:
df = pd.DataFrame(data, index=labels)

In [6]:
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
animal      10 non-null object
age         8 non-null float64
visits      10 non-null int64
priority    10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes


In [8]:
df.describe()

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [9]:
df.iloc[:3] # iloc i for index, which is a-j in this df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [10]:
df.head(3)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [11]:
df.loc[:, ['animal', 'age']] # selects animal and age columns

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [12]:
df[['animal', 'age']] # same as above

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [13]:
df.loc[df.index[[3,4,8]], ['animal', 'age']] # rows 3, 4, 8, cols animal and age

Unnamed: 0,animal,age
d,dog,
e,dog,5.0
i,dog,7.0


In [14]:
df[df['visits'] > 3] # returns where visits > 3. which is none

Unnamed: 0,animal,age,visits,priority


In [15]:
df[df['age'].isnull()] # returns rows where age is null

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


In [16]:
df[(df['animal'] == 'cat') & (df['age'] < 3)]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
f,cat,2.0,3,no


In [17]:
df[df['age'].between(2,4)] # age betwen 2 and 4 inclusive

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
f,cat,2.0,3,no
j,dog,3.0,1,no


In [18]:
df.loc['f','age'] = 1.5 # changes f row and age to 1.5
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [19]:
df['visits'].sum() # adds up visits

19

In [20]:
df.groupby('animal')['age'].mean() # groups animal categories and finds mean of each category

animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64

In [21]:
df.loc['k'] = [5.5, 'dg', 'no', 2] # adds entire row of k 

In [22]:
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7,2,no
j,dog,3,1,no


In [23]:
df.drop('k') # drops row k

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [24]:
df['animal'].value_counts() # counts frequency of each animal 

dog      4
cat      4
snake    2
5.5      1
Name: animal, dtype: int64

In [25]:
df.sort_values(by=['age', 'visits'], ascending=[False, True])

Unnamed: 0,animal,age,visits,priority
k,5.5,dg,no,2
i,dog,7,2,no
e,dog,5,2,no
g,snake,4.5,1,no
j,dog,3,1,no
b,cat,3,3,yes
a,cat,2.5,1,yes
f,cat,1.5,3,no
c,snake,0.5,2,no
h,cat,,1,yes


In [26]:
df['priority'] = df['priority'].map({'yes': True, 'no': False})

In [27]:
%%timeit
df['animal'] = df['animal'].replace('snake', 'python')

402 µs ± 27.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [28]:
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3,3,True
c,python,0.5,2,False
d,dog,,3,True
e,dog,5,2,False
f,cat,1.5,3,False
g,python,4.5,1,False
h,cat,,1,True
i,dog,7,2,False
j,dog,3,1,False


In [30]:
# df.pivot_table(index='animal', columns='visits', values='age', aggfunc='mean')

In [43]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})

In [44]:
df

Unnamed: 0,A
0,1
1,2
2,2
3,3
4,4
5,5
6,5
7,5
8,6
9,7


In [45]:
df.loc[df['A'].shift() != df['A']] # drops duplicates 

Unnamed: 0,A
0,1
1,2
3,3
4,4
5,5
8,6
9,7


In [46]:
df.drop_duplicates(subset='A') # drops duplicates

Unnamed: 0,A
0,1
1,2
3,3
4,4
5,5
8,6
9,7


In [35]:
df = pd.DataFrame(np.random.random(size=(5,3)))
df

Unnamed: 0,0,1,2
0,0.101893,0.065434,0.431244
1,0.044797,0.133144,0.245838
2,0.725947,0.104563,0.449826
3,0.21216,0.886005,0.22284
4,0.311381,0.73018,0.582358


In [36]:
df.sub(df.mean(axis=1), axis=0) # subtact mean from each element

Unnamed: 0,0,1,2
0,-0.097631,-0.13409,0.23172
1,-0.096463,-0.008116,0.104579
2,0.299168,-0.322216,0.023047
3,-0.228175,0.44567,-0.217495
4,-0.229926,0.188874,0.041052


In [38]:
df.add(df.mean(axis=1), axis=0) # add mean to each element

Unnamed: 0,0,1,2
0,0.301417,0.264958,0.630768
1,0.186057,0.274404,0.387098
2,1.152726,0.531341,0.876605
3,0.652495,1.32634,0.663175
4,0.852687,1.271486,1.123665


In [39]:
df = pd.DataFrame(np.random.random(size=(5,10)), columns=list('abcdefghij'))
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,0.033698,0.451989,0.856553,0.656887,0.697992,0.933534,0.885163,0.705983,0.459905,0.0531
1,0.119134,0.004515,0.510125,0.398857,0.956061,0.509346,0.333682,0.281161,0.686618,0.392659
2,0.072417,0.927617,0.378037,0.991758,0.019149,0.180673,0.674109,0.943422,0.507744,0.254695
3,0.290766,0.966343,0.598749,0.038079,0.158498,0.592067,0.98948,0.642332,0.276079,0.007372
4,0.30541,0.444487,0.360054,0.025641,0.954633,0.098761,0.961091,0.198314,0.304117,0.444953


In [41]:
df.sum().idxmin() # returns the column name of the smallest sum

'a'

In [48]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
df

Unnamed: 0,A
0,1
1,2
2,2
3,3
4,4
5,5
6,5
7,5
8,6
9,7


In [50]:
len(df) - df.duplicated(keep=False).sum() # counts number of duplicates

4

In [52]:
df.A.value_counts() # can also see the summary of values

5    3
7    2
2    2
6    1
4    1
3    1
1    1
Name: A, dtype: int64

In [53]:
len(df.drop_duplicates(keep=False)) # counts number of duplicates

4

In [54]:
df = pd.DataFrame({'A': [1, np.nan, 2, np.nan, 4, 5, np.nan, 5, np.nan, 7, np.nan]
                  , 'B': [np.nan, np.nan, 2, np.nan, 4, np.nan, 5, 5, 6, np.nan, 7]
                  , 'C': [np.nan, 2, 2, np.nan, 4, 5, np.nan, 5, np.nan, np.nan, 7]
                  , 'D': [1, np.nan, 2, np.nan, 4, 5, np.nan, np.nan, 6, 7, np.nan]
                  , 'E': [1, 2, np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 7]
                  , 'F': [np.nan, np.nan, np.nan, np.nan, np.nan, 5, 5, 5, 6, 7, 7]
                  , 'G': [1, np.nan, np.nan, 3, 4, 5, 5, 5, np.nan, np.nan, np.nan]
                  , 'H': [1, np.nan, 2, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 7]
                  , 'I': [1, np.nan, np.nan, np.nan, 4, 5, 5, 5, 6, np.nan, np.nan]
                  , 'J': [1, 2, np.nan, 3, 4, np.nan, 5, np.nan, np.nan, 7, np.nan]})
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,,,1.0,1.0,,1.0,1.0,1.0,1.0
1,,,2.0,,2.0,,,,,2.0
2,2.0,2.0,2.0,2.0,,,,2.0,,
3,,,,,,,3.0,,,3.0
4,4.0,4.0,4.0,4.0,,,4.0,,4.0,4.0
5,5.0,,5.0,5.0,,5.0,5.0,,5.0,
6,,5.0,,,,5.0,5.0,,5.0,5.0
7,5.0,5.0,5.0,,5.0,5.0,5.0,5.0,5.0,
8,,6.0,,6.0,6.0,6.0,,6.0,6.0,
9,7.0,,,7.0,7.0,7.0,,7.0,,7.0


In [56]:
(df.isnull().cumsum(axis=1) == 3).idxmax(axis=1) # finds the 3rd NaN value of each row

0     F
1     D
2     G
3     C
4     H
5     H
6     D
7     A
8     G
9     G
10    G
dtype: object

In [58]:
df = pd.DataFrame({'grps': list('aaabbcaabcccbbc'), 
                   'vals': [12,345,3,1,45,14,4,52,54,23,235,21,57,3,87]})
df

Unnamed: 0,grps,vals
0,a,12
1,a,345
2,a,3
3,b,1
4,b,45
5,c,14
6,a,4
7,a,52
8,b,54
9,c,23


In [60]:
df.groupby('grps')['vals'].nlargest(3).sum(level=0) # funds the sum of the 3 largest values in each group

grps
a    409
b    156
c    345
Name: vals, dtype: int64

In [76]:
df = pd.DataFrame({'A':np.arange(0,101)
                  , 'B':np.random.randint(0,2,101)})
df

Unnamed: 0,A,B
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0
5,5,0
6,6,0
7,7,1
8,8,1
9,9,1
