## One Hundred Pandas Problems
From: https://github.com/ajcr/100-pandas-puzzles/blob/master/100-pandas-puzzles-with-solutions.ipynb

In [41]:
import pandas as pd
import numpy as np

In [42]:
pd.__version__

'0.25.1'

In [43]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : None
python           : 3.7.4.final.0
python-bits      : 64
OS               : Linux
OS-release       : 5.0.0-37-generic
machine          : x86_64
processor        : x86_64
byteorder        : little
LC_ALL           : None
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 0.25.1
numpy            : 1.17.2
pytz             : 2019.3
dateutil         : 2.8.0
pip              : 19.2.3
setuptools       : 41.4.0
Cython           : 0.29.13
pytest           : 5.2.1
hypothesis       : None
sphinx           : 2.2.0
blosc            : None
feather          : None
xlsxwriter       : 1.2.1
lxml.etree       : 4.4.1
html5lib         : 1.0.1
pymysql          : None
psycopg2         : 2.8.4 (dt dec pq3 ext lo64)
jinja2           : 2.10.3
IPython          : 7.8.0
pandas_datareader: None
bs4              : 4.8.0
bottleneck       : 1.2.1
fastparquet      : None
gcsfs            : None
lxml.etree       : 4.4.1
matplotl

In [44]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [45]:
df = pd.DataFrame(data, index=labels)

In [46]:
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
animal      10 non-null object
age         8 non-null float64
visits      10 non-null int64
priority    10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes


In [48]:
df.describe()

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [49]:
df.iloc[:3] # iloc i for index, which is a-j in this df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [50]:
df.head(3)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [51]:
df.loc[:, ['animal', 'age']] # selects animal and age columns

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [52]:
df[['animal', 'age']] # same as above

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [53]:
df.loc[df.index[[3,4,8]], ['animal', 'age']] # rows 3, 4, 8, cols animal and age

Unnamed: 0,animal,age
d,dog,
e,dog,5.0
i,dog,7.0


In [54]:
df[df['visits'] > 3] # returns where visits > 3. which is none

Unnamed: 0,animal,age,visits,priority


In [55]:
df[df['age'].isnull()] # returns rows where age is null

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


In [56]:
df[(df['animal'] == 'cat') & (df['age'] < 3)]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
f,cat,2.0,3,no


In [57]:
df[df['age'].between(2,4)] # age betwen 2 and 4 inclusive

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
f,cat,2.0,3,no
j,dog,3.0,1,no


In [58]:
df.loc['f','age'] = 1.5 # changes f row and age to 1.5
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [59]:
df['visits'].sum() # adds up visits

19

In [60]:
df.groupby('animal')['age'].mean() # groups animal categories and finds mean of each category

animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64

In [61]:
df.loc['k'] = [5.5, 'dg', 'no', 2] # adds entire row of k 

In [62]:
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7,2,no
j,dog,3,1,no


In [63]:
df.drop('k') # drops row k

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [64]:
df['animal'].value_counts() # counts frequency of each animal 

cat      4
dog      4
snake    2
5.5      1
Name: animal, dtype: int64

In [65]:
df.sort_values(by=['age', 'visits'], ascending=[False, True])

Unnamed: 0,animal,age,visits,priority
k,5.5,dg,no,2
i,dog,7,2,no
e,dog,5,2,no
g,snake,4.5,1,no
j,dog,3,1,no
b,cat,3,3,yes
a,cat,2.5,1,yes
f,cat,1.5,3,no
c,snake,0.5,2,no
h,cat,,1,yes


In [66]:
df['priority'] = df['priority'].map({'yes': True, 'no': False})

In [67]:
%%timeit
df['animal'] = df['animal'].replace('snake', 'python')

352 µs ± 8.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [68]:
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3,3,True
c,python,0.5,2,False
d,dog,,3,True
e,dog,5,2,False
f,cat,1.5,3,False
g,python,4.5,1,False
h,cat,,1,True
i,dog,7,2,False
j,dog,3,1,False


In [69]:
# df.pivot_table(index='animal', columns='visits', values='age', aggfunc='mean')

In [70]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})

In [71]:
df

Unnamed: 0,A
0,1
1,2
2,2
3,3
4,4
5,5
6,5
7,5
8,6
9,7


In [72]:
df.loc[df['A'].shift() != df['A']] # drops duplicates 

Unnamed: 0,A
0,1
1,2
3,3
4,4
5,5
8,6
9,7


In [73]:
df.drop_duplicates(subset='A') # drops duplicates

Unnamed: 0,A
0,1
1,2
3,3
4,4
5,5
8,6
9,7


In [74]:
df = pd.DataFrame(np.random.random(size=(5,3)))
df

Unnamed: 0,0,1,2
0,0.598348,0.202794,0.669614
1,0.591426,0.100841,0.578593
2,0.448106,0.905258,0.022538
3,0.229289,0.354753,0.687044
4,0.968,0.762518,0.139556


In [75]:
df.sub(df.mean(axis=1), axis=0) # subtact mean from each element

Unnamed: 0,0,1,2
0,0.108096,-0.287458,0.179362
1,0.167806,-0.322779,0.154973
2,-0.010528,0.446624,-0.436096
3,-0.194406,-0.068942,0.263349
4,0.344642,0.13916,-0.483802


In [76]:
df.add(df.mean(axis=1), axis=0) # add mean to each element

Unnamed: 0,0,1,2
0,1.0886,0.693046,1.159867
1,1.015045,0.52446,1.002213
2,0.90674,1.363892,0.481172
3,0.652985,0.778449,1.11074
4,1.591358,1.385876,0.762914


In [77]:
df = pd.DataFrame(np.random.random(size=(5,10)), columns=list('abcdefghij'))
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,0.840817,0.151672,0.399958,0.012518,0.785089,0.369122,0.916691,0.171167,0.953421,0.944986
1,0.338204,0.917754,0.019356,0.965859,0.859548,0.7594,0.634011,0.526488,0.619254,0.568569
2,0.965345,0.298566,0.772932,0.992883,0.669145,0.749555,0.721899,0.634939,0.389176,0.48437
3,0.269425,0.31463,0.11148,0.31944,0.845554,0.463271,0.793363,0.271943,0.161893,0.117232
4,0.643009,0.619941,0.053484,0.266994,0.159949,0.532556,0.483337,0.648224,0.51765,0.929389


In [78]:
df.sum().idxmin() # returns the column name of the smallest sum

'c'

In [79]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
df

Unnamed: 0,A
0,1
1,2
2,2
3,3
4,4
5,5
6,5
7,5
8,6
9,7


In [80]:
len(df) - df.duplicated(keep=False).sum() # counts number of duplicates

4

In [81]:
df.A.value_counts() # can also see the summary of values

5    3
7    2
2    2
6    1
4    1
3    1
1    1
Name: A, dtype: int64

In [82]:
len(df.drop_duplicates(keep=False)) # counts number of duplicates

4

In [83]:
df = pd.DataFrame({'A': [1, np.nan, 2, np.nan, 4, 5, np.nan, 5, np.nan, 7, np.nan]
                  , 'B': [np.nan, np.nan, 2, np.nan, 4, np.nan, 5, 5, 6, np.nan, 7]
                  , 'C': [np.nan, 2, 2, np.nan, 4, 5, np.nan, 5, np.nan, np.nan, 7]
                  , 'D': [1, np.nan, 2, np.nan, 4, 5, np.nan, np.nan, 6, 7, np.nan]
                  , 'E': [1, 2, np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 7]
                  , 'F': [np.nan, np.nan, np.nan, np.nan, np.nan, 5, 5, 5, 6, 7, 7]
                  , 'G': [1, np.nan, np.nan, 3, 4, 5, 5, 5, np.nan, np.nan, np.nan]
                  , 'H': [1, np.nan, 2, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 7]
                  , 'I': [1, np.nan, np.nan, np.nan, 4, 5, 5, 5, 6, np.nan, np.nan]
                  , 'J': [1, 2, np.nan, 3, 4, np.nan, 5, np.nan, np.nan, 7, np.nan]})
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,,,1.0,1.0,,1.0,1.0,1.0,1.0
1,,,2.0,,2.0,,,,,2.0
2,2.0,2.0,2.0,2.0,,,,2.0,,
3,,,,,,,3.0,,,3.0
4,4.0,4.0,4.0,4.0,,,4.0,,4.0,4.0
5,5.0,,5.0,5.0,,5.0,5.0,,5.0,
6,,5.0,,,,5.0,5.0,,5.0,5.0
7,5.0,5.0,5.0,,5.0,5.0,5.0,5.0,5.0,
8,,6.0,,6.0,6.0,6.0,,6.0,6.0,
9,7.0,,,7.0,7.0,7.0,,7.0,,7.0


In [84]:
(df.isnull().cumsum(axis=1) == 3).idxmax(axis=1) # finds the 3rd NaN value of each row

0     F
1     D
2     G
3     C
4     H
5     H
6     D
7     A
8     G
9     G
10    G
dtype: object

In [85]:
df = pd.DataFrame({'grps': list('aaabbcaabcccbbc'), 
                   'vals': [12,345,3,1,45,14,4,52,54,23,235,21,57,3,87]})
df

Unnamed: 0,grps,vals
0,a,12
1,a,345
2,a,3
3,b,1
4,b,45
5,c,14
6,a,4
7,a,52
8,b,54
9,c,23


In [86]:
df.groupby('grps')['vals'].nlargest(3).sum(level=0) # funds the sum of the 3 largest values in each group

grps
a    409
b    156
c    345
Name: vals, dtype: int64

In [87]:
df = pd.DataFrame({'A':np.arange(0,101)
                  , 'B':np.random.randint(0,2,101)})
df

Unnamed: 0,A,B
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0
...,...,...
96,96,0
97,97,0
98,98,1
99,99,1


In [92]:
df.groupby(pd.cut(df['A'], np.arange(0,101,10)))['B'].sum() # sum B by consecutive 10's

A
(0, 10]      3
(10, 20]     7
(20, 30]     6
(30, 40]     4
(40, 50]     6
(50, 60]     7
(60, 70]     6
(70, 80]     3
(80, 90]     6
(90, 100]    7
Name: B, dtype: int64

In [93]:
df.groupby(pd.cut(df['A'], np.arange(0,101,5)))['B'].sum() # sum B by consecutive 5's

A
(0, 5]       2
(5, 10]      1
(10, 15]     3
(15, 20]     4
(20, 25]     2
(25, 30]     4
(30, 35]     3
(35, 40]     1
(40, 45]     3
(45, 50]     3
(50, 55]     4
(55, 60]     3
(60, 65]     2
(65, 70]     4
(70, 75]     0
(75, 80]     3
(80, 85]     2
(85, 90]     4
(90, 95]     4
(95, 100]    3
Name: B, dtype: int64

In [96]:
df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm', 
                               'Budapest_PaRis', 'Brussels_londOn'],
              'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
              'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
                   'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )', 
                               '12. Air France', '"Swiss Air"']})

In [97]:
df

Unnamed: 0,From_To,FlightNumber,RecentDelays,Airline
0,LoNDon_paris,10045.0,"[23, 47]",KLM(!)
1,MAdrid_miLAN,,[],<Air France> (12)
2,londON_StockhOlm,10065.0,"[24, 43, 87]",(British Airways. )
3,Budapest_PaRis,,[13],12. Air France
4,Brussels_londOn,10085.0,"[67, 32]","""Swiss Air"""


In [98]:
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)

In [99]:
df

Unnamed: 0,From_To,FlightNumber,RecentDelays,Airline
0,LoNDon_paris,10045,"[23, 47]",KLM(!)
1,MAdrid_miLAN,10055,[],<Air France> (12)
2,londON_StockhOlm,10065,"[24, 43, 87]",(British Airways. )
3,Budapest_PaRis,10075,[13],12. Air France
4,Brussels_londOn,10085,"[67, 32]","""Swiss Air"""


In [100]:
temp = df.From_To.str.split('_', expand=True)
temp.columns = ['From', 'To']

In [101]:
temp

Unnamed: 0,From,To
0,LoNDon,paris
1,MAdrid,miLAN
2,londON,StockhOlm
3,Budapest,PaRis
4,Brussels,londOn


In [102]:
df = df.drop('From_To', axis = 1)
df = df.join(temp)
df

Unnamed: 0,FlightNumber,RecentDelays,Airline,From,To
0,10045,"[23, 47]",KLM(!),LoNDon,paris
1,10055,[],<Air France> (12),MAdrid,miLAN
2,10065,"[24, 43, 87]",(British Airways. ),londON,StockhOlm
3,10075,[13],12. Air France,Budapest,PaRis
4,10085,"[67, 32]","""Swiss Air""",Brussels,londOn


In [103]:
df['Airline'] = df['Airline'].str.extract('([a-zA-Z\s]+)', expand=False).str.strip()
df

Unnamed: 0,FlightNumber,RecentDelays,Airline,From,To
0,10045,"[23, 47]",KLM,LoNDon,paris
1,10055,[],Air France,MAdrid,miLAN
2,10065,"[24, 43, 87]",British Airways,londON,StockhOlm
3,10075,[13],Air France,Budapest,PaRis
4,10085,"[67, 32]",Swiss Air,Brussels,londOn


In [104]:
delays = df['RecentDelays'].apply(pd.Series)

In [105]:
delays

Unnamed: 0,0,1,2
0,23.0,47.0,
1,,,
2,24.0,43.0,87.0
3,13.0,,
4,67.0,32.0,


In [106]:
delays.columns = ['delay_{}'.format(n) for n in range(1, len(delays.columns)+1)]

In [107]:
delays

Unnamed: 0,delay_1,delay_2,delay_3
0,23.0,47.0,
1,,,
2,24.0,43.0,87.0
3,13.0,,
4,67.0,32.0,


In [109]:
df.drop('RecentDelays', axis=1).join(delays)

Unnamed: 0,FlightNumber,Airline,From,To,delay_1,delay_2,delay_3
0,10045,KLM,LoNDon,paris,23.0,47.0,
1,10055,Air France,MAdrid,miLAN,,,
2,10065,British Airways,londON,StockhOlm,24.0,43.0,87.0
3,10075,Air France,Budapest,PaRis,13.0,,
4,10085,Swiss Air,Brussels,londOn,67.0,32.0,
