In [1]:
# This is my attempt to solve one hundred pandas puzzles
# Following these exercises here: https://github.com/ajcr/100-pandas-puzzles/blob/master/100-pandas-puzzles.ipynb
# Attempting all exercises before consulting any solutions

# Import pandas under the alias pd

In [2]:
import pandas as pd

# Print the version of pandas that is being used

In [5]:
# The version string of the imported pandas package is a module attribute.
pd.__version__

'0.25.1'

# Print out all the version information of the libraries that are required by the pandas library.

In [7]:
# Prints an environment report: Python/OS details followed by the installed
# version (or None) of pandas and each of its required/optional dependencies.
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : None
python           : 3.7.3.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 19.6.0
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : None
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 0.25.1
numpy            : 1.17.1
pytz             : 2019.1
dateutil         : 2.8.0
pip              : 19.2.3
setuptools       : 40.8.0
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : None
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 2.10.3
IPython          : 7.7.0
pandas_datareader: None
bs4              : None
bottleneck       : None
fastparquet      : None
gcsfs            : None
lxml.etree       : None
matplotlib       : 3.0.3
numexpr          : None
odfp

# Dataframe basics

In [8]:
# Importing numpy
import numpy as np

# Create dataframe from 'data' dictionary and use the 'labels' as the index 

In [9]:
# Column-oriented records for ten animals: each key is a column name and each
# list holds that column's ten values. np.nan marks an unknown age.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}

# Row labels 'a' through 'j', one per record.
labels = list('abcdefghij')

In [10]:
# Build the frame column-wise from the dict; `labels` becomes the row index.
df = pd.DataFrame(data=data, index=labels)

In [11]:
# Show the full frame to check the construction (it has only ten rows).
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


# Display a summary of the basic information about this DataFrame and its data 

In [14]:
# info() prints the index range, each column's non-null count and dtype,
# and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
animal      10 non-null object
age         8 non-null float64
visits      10 non-null int64
priority    10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes


# Return the first 3 rows of the DataFrame df.

In [15]:
# head(n) returns the first n rows by position.
df.head(3)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


# Select just the 'animal' and 'age' columns from the DataFrame df.

In [20]:
# Keep every row (`:`) and only the two requested columns, by label.
df.loc[:, ["animal", "age"]]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


# Select the data in rows [3, 4, 8] and in columns ['animal', 'age'].

In [25]:
# Rows are requested by position and columns by label. Translate the positions
# into index labels (the labels 'a'..'j' are unique) so that rows and columns
# are selected in a single .loc call instead of two chained indexing steps.
df.loc[df.index[[3, 4, 8]], ["animal", "age"]]

Unnamed: 0,animal,age
d,dog,
e,dog,5.0
i,dog,7.0


# Select only the rows where the number of visits is greater than 3.

In [28]:
# Boolean-mask selection on 'visits'. No animal has more than 3 visits,
# so the resulting frame is empty.
greater_than_three = df.loc[df["visits"] > 3]
greater_than_three

Unnamed: 0,animal,age,visits,priority


# Select the rows where the age is missing, i.e. it is NaN.

In [30]:
# isnull() flags the NaN entries of 'age'; use that mask to pick the rows.
missing_ages = df.loc[df["age"].isnull()]
missing_ages

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


# Select the rows where the animal is a cat and the age is less than 3.

In [35]:
# Both conditions must hold element-wise, hence `&` with each comparison
# parenthesised. Rows with a NaN age compare False and are dropped.
filt = (df["age"] < 3) & (df["animal"] == "cat")
df.loc[filt]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
f,cat,2.0,3,no


# Select the rows where the age is between 2 and 4 (inclusive).

In [42]:
# between() includes both endpoints by default, so this is 2 <= age <= 4.
# Rows with a NaN age evaluate to False and are excluded.
filt = df["age"].between(2, 4)
df[filt]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
f,cat,2.0,3,no
j,dog,3.0,1,no


# Change the age in row 'f' to 1.5.

In [48]:
# A single cell is being set, so assign the new age as a plain scalar rather
# than wrapping it in a one-element list.
# NOTE: this mutates df in place, so every later cell sees the updated age.
df.loc["f", "age"] = 1.5
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


#  Calculate the sum of all visits in df (i.e. find the total number of visits).

In [50]:
# Add up the whole 'visits' column to get the total number of visits.
total_visits = df.loc[:, "visits"].sum()
print(f"Sum of all visits is {total_visits}")

Sum of all visits is 19


# Calculate the mean age for each different animal in df.

In [51]:
# Group the ages by the animal column and average each group.
# mean() skips NaN ages, so animals with unknown age do not distort the result.
df["age"].groupby(df["animal"]).mean()

animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64