# EDA with Pandas

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


### Creating a `DataFrame` from a `csv` file

In [None]:
# Load the CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv("data/SwissMunicipalities.csv")


### Explore

In [None]:
# Show the first rows of the DataFrame.
df.head()


In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()


In [None]:
# Line plot of the Area column against the DataFrame index.
# The trailing semicolon keeps the notebook from echoing the returned object.
df["Area"].plot.line();


In [None]:
# Plot the Area values in ascending order (x axis is the rank position).
# Trailing semicolon added, as in the neighbouring plot cells, so the notebook
# does not echo the list of Line2D objects returned by plt.plot.
plt.plot(df["Area"].sort_values().to_numpy());


In [None]:
# Histograms of the DataFrame's columns, 20 bins each, on a 12x6 figure.
# The trailing semicolon suppresses the echoed return value in the notebook.
df.hist(bins=20, figsize=(12, 6));


### Filtering (conditional distributions)

In [None]:
# Boolean mask: True for every row whose Canton equals "LU".
sel = df["Canton"].eq("LU")
sel


In [None]:
# Keep only the rows flagged True by the mask.
df.loc[sel]


### Sorting

In [None]:
# Five rows with the largest Area, in descending order.
(
    df
    .sort_values(by="Area", ascending=False)
    .head(n=5)
)


In [None]:
# Five rows with the largest Area among the rows whose Canton is "LU".
(
    df.loc[df["Canton"] == "LU"]
    .sort_values(by="Area", ascending=False)
    .head(n=5)
)


### Statistics

In [None]:
# Sum of the Population column over all rows.
df["Population"].sum()


In [None]:
# Sum of Population over the rows whose Canton is "LU".
sel = df["Canton"].eq("LU")
df.loc[sel, "Population"].sum()


### Grouping

In [None]:
# Show the first rows again as a reminder of the columns before grouping.
df.head()


In [None]:
# Number of municipalities per canton (non-null Municipality entries);
# only the first 12 cantons in group-key order are displayed.
df.groupby("Canton")[["Municipality"]].count().head(12)


In [None]:
# Determine the largest municipality (by area) in each canton.
# After sorting by descending Area, the first row seen for each canton is its
# largest municipality. drop_duplicates keeps that row intact, whereas
# GroupBy.first() takes the first non-null value of each column separately and
# can therefore combine values from different municipalities when data is missing.
(
    df.dropna(subset=["Canton"])  # groupby ignored rows without a canton, too
    .sort_values("Area", ascending=False)
    .drop_duplicates(subset="Canton")
    .set_index("Canton")
    .sort_index()  # same canton ordering as the grouped result
    .head(12)
)


In [None]:
# Group by canton and select the rows of the "LU" (Lucerne) group
df.groupby("Canton").get_group("LU")


In [None]:
# Total population of the canton of Lucerne, via the grouped Population column
(
    df.groupby("Canton")["Population"]
    .get_group("LU")
    .sum()
)


In [None]:
# Ten cantons with the highest total population, in descending order
(
    df.groupby("Canton")[["Population"]]
    .sum(numeric_only=True)
    .sort_values(by="Population", ascending=False)
    .head(n=10)
)


In [None]:
%matplotlib inline
fig, ax = plt.subplots(figsize=(12, 5))
ax.xaxis.set_major_locator(plt.MaxNLocator(26))
df.groupby("Canton") \
    .sum(numeric_only=True)['Population'] \
    .sort_values(ascending=False) \
    .plot.bar(zorder=2)
plt.grid(True)
plt.title('Population per Canton')
