# Fire up pandas, numpy, and matplotlib

We always start with these imports before using the open-source Python scientific stack.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



# Load a tabular data set

In [2]:
# Read the CSV file into a DataFrame; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('people-example.csv')

# DataFrame basics

In [3]:
df.head()  # view the first five rows of the table (head() defaults to 5 rows)

Unnamed: 0,First Name,Last Name,Country,age
0,Bob,Smith,United States,24
1,Alice,Williams,Canada,23
2,Malcolm,Jone,England,22
3,Felix,Brown,USA,23
4,Alex,Cooper,Poland,23


In [4]:
df.tail()  # view the last five rows of the table (tail() defaults to 5 rows)

Unnamed: 0,First Name,Last Name,Country,age
2,Malcolm,Jone,England,22
3,Felix,Brown,USA,23
4,Alex,Cooper,Poland,23
5,Tod,Campbell,United States,22
6,Derek,Ward,Switzerland,25


# Visualizing DataFrame Contents

In [5]:
# Look at the contents as a table: a bare expression on the last line of a
# cell is displayed as that cell's output.
df

Unnamed: 0,First Name,Last Name,Country,age
0,Bob,Smith,United States,24
1,Alice,Williams,Canada,23
2,Malcolm,Jone,England,22
3,Felix,Brown,USA,23
4,Alex,Cooper,Poland,23
5,Tod,Campbell,United States,22
6,Derek,Ward,Switzerland,25


In [6]:
# If you want Matplotlib visualization to show up on this notebook, 
# rather than popping up a new window, add this line:
%matplotlib notebook

In [7]:
# Show a statistical summary (count, mean, std, min, quartiles, max) for all
# numerical (i.e. non-categorical) columns
df.describe()

Unnamed: 0,age
count,7.0
mean,23.142857
std,1.069045
min,22.0
25%,22.5
50%,23.0
75%,23.5
max,25.0


In [8]:
# Show a brief statistical summary for a categorical column: the number of
# values, the number of unique values, the most frequent value (top) and how
# often it occurs (freq)
df['Country'].describe()

count                 7
unique                6
top       United States
freq                  2
Name: Country, dtype: object

# Inspect columns of the dataset

In [9]:
# Select a single column by its name
df['Country']

0    United States
1           Canada
2          England
3              USA
4           Poland
5    United States
6      Switzerland
Name: Country, dtype: object

In [10]:
# Select the numeric age column by its name
df['age']

0    24
1    23
2    22
3    23
4    23
5    22
6    25
Name: age, dtype: int64

Some simple columnar operations

In [11]:
# Average of all values in the age column
df['age'].mean()

23.142857142857142

In [12]:
# Largest value in the age column
df['age'].max()

25

# Create new columns in our DataFrame

In [13]:
# Show the table as it is before a new column is added
df

Unnamed: 0,First Name,Last Name,Country,age
0,Bob,Smith,United States,24
1,Alice,Williams,Canada,23
2,Malcolm,Jone,England,22
3,Felix,Brown,USA,23
4,Alex,Cooper,Poland,23
5,Tod,Campbell,United States,22
6,Derek,Ward,Switzerland,25


In [14]:
# Build a new 'Full Name' column by joining first and last name with a space,
# element-wise for every row
df['Full Name'] = df['First Name'] + ' ' + df['Last Name']

In [15]:
# Display the table again to confirm the 'Full Name' column is now present
df

Unnamed: 0,First Name,Last Name,Country,age,Full Name
0,Bob,Smith,United States,24,Bob Smith
1,Alice,Williams,Canada,23,Alice Williams
2,Malcolm,Jone,England,22,Malcolm Jone
3,Felix,Brown,USA,23,Felix Brown
4,Alex,Cooper,Poland,23,Alex Cooper
5,Tod,Campbell,United States,22,Tod Campbell
6,Derek,Ward,Switzerland,25,Derek Ward


In [16]:
# Square every age element-wise; the result is not assigned, so df is unchanged
df['age'] ** 2

0    576
1    529
2    484
3    529
4    529
5    484
6    625
Name: age, dtype: int64

# Use the apply function to do an advanced transformation of our data

In [17]:
# Look at the raw Country values: 'USA' and 'United States' both appear
df['Country']

0    United States
1           Canada
2          England
3              USA
4           Poland
5    United States
6      Switzerland
Name: Country, dtype: object

In [18]:
# 'USA' and 'United States' are counted as two different unique values
df['Country'].describe()

count                 7
unique                6
top       United States
freq                  2
Name: Country, dtype: object

In [19]:
def transform_country(country):
    """Normalize a country name.

    Returns 'United States' when ``country`` equals 'USA'; any other value
    is handed back unchanged.
    """
    return 'United States' if country == 'USA' else country

In [20]:
# A country other than 'USA' is returned unchanged
transform_country('Brazil')

'Brazil'

In [21]:
# Any other string, including this alternate spelling, also passes through as-is
transform_country('Brasil')

'Brasil'

In [22]:
# Only the exact string 'USA' is rewritten to 'United States'
transform_country('USA')

'United States'

In [23]:
# apply calls transform_country on every value of the column; the result is
# not assigned here, so df itself is left unchanged
df['Country'].apply(transform_country)

0    United States
1           Canada
2          England
3    United States
4           Poland
5    United States
6      Switzerland
Name: Country, dtype: object

In [24]:
# df still contains 'USA' in row 3 because the previous result was not stored
df

Unnamed: 0,First Name,Last Name,Country,age,Full Name
0,Bob,Smith,United States,24,Bob Smith
1,Alice,Williams,Canada,23,Alice Williams
2,Malcolm,Jone,England,22,Malcolm Jone
3,Felix,Brown,USA,23,Felix Brown
4,Alex,Cooper,Poland,23,Alex Cooper
5,Tod,Campbell,United States,22,Tod Campbell
6,Derek,Ward,Switzerland,25,Derek Ward


In [25]:
# Overwrite the Country column with the transformed values so the change is
# kept in df
df['Country'] = df['Country'].apply(transform_country)

In [26]:
# Confirm that 'USA' in row 3 has been replaced by 'United States'
df

Unnamed: 0,First Name,Last Name,Country,age,Full Name
0,Bob,Smith,United States,24,Bob Smith
1,Alice,Williams,Canada,23,Alice Williams
2,Malcolm,Jone,England,22,Malcolm Jone
3,Felix,Brown,United States,23,Felix Brown
4,Alex,Cooper,Poland,23,Alex Cooper
5,Tod,Campbell,United States,22,Tod Campbell
6,Derek,Ward,Switzerland,25,Derek Ward
