# 1. Pandas Basics

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')

> DataFrame is like a python dictionary

In [2]:
web_stats = { 'Day' : [1,2,3,4],
              'Visitors' : [43,53,34,45],
              'Bounce_rate' : [65,72,62,64]
            }
df = pd.DataFrame(web_stats)
df

Unnamed: 0,Bounce_rate,Day,Visitors
0,65,1,43
1,72,2,53
2,62,3,34
3,64,4,45


In [3]:
df.head(2)

Unnamed: 0,Bounce_rate,Day,Visitors
0,65,1,43
1,72,2,53


### Why do we have a index?

**How your data is related?** For example, time series data has time as index.

In [4]:
# setting index
df.set_index('Day')

Unnamed: 0_level_0,Bounce_rate,Visitors
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
1,65,43
2,72,53
3,62,34
4,64,45


**In-place operation**

In [5]:
df.set_index('Day',inplace=True)
df

Unnamed: 0_level_0,Bounce_rate,Visitors
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
1,65,43
2,72,53
3,62,34
4,64,45


### To reference a specific column

In [7]:
df['Visitors']

Day
1    43
2    53
3    34
4    45
Name: Visitors, dtype: int64

In [16]:
# also
df.Visitors

Day
1    43
2    53
3    34
4    45
Name: Visitors, dtype: int64

### Reference multiple columns

In [18]:
df[['Bounce_rate','Visitors']]

Unnamed: 0_level_0,Bounce_rate,Visitors
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
1,65,43
2,72,53
3,62,34
4,64,45


### Convert DataFrame to list

In [19]:
df.Visitors.tolist()

[43, 53, 34, 45]

### Convert more than one column to a numpy array

In [20]:
import numpy as np
np.array(df[['Visitors', 'Bounce_rate']])

array([[43, 65],
       [53, 72],
       [34, 62],
       [45, 64]])

### Convert numpy array to DataFrame

In [21]:
pd.DataFrame(np.array([[1,2],[3,4]]))

Unnamed: 0,0,1
0,1,2
1,3,4


_____________________________________________________________________________________________

# 2. Pandas IO

In [34]:
df = pd.read_csv('../data/housing.csv')
df.head()

Unnamed: 0,Date,Value
0,2016-06-30,2.861789
1,2016-05-31,2.943012
2,2015-12-31,3.394777
3,2015-10-31,2.487805
4,2015-09-30,3.629032


In [26]:
# set index as Date
df.set_index('Date',inplace=True)

### Save to csv

In [27]:
df.to_csv('../data/date_indexed_housing.csv')

In [28]:
# read again
df = pd.read_csv('../data/date_indexed_housing.csv')
df

Unnamed: 0,Date,Value
0,2016-06-30,2.861789
1,2016-05-31,2.943012
2,2015-12-31,3.394777
3,2015-10-31,2.487805
4,2015-09-30,3.629032
5,2015-08-31,3.137682


### Set index column while reading

In [35]:
df = pd.read_csv('../data/date_indexed_housing.csv',index_col=0)
df

Unnamed: 0_level_0,Value
Date,Unnamed: 1_level_1
2016-06-30,2.861789
2016-05-31,2.943012
2015-12-31,3.394777
2015-10-31,2.487805
2015-09-30,3.629032
2015-08-31,3.137682


### Set column name

In [36]:
df.columns = ['Home Price Index']
df

Unnamed: 0_level_0,Home Price Index
Date,Unnamed: 1_level_1
2016-06-30,2.861789
2016-05-31,2.943012
2015-12-31,3.394777
2015-10-31,2.487805
2015-09-30,3.629032
2015-08-31,3.137682


In [37]:
# save again
df.to_csv('../data/new_data.csv')

In [38]:
# save without column header names
df.to_csv('../data/new_data_no_header.csv', header=False)

### Read a file without column names and manually insert column names

In [40]:
df = pd.read_csv('../data/new_data_no_header.csv',names=['Date','House Pricing'], index_col=0) # set index also
df

Unnamed: 0_level_0,House Pricing
Date,Unnamed: 1_level_1
2016-06-30,2.861789
2016-05-31,2.943012
2015-12-31,3.394777
2015-10-31,2.487805
2015-09-30,3.629032
2015-08-31,3.137682


### Convert to HTML

In [43]:
df.to_html('../data/housing.html')

### Remove index

In [45]:
df.reset_index(inplace=True)
df

Unnamed: 0,Date,House Pricing
0,2016-06-30,2.861789
1,2016-05-31,2.943012
2,2015-12-31,3.394777
3,2015-10-31,2.487805
4,2015-09-30,3.629032
5,2015-08-31,3.137682


### Rename columns

In [47]:
df.rename( columns= {'Housing Pricing' : 'HPI'}, inplace=True)

In [48]:
df

Unnamed: 0,Date,House Pricing
0,2016-06-30,2.861789
1,2016-05-31,2.943012
2,2015-12-31,3.394777
3,2015-10-31,2.487805
4,2015-09-30,3.629032
5,2015-08-31,3.137682


_____________________________________________________________________________________________

# Building Dataset