# Introduction to Pandas


Before we can use pandas, we need to import it (by convention under the alias `pd`).

In [1]:
import pandas as pd

In [2]:
pd.__version__

'1.2.1'

## Reading data into Pandas



In [12]:
# List of dictionaries: each dictionary describes one website, and all of
# them share the same four keys.
websites = [
    dict(site='Twitter', type='Social Media', views=10000, active_users=200000),
    dict(site='Facebook', type='Social Media', views=35000, active_users=500000),
    dict(site='NYT', type='News media', views=78000, active_users=156000),
    dict(site='YouTube', type='Video platform', views=18000, active_users=289000),
    dict(site='Vimeo', type='Video platform', views=300, active_users=1580),
    dict(site='USA Today', type='News media', views=4800, active_users=5608),
]

In [13]:
websites

[{'site': 'Twitter',
  'type': 'Social Media',
  'views': 10000,
  'active_users': 200000},
 {'site': 'Facebook',
  'type': 'Social Media',
  'views': 35000,
  'active_users': 500000},
 {'site': 'NYT', 'type': 'News media', 'views': 78000, 'active_users': 156000},
 {'site': 'YouTube',
  'type': 'Video platform',
  'views': 18000,
  'active_users': 289000},
 {'site': 'Vimeo',
  'type': 'Video platform',
  'views': 300,
  'active_users': 1580},
 {'site': 'USA Today',
  'type': 'News media',
  'views': 4800,
  'active_users': 5608}]

In [18]:
pd.DataFrame(websites)

Unnamed: 0,site,type,views,active_users
0,Twitter,Social Media,10000,200000
1,Facebook,Social Media,35000,500000
2,NYT,News media,78000,156000
3,YouTube,Video platform,18000,289000
4,Vimeo,Video platform,300,1580
5,USA Today,News media,4800,5608


In [19]:
df_websites = pd.DataFrame(websites)

In [20]:
# Read from a CSV file.
# NOTE: this rebinds `df_websites`, replacing the dataframe that was built
# from the list of dictionaries in the previous cell. 'websites.csv' is the
# file written by the `to_csv` cell in the "Saving the dataframe" section, so
# it must already exist on disk when this cell runs.
df_websites = pd.read_csv('websites.csv')

In [21]:
df_websites

Unnamed: 0,site,type,views,active_users
0,Twitter,Social Media,10000,200000
1,Facebook,Social Media,35000,500000
2,NYT,News media,78000,156000
3,YouTube,Video platform,18000,289000
4,Vimeo,Video platform,300,1580
5,USA Today,News media,4800,5608


## Exploring this dataset

Which columns are available?

In [9]:
df_websites.columns

Index(['active_users', 'site', 'type', 'views'], dtype='object')

Are there missing values?

In [10]:
df_websites.isna().sum()

active_users    0
site            0
type            0
views           0
dtype: int64

Let's look at the first few rows

In [11]:
df_websites.head()

Unnamed: 0,active_users,site,type,views
0,200000,Twitter,Social Media,10000
1,500000,Facebook,Social Media,35000
2,156000,NYT,News media,78000
3,289000,YouTube,Video platform,18000
4,1580,Vimeo,Video platform,300


And now the last few rows

In [12]:
df_websites.tail()

Unnamed: 0,active_users,site,type,views
1,500000,Facebook,Social Media,35000
2,156000,NYT,News media,78000
3,289000,YouTube,Video platform,18000
4,1580,Vimeo,Video platform,300
5,5608,USA Today,News media,4800


Let's look at some descriptive statistics...

In [13]:
df_websites.describe()

Unnamed: 0,active_users,views
count,6.0,6.0
mean,192031.333333,24350.0
std,187954.647813,28977.905376
min,1580.0,300.0
25%,43206.0,6100.0
50%,178000.0,14000.0
75%,266750.0,30750.0
max,500000.0,78000.0


Only numerical variables appear above... let's see the frequencies for the non-numerical variables

In [14]:
df_websites['type'].describe()

count              6
unique             3
top       News media
freq               2
Name: type, dtype: object

This is not very informative... let's try to get the counts per value of the column

In [15]:
df_websites['type'].value_counts()

News media        2
Social Media      2
Video platform    2
Name: type, dtype: int64

Now let's get descriptive statistics per group:

In [16]:
df_websites.groupby('type').describe()

Unnamed: 0_level_0,active_users,active_users,active_users,active_users,active_users,active_users,active_users,active_users,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
News media,2.0,80804.0,106343.203036,5608.0,43206.0,80804.0,118402.0,156000.0,2.0,41400.0,51760.216383,4800.0,23100.0,41400.0,59700.0,78000.0
Social Media,2.0,350000.0,212132.034356,200000.0,275000.0,350000.0,425000.0,500000.0,2.0,22500.0,17677.66953,10000.0,16250.0,22500.0,28750.0,35000.0
Video platform,2.0,145290.0,203236.631049,1580.0,73435.0,145290.0,217145.0,289000.0,2.0,9150.0,12515.790027,300.0,4725.0,9150.0,13575.0,18000.0


This doesn't look so easy to read. Let's transpose this output

Transposing a dataframe swaps its rows and columns: each row becomes a column and each column becomes a row.

In [17]:
df_websites.groupby('type').describe().transpose()

Unnamed: 0,type,News media,Social Media,Video platform
active_users,count,2.0,2.0,2.0
active_users,mean,80804.0,350000.0,145290.0
active_users,std,106343.203036,212132.034356,203236.631049
active_users,min,5608.0,200000.0,1580.0
active_users,25%,43206.0,275000.0,73435.0
active_users,50%,80804.0,350000.0,145290.0
active_users,75%,118402.0,425000.0,217145.0
active_users,max,156000.0,500000.0,289000.0
views,count,2.0,2.0,2.0
views,mean,41400.0,22500.0,9150.0


## Subsetting and slicing

* Let's say I only want some of the **columns** in the dataset
* Or that I only want some of the **rows** in the dataset

### Slicing by column

In [18]:
df_websites.columns

Index(['active_users', 'site', 'type', 'views'], dtype='object')

In [19]:
df_websites[['type',  'views']]

Unnamed: 0,type,views
0,Social Media,10000
1,Social Media,35000
2,News media,78000
3,Video platform,18000
4,Video platform,300
5,News media,4800


In [20]:
df_websites

Unnamed: 0,active_users,site,type,views
0,200000,Twitter,Social Media,10000
1,500000,Facebook,Social Media,35000
2,156000,NYT,News media,78000
3,289000,YouTube,Video platform,18000
4,1580,Vimeo,Video platform,300
5,5608,USA Today,News media,4800


In [21]:
# Selecting columns gives a separate dataframe under its own name;
# `df_websites` itself keeps all of its columns (it is displayed again below
# to show this).
type_views = df_websites[['type',  'views']]

In [22]:
type_views

Unnamed: 0,type,views
0,Social Media,10000
1,Social Media,35000
2,News media,78000
3,Video platform,18000
4,Video platform,300
5,News media,4800


In [23]:
df_websites

Unnamed: 0,active_users,site,type,views
0,200000,Twitter,Social Media,10000
1,500000,Facebook,Social Media,35000
2,156000,NYT,News media,78000
3,289000,YouTube,Video platform,18000
4,1580,Vimeo,Video platform,300
5,5608,USA Today,News media,4800


### Slicing by row (value)

Filtering the dataset based on the values in its columns

In [24]:
df_websites[df_websites['type']=='Social Media']

Unnamed: 0,active_users,site,type,views
0,200000,Twitter,Social Media,10000
1,500000,Facebook,Social Media,35000


In [25]:
df_websites[df_websites['type']!='News media']

Unnamed: 0,active_users,site,type,views
0,200000,Twitter,Social Media,10000
1,500000,Facebook,Social Media,35000
3,289000,YouTube,Video platform,18000
4,1580,Vimeo,Video platform,300


I want the rows that are not news media **and** that have more than 12,000 views

In [26]:
# Each condition needs its own parentheses: `&` binds more tightly than the
# comparisons `!=` and `>`, so without them the expression would be parsed
# differently.
df_websites[(df_websites['type']!='News media') & (df_websites['views'] > 12000)]

Unnamed: 0,active_users,site,type,views
1,500000,Facebook,Social Media,35000
3,289000,YouTube,Video platform,18000


I want the rows that are **either** not news media **or** have more than 12,000 views

In [27]:
# `|` is an inclusive "or": a row is kept when at least one of the two
# conditions holds, so rows that satisfy both (e.g. Facebook) are kept too.
df_websites[(df_websites['type']!='News media') | (df_websites['views'] > 12000)]

Unnamed: 0,active_users,site,type,views
0,200000,Twitter,Social Media,10000
1,500000,Facebook,Social Media,35000
2,156000,NYT,News media,78000
3,289000,YouTube,Video platform,18000
4,1580,Vimeo,Video platform,300


In [28]:
# Keep the filtered rows under their own name so they can be displayed and
# summarised in the following cells.
social_media = df_websites[df_websites['type']=='Social Media']

In [29]:
social_media

Unnamed: 0,active_users,site,type,views
0,200000,Twitter,Social Media,10000
1,500000,Facebook,Social Media,35000


In [30]:
social_media.describe()

Unnamed: 0,active_users,views
count,2.0,2.0
mean,350000.0,22500.0
std,212132.034356,17677.66953
min,200000.0,10000.0
25%,275000.0,16250.0
50%,350000.0,22500.0
75%,425000.0,28750.0
max,500000.0,35000.0


In [31]:
df_websites[df_websites['type']=='Social Media'].describe()

Unnamed: 0,active_users,views
count,2.0,2.0
mean,350000.0,22500.0
std,212132.034356,17677.66953
min,200000.0,10000.0
25%,275000.0,16250.0
50%,350000.0,22500.0
75%,425000.0,28750.0
max,500000.0,35000.0


In [32]:
# Filter the rows and pick the columns in a single `.loc[rows, columns]` call
# instead of chaining two `[]` lookups, which would first build an
# intermediate dataframe holding every column of the matching rows.
socialmediaviews = df_websites.loc[df_websites['type'] == 'Social Media', ['type', 'views']]

In [33]:
socialmediaviews

Unnamed: 0,type,views
0,Social Media,10000
1,Social Media,35000


## Saving the dataframe

For the full list of formats pandas can read and write, see https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

CSV:

In [16]:
# index=False keeps the row index out of the file. With the default
# (index=True) the index is written as an extra, unnamed first column, which
# the `pd.read_csv('websites.csv')` cell earlier in this notebook would then
# load as a spurious 'Unnamed: 0' data column on the next run.
df_websites.to_csv('websites.csv', index=False)

Pickle:

In [35]:
df_websites.to_pickle('websites.pkl')