In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Build a Series from a plain Python list.
# The displayed result shows the index (left), the values (right) and the dtype (bottom).
x = pd.Series(data=[10, 20, 30, 40, 50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [3]:
# A Series exposes its components as separate attributes.
# .index holds the row labels; since none were supplied it is a default integer range
x.index

RangeIndex(start=0, stop=5, step=1)

In [4]:
# .values returns the underlying data as a NumPy array
x.values

array([10, 20, 30, 40, 50])

In [5]:
# Accessing the data type
# A Series has a single dtype for all of its values. Mixed Python types are still
# possible, but they are then stored under the generic `object` dtype.
x.dtype

dtype('int64')

In [18]:
# Build a Series with explicit string labels instead of the default 0..n-1 range
data = [450, 650, 870]
Sales = pd.Series(data=data, index=['Don', 'Mike', 'Edwin'])
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [19]:
# type() confirms that Sales is a pandas Series
type(Sales)

pandas.core.series.Series

In [20]:
# The index now holds the string labels we supplied, so it is shown as a generic Index
# of labels rather than a RangeIndex
Sales.index

Index([u'Don', u'Mike', u'Edwin'], dtype='object')

In [21]:
# Accessing a value by its index label
Sales["Don"]

450

In [22]:
# Accessing a value by position.
# Use .iloc for this: on a Series with a non-integer index, plain Sales[0] relies on a
# positional fallback that is deprecated (FutureWarning since pandas 2.1).
Sales.iloc[0]

450

In [23]:
# Comparing a Series with a scalar is applied element-wise and returns a boolean
# Series with the same index. Here: which sales are greater than 500?
Sales > 500

Don      False
Mike      True
Edwin     True
dtype: bool

In [25]:
# Indexing with a list of booleans (one per element) keeps only the positions that are True
Sales[[False, True, True]]

Mike     650
Edwin    870
dtype: int64

In [26]:
# To return the values greater than 500, build the boolean mask with the comparison
# and index the Series with that mask
Sales[Sales > 500]

Mike     650
Edwin    870
dtype: int64

In [27]:
# `in` on a Series tests membership in the index (the labels), not in the values
"Don" in Sales

True

In [28]:
# A label that is not in the index gives False
"Sally" in Sales

False

In [29]:
# False: 450 is a value, not an index label, and `in` only looks at the index
450 in Sales

False

In [30]:
# Converting a Series to a dictionary: index labels become keys, data become values
sales_dict = Sales.to_dict()

In [31]:
# Display the resulting dictionary
sales_dict

{'Don': 450, 'Edwin': 870, 'Mike': 650}

In [32]:
# Converting a dictionary back to a Series: keys become the index labels
sales_ser = Series(sales_dict)

In [33]:
# Display the Series rebuilt from the dictionary
sales_ser

Don      450
Edwin    870
Mike     650
dtype: int64

In [34]:
# We can create a new Series from an already existing Series.
# Labels in the new index that are missing from the original get NaN, and the values
# are shown as float64 below because NaN is a floating-point value.
new_sales = Series(Sales, index = ["Don", "Mike", "Sally", "Edwin", "Lucy"])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [35]:
# We can use numpy to check if a single entry is NaN
np.isnan(new_sales["Sally"])

True

In [36]:
# NaN is a float value, not Python's None, so an identity check against None is False
new_sales["Sally"] is None

False

In [37]:
# np.isnan also works element-wise on an entire numeric Series
np.isnan(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [39]:
# Checking for null values using pandas; returns a boolean Series marking the missing entries
pd.isnull(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [40]:
# Naming an index: the name is shown as a header above the labels when the Series is displayed
Sales.index.name = "Sales Person"

In [41]:
# Display Sales to see the index name
Sales

Sales Person
Don      450
Mike     650
Edwin    870
dtype: int64

In [42]:
# Naming a Series: the name appears in the footer line when the Series is displayed
Sales.name = "Total TV Sales"

In [43]:
# Display Sales to see the Series name
Sales

Sales Person
Don      450
Mike     650
Edwin    870
Name: Total TV Sales, dtype: int64

In [44]:
# Creating a DataFrame from a list of rows (each inner list is one row)
data = [["Adrian", 20], ["Bethany", 23], ["Chloe", 41]]

# Column names are given to the constructor; the dtype is set per column afterwards.
# Passing dtype=int to the constructor applies to EVERY column, and pandas >= 2.0
# raises ValueError because the "Name" strings cannot be cast to int
# (older versions silently ignored the dtype instead).
df = pd.DataFrame(data, columns=["Name", "Age"]).astype({"Age": int})
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


In [45]:
# Build a DataFrame from a dict that maps each column name to its list of values
new_dict = {
    'Name': ['Tom', 'Jane', 'Steve', 'Lucy'],
    'Sales': [250, 300, 350, 400],
}

# The dict keys become the column labels; rows get the default 0..n-1 index
df_dict = pd.DataFrame(data=new_dict)
df_dict

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,Steve,350
3,Lucy,400


In [46]:
# Adding a custom index
# The index parameter supplies one label per row, replacing the default 0..n-1 labels
df_dict_index = pd.DataFrame(new_dict, index=['rank1','rank2','rank3','rank4'])
df_dict_index

Unnamed: 0,Name,Sales
rank1,Tom,250
rank2,Jane,300
rank3,Steve,350
rank4,Lucy,400


In [47]:
# Build a DataFrame from a list of dictionaries, one dictionary per row.
# Same data as the dict-of-lists example, just row-oriented instead of column-oriented;
# this is the shape that parsed JSON records usually have.
dict_list = [
    {'Name': 'Tom', 'Sales': 250},
    {'Name': 'Jane', 'Sales': 300},
    {'Name': 'Steve', 'Sales': 350},
    {'Name': 'Lucy', 'Sales': 400},
]

df_dict_list = pd.DataFrame(data=dict_list)
df_dict_list

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,Steve,350
3,Lucy,400


In [48]:
# Creating a DataFrame from a dictionary of Series: first build the two Series.
# east has three quarters, west has four.
east = pd.Series([1000,1200,3400],index=['Q1','Q2','Q3'])
west = pd.Series([1100,1300,2400,3500],index=['Q1','Q2','Q3','Q4'])

# Only the last expression of a cell is displayed automatically, so a bare `east`
# followed by `west` would show west alone; print east explicitly to see both.
print(east)
west

Q1    1100
Q2    1300
Q3    2400
Q4    3500
dtype: int64

In [49]:
# A dict of Series becomes a DataFrame: the keys are the column names and the rows are
# aligned on the Series' index labels. East has no Q4 entry, so that cell is NaN.
# A single Series can be turned into a DataFrame the same way.
df_region = pd.DataFrame({'East':east,'West':west})
df_region

Unnamed: 0,East,West
Q1,1000.0,1100
Q2,1200.0,1300
Q3,3400.0,2400
Q4,,3500


In [50]:
# Once we have a DataFrame, we can add columns by assignment.
# Each list supplies one value per existing row (four here).
df_region['North'] = [2000,3000,2500,4000]
df_region['South'] = [1500,2000,1500,4000]
df_region

Unnamed: 0,East,West,North,South
Q1,1000.0,1100,2000,1500
Q2,1200.0,1300,3000,2000
Q3,3400.0,2400,2500,1500
Q4,,3500,4000,4000


In [51]:
# If we made a mistake and need a different index, we can add the desired labels
# as a new column first and then promote that column to the index

years = ['2016','2017','2018','2019']
df_region['years'] = years
df_region

Unnamed: 0,East,West,North,South,years
Q1,1000.0,1100,2000,1500,2016
Q2,1200.0,1300,3000,2000,2017
Q3,3400.0,2400,2500,1500,2018
Q4,,3500,4000,4000,2019


In [52]:
# set_index makes an existing column the index of the DataFrame.
# NOTE: df_region is overwritten here. Once 'years' is the index it is no longer a column,
# so re-running this cell without first re-adding the 'years' column raises KeyError.
df_region = df_region.set_index('years')
df_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


In [53]:
# reindex conforms the DataFrame to a new list of index labels; it does not shift any data.
# Labels that are left out ('2016') are dropped, and labels that did not exist before
# ('2020', '2021') are added as rows filled with NaN.
new_df = df_region.reindex(['2017','2018','2019','2020','2021'])
new_df

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,1200.0,1300.0,3000.0,2000.0
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [54]:
# reindex can also be used on columns.
# Columns come back in the order given, unlisted ones ('West') are dropped, and a name
# that was not present before ('New') is added as an all-NaN column.
re_indexed = new_df.reindex(columns=['North','East','South','New'])
re_indexed

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2019,4000.0,,4000.0,
2020,,,,
2021,,,,


In [55]:
# Filling in missing values
# We may want to change all NaN values to 0 (or some other specific number).
# This is especially useful because some algorithms cannot deal with NaN values.

re_indexed.fillna(0) # returns a new DataFrame; the result is not assigned, so re_indexed itself is unchanged

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,0.0
2018,2500.0,3400.0,1500.0,0.0
2019,4000.0,0.0,4000.0,0.0
2020,0.0,0.0,0.0,0.0
2021,0.0,0.0,0.0,0.0
