In [1]:
import pandas as pd
import numpy as np

A **Series** is a one-dimensional labeled array that can hold any data type (integers, strings, floating points, Python objects, etc.). The axis labels are collectively referred to as the index. A Series is somewhat similar to a dictionary in that you can access the elements by their labels.

In [25]:
data1 = pd.Series([1, 3, 5, np.nan, 6, 8])
print("The data1 series is: \n", data)
print()
print("The value of the data1 series is :\n", data1.values)
print()
print("The index of the data1 series is :\n", data1.index)

The data1 series is: 
 1    10
2    11
3    12
9    13
5    14
dtype: int64

The value of the data1 series is :
 [ 1.  3.  5. nan  6.  8.]

The index of the data1 series is :
 RangeIndex(start=0, stop=6, step=1)


In [27]:
# Create a series with a dictionary
data2 = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}
data_series2 = pd.Series(data2)
print(data_series2)

one      1
two      2
three    3
four     4
five     5
dtype: int64


### Explicit indexing / Label-based indexing
Explicit indexing allows you to assign a custom label (which could be a number, a string, or even a datetime) to each data element.

In [28]:
# Create a pandas Series from a dictionary, and specify an index for the series.
data3 = {4: "four", 5: "five", 1: "one", 3: "three", 2: "two"}
data_series3 = pd.Series(data3, index=[2, 1, 3])
print(data_series3)

2      two
1      one
3    three
dtype: object


In [3]:
data4 = pd.Series([10, 11, 12, 13, 14], index=[1,2,3,9,5])

print("data4.values = ", data4.values)
print(type(data4.values))
print()
print("data4.indexs = ", data4.index)
print(type(data4.index))

data4.values =  [10 11 12 13 14]
<class 'numpy.ndarray'>

data4.indexs =  Index([1, 2, 3, 9, 5], dtype='int64')
<class 'pandas.core.indexes.base.Index'>


In [30]:
print(data4[1])

10


In [31]:
print(data4[:9])

1    10
2    11
3    12
9    13
5    14
dtype: int64


In [4]:
print(data4[:4])

1    10
2    11
3    12
9    13
dtype: int64


In [6]:
data5 = pd.Series([10, 11, 12, 13, 14], index=[54, 32, 2, 1, 9])
print("data5[32] = ", data5[32])
print("data5[:1] = ", data5[:1])

data5[32] =  11
data5[:1] =  54    10
dtype: int64


In [2]:
# String as explicit index
data6 = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}
data_series6 = pd.Series(data6)
print(data_series6)

one      1
two      2
three    3
four     4
five     5
dtype: int64


In [3]:
print("data_series6['four'] = ", data_series6['four'])

data_series6['four'] =  4


In [6]:
print("data_series6[:'four'] = \n", data_series6[:'four'])

data_series6[:'four'] = 
 one      1
two      2
three    3
four     4
dtype: int64


### Locator
`loc` is a label-based data selection method which means that we have to pass the name of the row or column which we want to access. This method includes the last element of the range passed in it, unlike python and iloc function. It can accept the boolean data, it means we can pass the condition based series of boolean to get the filtered data.

In [10]:
data7 = pd.Series([10, 11, 12, 13, 14], index=[3, 5, 7, 9, 1])
print("data7 = \n", data7)
print()
print("data7.loc[3] = ", data7.loc[3])
print()
print("data7.loc[3:9] = \n", data7.loc[3:9])

data7 = 
 3    10
5    11
7    12
9    13
1    14
dtype: int64

data7.loc[3] =  10

data7.loc[3:9] = 
 3    10
5    11
7    12
9    13
dtype: int64


`iloc` is an integer index-based method which means that we have to pass integer index in the method to access a specific row/column. This method does not include the last element of the range.

In [11]:
data8 = pd.Series([10, 11, 12, 13, 14], index=[3, 5, 7, 9, 1])
print("data8 = \n", data8)
print()
print("data8.iloc[3] = ", data8.iloc[3])
print()
print("data8.iloc[3:9] = \n", data8.iloc[3:9])

data8 = 
 3    10
5    11
7    12
9    13
1    14
dtype: int64

data8.iloc[3] =  13

data8.iloc[3:9] = 
 9    13
1    14
dtype: int64


### DataFrame

A DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). It's similar to a spreadsheet, a SQL table, or the data.frame in R.

Here are some key characteristics of a DataFrame:

**Two-dimensional**: DataFrames are two-dimensional in the sense that data is aligned in a tabular fashion in rows and columns.

**Size Mutable**: The size of DataFrame is mutable, or changeable. You can add more rows or columns to a DataFrame, or you can delete existing rows or columns.

**Potentially Heterogeneous**: A DataFrame can contain data of different types. For example, one column can be of integer type, another column can be of boolean type, and so on.

**Labeled Axes**: The rows and columns of a DataFrame are labeled, which means that they can be referred to by their labels instead of their position indices.

In [14]:
data9 = {
    'Name': ['John', 'Anna', 'Peter'],
    'Age': [28, 24, 33],
    'Country': ['USA', 'Canada', 'Germany']
}

df9 = pd.DataFrame(data9)

print(df9)

    Name  Age  Country
0   John   28      USA
1   Anna   24   Canada
2  Peter   33  Germany


In [16]:
# Create a dataframe with a series
data_series7 = pd.Series([1, 2, 3, 4, 5])
print("data_series7 = \n", data_series7)
print()
df10 = pd.DataFrame(data_series7, columns=['digits'])
print("df10 = \n", df10)

data_series7 = 
 0    1
1    2
2    3
3    4
4    5
dtype: int64

df10 = 
    digits
0       1
1       2
2       3
3       4
4       5


In [22]:
# Create a dataframe with a dictionary
dict11 = [{"Ones": i, "Tens": i*10, "Hundreds": i*100} for i in range(1, 5)]
print("dict = ", dict11)
print()
df11 = pd.DataFrame(dict11)
print("df11 = \n", df11)

dict =  [{'Ones': 1, 'Tens': 10, 'Hundreds': 100}, {'Ones': 2, 'Tens': 20, 'Hundreds': 200}, {'Ones': 3, 'Tens': 30, 'Hundreds': 300}, {'Ones': 4, 'Tens': 40, 'Hundreds': 400}]

df11 = 
    Ones  Tens  Hundreds
0     1    10       100
1     2    20       200
2     3    30       300
3     4    40       400


In [24]:
# Creating a DataFrame from a list of dictionaries, with each dictionary corresponding to a row. 
# In cases where a key (column name) is absent in a dictionary, pandas fills in with NaN for that cell.
dict12 = [{"a": 1, "b": 1}, {"b": 2, "c": 2}, {"c": 3, "d": 3}]
df12 = pd.DataFrame(dict12)
print("df12 = \n", df12)

df12 = 
      a    b    c    d
0  1.0  1.0  NaN  NaN
1  NaN  2.0  2.0  NaN
2  NaN  NaN  3.0  3.0


In [25]:
# Create a dataframe with a series
data_series8 = pd.Series([i for i in range(1, 6)], index=[1, 2, 3, 4, 5])
data_series9 = pd.Series([i*10 for i in range(1, 6)], index=[3, 4, 5, 6, 7])
df13 = pd.DataFrame({"Ones": data_series8, "Tens": data_series9})
print("df13 = \n", df13)

df13 = 
    Ones  Tens
1   1.0   NaN
2   2.0   NaN
3   3.0  10.0
4   4.0  20.0
5   5.0  30.0
6   NaN  40.0
7   NaN  50.0


In [26]:
# Create a dataframe with a two dimensional numpy array
df14 = pd.DataFrame(np.zeros([5, 3]),
                   columns = ["A", "B", "C"],
                   index = ["one", "two", "three", "four", "five"])
print("df14 = \n", df14)

df14 = 
          A    B    C
one    0.0  0.0  0.0
two    0.0  0.0  0.0
three  0.0  0.0  0.0
four   0.0  0.0  0.0
five   0.0  0.0  0.0


### Index

In [27]:
# Index is immutable array
idx = pd.Index([2, 4, 6, 8, 10])
print("idx = ", idx)
print("idx[1:4] = ", idx[1:4])
print("idx.size = ", idx.size)
print("idx.shape = ", idx.shape)
print("idx.ndim = ", idx.ndim)
print("idx.dtype = ", idx.dtype)

idx =  Index([2, 4, 6, 8, 10], dtype='int64')
idx[1:4] =  Index([4, 6, 8], dtype='int64')
idx.size =  5
idx.shape =  (5,)
idx.ndim =  1
idx.dtype =  int64


In [30]:
idx_1 = pd.Index([1, 3, 5, 6, 7,])
idx_2 = pd.Index([2, 4, 6, 7, 8, 9])

print("idx_1 & idx_2 = ", pd.Index(set(idx_1) & set(idx_2)))
print("idx_1 | idx_2 = ", pd.Index(set(idx_1) | set(idx_2)))
print("idx_1 ^ idx_2 = ", pd.Index(set(idx_1) ^ set(idx_2)))

idx_1 & idx_2 =  Index([6, 7], dtype='int64')
idx_1 | idx_2 =  Index([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')
idx_1 ^ idx_2 =  Index([1, 2, 3, 4, 5, 8, 9], dtype='int64')
