## Pandas

Deal with missing data / Import & Export / Combine data
<hr>
To preserve data consistency, all values stored in a Series — or in each column of a DataFrame — must share a single data type.

In [22]:
import pandas as pd
import numpy as np

## Pandas series 

All values in a Series share a single data type (dtype).

In [2]:
# Four single-letter product codes, held in a plain Python list
products = list('ABCD')

In [3]:
type(products)

list

In [4]:
products_categories = pd.Series(products)

In [5]:
print(products_categories)

0    A
1    B
2    C
3    D
dtype: object


In [6]:
type(products_categories)

pandas.core.series.Series

In [7]:
import numpy as np


In [8]:
# Integer array holding 10, 20, 30, 40, 50
array_a = np.arange(10, 60, 10)
array_a

array([10, 20, 30, 40, 50])

In [9]:
series_a = pd.Series(array_a)
series_a

0    10
1    20
2    30
3    40
4    50
dtype: int32

## Working with attributes in Python

In [10]:
# Small integer Series (values 1..4) used to inspect Series attributes
series = pd.Series(range(1, 5))

In [12]:
series.dtype

dtype('int64')

In [13]:
series.size

4

In [15]:
products_categories.dtype

dtype('O')

In [16]:
products_categories.name

In [17]:
products_categories.name = "Product categories"

In [18]:
products_categories.name

'Product categories'

## Using an index in Python: index allows fast retrieval

In [19]:
# Mapping of product name -> price
price_per_product = {
    'Product A': 22250,
    'Product B': 16600,
    'Product C': 15600,
}
price_per_product

{'Product A': 22250, 'Product B': 16600, 'Product C': 15600}

In [21]:
# Building a Series from a dict: each key becomes an index label and each value the matching entry
price_per_category = pd.Series(price_per_product)
price_per_category

Product A    22250
Product B    16600
Product C    15600
dtype: int64

In [23]:
# access series index
price_per_category.index

Index(['Product A', 'Product B', 'Product C'], dtype='object')

In [24]:
type(price_per_category.index)

pandas.core.indexes.base.Index

## Label-based and position-based indexing

In [25]:
series_a = pd.Series([10,20,30,40,50])
series_a

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [26]:
# Default index: a RangeIndex of integer labels 0..n-1, which coincide with the positions
series_a.index

RangeIndex(start=0, stop=5, step=1)

In [27]:
type(series_a.index)

pandas.core.indexes.range.RangeIndex

In [28]:
list(series_a.index)

[0, 1, 2, 3, 4]

In [29]:
# Label-based indexing
price_per_category.index

Index(['Product A', 'Product B', 'Product C'], dtype='object')

## Indexing

In [30]:
series_a

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [31]:
# position-based indexing: .iloc selects by integer position.
# (plain series_a[0] is a label lookup that only happens to match here,
# because the default RangeIndex labels equal the positions)
series_a.iloc[0]

10

In [32]:
price_per_category

Product A    22250
Product B    16600
Product C    15600
dtype: int64

In [34]:
# label-based indexing: .loc looks the value up by its index label
price_per_category.loc['Product A']

22250

In [35]:
# position-based: use .iloc — falling back to positions via plain [] on a
# label-indexed Series is deprecated in recent pandas
price_per_category.iloc[0]

22250

In [38]:
# explicit index
series_b = pd.Series([10,20,30,40,50], index=[1, 2, 3, 4, 5])

In [40]:
 series_b[1]

10

In [43]:
series_c = pd.Series([10,20,30,40,50], index=["1", "2", "3", "4", "5"])

In [44]:
# position-based: the index labels are strings, so the integer 1 is not a label;
# .iloc makes the positional lookup explicit (second element)
series_c.iloc[1]

20

In [46]:
series_c["1"]

10

## Using methods in Python

In [10]:
# Deposit amounts labelled by the date string on which each deposit started
start_date_deposit = pd.Series(
    data=[2000, 2000, 2000, 4000],
    index=['7/4/2014', '1/2/2015', '12/8/2012', '3/2/2011'],
)



In [11]:
start_date_deposit 

7/4/2014     2000
1/2/2015     2000
12/8/2012    2000
3/2/2011     4000
dtype: int64

In [12]:
start_date_deposit.sum()

10000

In [13]:
start_date_deposit.min()

2000

In [14]:
start_date_deposit.idxmax()

'3/2/2011'

In [15]:
start_date_deposit.head()

7/4/2014     2000
1/2/2015     2000
12/8/2012    2000
3/2/2011     4000
dtype: int64

In [16]:
start_date_deposit.tail()

7/4/2014     2000
1/2/2015     2000
12/8/2012    2000
3/2/2011     4000
dtype: int64

## Parameters and Arguments


In [17]:
start_date_deposit.head(3)

7/4/2014     2000
1/2/2015     2000
12/8/2012    2000
dtype: int64

In [19]:
start_date_deposit.head(n=2)

7/4/2014    2000
1/2/2015    2000
dtype: int64

##  Pandas documentation

https://pandas.pydata.org/docs/

In [None]:
start_date_deposit.head()

## Create dataframe from scratch

In [3]:
# Build a DataFrame column by column: each dict key is a column name and
# each list holds that column's values
data = {
    'ProductName': ['Product A', 'Product B', 'Product C'],
    'Price': [22250, 16600, 12500],
}
df = pd.DataFrame(data)

In [4]:
df

Unnamed: 0,ProductName,Price
0,Product A,22250
1,Product B,16600
2,Product C,12500


In [5]:
# Create a dataframe from a dictionary of lists + specify the index
# (the index labels replace the default 0..n-1 row labels)
df = pd.DataFrame(data, index=['A', 'B', 'C'])
df


Unnamed: 0,ProductName,Price
A,Product A,22250
B,Product B,16600
C,Product C,12500


In [6]:
# Build a DataFrame row by row: one dict per row, keyed by column name
data = [
    {'ProductName': 'ProductA', 'Price': 22250},
    {'ProductName': 'ProductB', 'Price': 16600},
    {'ProductName': 'ProductC', 'Price': 12500},
]
df = pd.DataFrame(data)
df

Unnamed: 0,ProductName,Price
0,ProductA,22250
1,ProductB,16600
2,ProductC,12500


In [8]:
# Build a DataFrame from a dict of Series: each Series becomes one column
data = {
    'ProductName': pd.Series(['Product A', 'Product B', 'Product C']),
    'Price': pd.Series([22250, 16600, 12500]),
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,ProductName,Price
0,Product A,22250
1,Product B,16600
2,Product C,12500


In [10]:
# Dict of Series that carry an explicit index: the shared labels A, B, C
# become the row labels of the resulting DataFrame
products = pd.Series(['Product A', 'Product B', 'Product C'], index=list('ABC'))
prices = pd.Series([22250, 16600, 12500], index=products.index)
data = {
    'ProductName': products,
    'Price': prices,
}
df = pd.DataFrame(data)
df

Unnamed: 0,ProductName,Price
A,Product A,22250
B,Product B,16600
C,Product C,12500


In [11]:
# From a list of lists: one inner list per row, with column names and
# row labels supplied separately
df = pd.DataFrame(
    data=[
        ['ProductA', 22250],
        ['ProductB', 16600],
        ['ProductC', 12500],
    ],
    index=['A', 'B', 'C'],
    columns=['ProductName', 'Price'],
)

In [12]:
df


Unnamed: 0,ProductName,Price
A,ProductA,22250
B,ProductB,16600
C,ProductC,12500


In [13]:
df.shape

(3, 2)

## Data Cleaning and Preprocessing: pd.Series

#### .unique() & .nunique()

In [14]:
# Read the single-column CSV and collapse the resulting one-column DataFrame
# into a Series. `read_csv(..., squeeze=True)` is deprecated (it emits a
# FutureWarning) and was removed in pandas 2.0; `.squeeze('columns')` is the
# documented replacement.
data = pd.read_csv('Location.csv').squeeze('columns')
# Work on a copy so the freshly loaded `data` stays untouched
location_data = data.copy()
location_data.head()



  data = pd.read_csv('Location.csv', squeeze=True)


0     Location 3
1     Location 6
2     Location 8
3    Location 26
4    Location 34
Name: Location, dtype: object

In [15]:
type(location_data)

pandas.core.series.Series

In [16]:
location_data.describe()

count            1043
unique            296
top       Location 25
freq               31
Name: Location, dtype: object

In [17]:
len(location_data)

1043

In [19]:
location_data.nunique()

296

In [21]:
location_data.unique()

array(['Location 3', 'Location 6', 'Location 8', 'Location 26',
       'Location 34', 'Location 25', 'Location 46', 'Location 156',
       'Location 21', 'Location 13', 'Location 579', 'Location 602',
       'Location 10', 'Location 44', 'Location 30', 'Location 48',
       'Location 196', 'Location 64', 'Location 91', 'Location 62',
       'Location 75', 'Location 42', 'Location 233', 'Location 95',
       'Location 78', 'Location 61', 'Location 87', 'Location 19',
       'Location 115', 'Location 350', 'Location 377', 'Location 17',
       'Location 113', 'Location 81', 'Location 58', 'Location 212',
       'Location 53', 'Location 337', 'Location 41', 'Location 632',
       'Location 73', 'Location 214', 'Location 218', 'Location 38',
       'Location 172', 'Location 197', 'Location 101', 'Location 185',
       'Location 129', 'Location 235', 'Location 142', 'Location 50',
       'Location 76', 'Location 11', 'Location 33', 'Location 22',
       'Location 145', 'Location 203', 'Loca

### Converting Series into Arrays


In [34]:
# Price per product, indexed by product name
s = pd.Series({
    'ProductA': 22250,
    'ProductB': 12345,
    'ProductC': 56789,
})

In [29]:
# Not recommended — prefer `.array` or `.to_numpy()` (shown in the cells below)
s.values

array([22250, 12345, 56789], dtype=int64)

In [30]:
s.array

<PandasArray>
[22250, 12345, 56789]
Length: 3, dtype: int64

In [31]:
# built on top of numpy array
type(s.array)

pandas.core.arrays.numpy_.PandasArray

In [35]:
# convert to numpy
s.to_numpy()

array([22250, 12345, 56789], dtype=int64)

In [36]:
# Select two entries by label and export them as a float NumPy array
test_array = s.loc[['ProductA', 'ProductB']].to_numpy(dtype=float)

In [37]:
test_array

array([22250., 12345.])

In [38]:
type(test_array[0])

numpy.float64

In [40]:
type(s.array[0])

numpy.int64

### .sort_values()

In [41]:
# sort_values() returns a new Series ordered by value (ascending by default);
# each value keeps its original index label
numbers = pd.Series(data=[123, 56, 45, 46, 10])
numbers.sort_values(ascending=True)

4     10
2     45
3     46
1     56
0    123
dtype: int64

In [42]:
numbers.sort_values(ascending=False)

0    123
1     56
3     46
2     45
4     10
dtype: int64

### Attribute and method chaining

In [43]:
location_data.index

RangeIndex(start=0, stop=1043, step=1)

In [44]:
location_data.index.name

In [45]:
location_data.index.name = 'Index'

In [46]:
location_data

Index
0        Location 3
1        Location 6
2        Location 8
3       Location 26
4       Location 34
           ...     
1038    Location 73
1039    Location 82
1040    Location 11
1041    Location 26
1042    Location 94
Name: Location, Length: 1043, dtype: object

In [47]:
location_data.sort_values()

Index
637     Location 1
884     Location 1
465     Location 1
716    Location 10
623    Location 10
          ...     
482    Location 97
128    Location 97
669    Location 97
757    Location 98
372    Location 99
Name: Location, Length: 1043, dtype: object

In [48]:
location_data.sort_values().head()

Index
637     Location 1
884     Location 1
465     Location 1
716    Location 10
623    Location 10
Name: Location, dtype: object

In [49]:
location_data.index.to_numpy()

array([   0,    1,    2, ..., 1040, 1041, 1042], dtype=int64)

### .sort_index()

In [55]:
location_data_sv = location_data.sort_values(ascending=False)

In [56]:
location_data_sv.head()

Index
372    Location 99
757    Location 98
128    Location 97
482    Location 97
271    Location 97
Name: Location, dtype: object

In [61]:
# NOTE(review): identical to the assignment in the earlier cell (In [55]);
# re-running it is harmless, but one of the two cells is redundant
location_data_sv = location_data.sort_values(ascending=False)

In [62]:
location_data_sv.index

Int64Index([ 372,  757,  128,  482,  271,  669,  612,   29,  518,  598,
            ...
             904,  912, 1010,   14,  716,  202,  298,  637,  884,  465],
           dtype='int64', name='Index', length=1043)

In [64]:
location_data_sv.index.array

<PandasArray>
[ 372,  757,  128,  482,  271,  669,  612,   29,  518,  598,
 ...
  904,  912, 1010,   14,  716,  202,  298,  637,  884,  465]
Length: 1043, dtype: int64

In [66]:
# Sorts only the Index object and returns it as a new Index (see output);
# location_data_sv itself is not reordered by this call
location_data_sv.index.sort_values()

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042],
           dtype='int64', name='Index', length=1043)