### pandas series
- Series are built on top of NumPy arrays, with extra features that make them easier to work with.
- series also contain an index and an optional name
- two or more series grouped together form a data frame


In [29]:
import pandas as pd
import numpy as np

In [26]:
# Sample sales figures used to build the first series.
sales = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]
# A Series is a one-dimensional array of values paired with an index
# (one label per value) and an optional name.
sales_series = pd.Series(sales, name='Sales')
# Series data must be one-dimensional: passing a two-dimensional 3x2 array
# raises a ValueError. The error is caught and printed here so that the
# demonstration does not abort the rest of the cell.
try:
    pd.Series(np.arange(6).reshape(3, 2), name='Sales')
except ValueError as err:
    print('ValueError:', err)

1

#### Series properties

In [40]:
# Tour of the most commonly used Series attributes and summary methods.
# NOTE: in a notebook cell only the value of the last expression is displayed;
# run a line on its own to inspect any of the earlier results.
# values attribute returns the underlying data of the series
# values is an ndarray so you can use numpy functions on it
sales_series.values
# index attribute returns the index (the labels) of the series
sales_series.index
# name attribute returns the name of the series
sales_series.name
# dtype attribute returns the data type of the values in the series
sales_series.dtype
# shape attribute returns the shape of the series as a tuple
sales_series.shape
# size attribute returns the number of elements in the series
sales_series.size
# ndim attribute returns the number of dimensions of the series
sales_series.ndim
# mean method returns the mean of the series
sales_series.mean()
# median method returns the median of the series
sales_series.median()
# std method returns the standard deviation of the series
# (pandas computes the sample standard deviation, ddof=1, by default)
sales_series.std()
# the takeaway is that you can use numpy functions on a pandas series

15.138251770487457

### pandas data types
- Mostly expand on their base Python and NumPy equivalents
- bool
- int64
- float64
- boolean: Nullable boolean
- Int64: Nullable int
- Float64: Nullable float
- category: Maps categorical data to a numeric array for efficiency
- datetime64
- timedelta: duration between two dates or times
- period: a span of time

In [168]:
# Values of a series can be read or assigned through its index using [].
# NOTE(review): assumes sales_series still carries the default integer index
# from the earlier cell, so that 0 is an existing label - confirm.
sales_series[0] = 100
# you can also use a custom label: assigning to 'a' stores the value under that label
sales_series['a'] = 200
# read the value back through the same label
sales_series['a']

# With custom labels a series can be used much like a dictionary:
# every label maps to one value.
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
sales = list(range(0, 50, 5))
sales_series = pd.Series(data=sales, index=labels, name='Sales')
sales_series
# Slicing by label: both end labels are included in the result.
sales_series['a':'d']
# Slicing by position: start at position 1, run to the end and take
# every second element.
sales_series[1::2]

# iloc selects by integer position.
#   on a DataFrame: df.iloc[row_position, column_position]
#   as a slice:     df.iloc[start_position:stop_position]
# Positions 0 up to, but not including, 4 (i.e. the first four rows).
sales_series.iloc[:4]

# loc selects by label.
#   on a DataFrame: df.loc[row_label, column_label]
#   as a slice:     df.loc[start_label:stop_label]
# Rows labelled 'a' through 'd', both inclusive.
sales_series.loc['a':'d']
# reset_index(drop=True) returns a copy whose labels are replaced by the
# default integer index; the original series is left untouched.
sales_series.reset_index(drop=True)

# Index labels do not have to be unique.
labels = ['a', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'a']
data = list(range(0, 50, 5))
data_series = pd.Series(data=data, index=labels, name='Sales')
# Selecting a duplicated label returns every matching value as a series.
data_series.loc['a']
# Calling reset_index() without drop=True would turn the series into a
# DataFrame (the old labels become a column), so it is not used here.
# With drop=True the result stays a series and only the labels are reset.
data_series.reset_index(drop=True)
# First element by position.
data_series.iloc[:1]

# Filtering and sorting a labelled series.
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
data = list(range(0, 50, 5))
data_series = pd.Series(data=data, index=labels, name='Sales')
# Boolean indexing: keep only the values greater than 5.
data_series[data_series > 5]
# Conditions can be combined into a mask and passed to loc;
# here: strictly greater than 5 and strictly less than 30.
mask = (data_series > 5) & data_series.lt(30)
data_series.loc[mask]

# where() also filters with a mask, but it keeps the original length:
# entries that fail the condition become NaN instead of being dropped.
data_series.where(mask)

# Sort by value, largest first; this returns a new series.
newseries = data_series.sort_values(ascending=False)
newseries
# Sort by label in reverse order, modifying data_series itself.
data_series.sort_index(ascending=False, inplace=True)
data_series

# Arithmetic on a series is applied element by element.
data = list(range(0, 50, 5))
data_series = pd.Series(data, name='Sales')
# Add 5 to every value (operator form of data_series.add(5)).
add_series = data_series + 5
add_series
# Subtract 5 from every value (operator form of data_series.sub(5)).
sub_series = data_series - 5
sub_series

# Adding two series: values are matched up by index label.
data = list(range(0, 55, 5))
data_series1 = pd.Series(data, name='Sales1')
# data_series has 10 values and data_series1 has 11, so one label has no
# partner in data_series. fill_value=0 makes the missing side count as 0
# instead of producing NaN for that label.
add_series = data_series1.add(data_series, fill_value=0)
add_series

# The same pattern works for multiplication, division, subtraction and
# many other operations.

# String operations are reached through the .str accessor and are applied
# to every element of the series.
data = [
    'apple',
    'banana',
    'cherry',
    'dates',
    'eggplant',
]
data_series = pd.Series(data=data, name='Fruits')
# Number of characters in each string.
data_series.str.len()
# Upper-case copy of every string.
upper_series = data_series.str.upper()
upper_series
# Lower-case copy of every string.
lower_series = data_series.str.lower()
lower_series

# Split every string on the letter 'a'; each element becomes a list of
# pieces. (Despite its name, df holds a series, not a DataFrame.)
df = data_series.str.split(pat='a')
df

# Aggregating a numeric series: each call reduces it to a single number.
data = list(range(0, 50, 5))
data_series = pd.Series(data=data, name='Sales')
# Total of all values.
s = data_series.sum()
s
# Arithmetic mean of all values.
m = data_series.mean()
m
# Aggregating a categorical (text) series.
data = [
    'apple',
    'banana',
    'cherry',
    'dates',
    'eggplant',
]
data_series = pd.Series(data=data, name='Fruits')
# The distinct values that occur in the series.
vals = data_series.unique()
vals
# How many distinct values there are.
n = data_series.nunique()
n
# How many times each distinct value occurs.
f = data_series.value_counts()
f

# missing values
# NaN (np.nan) is used to represent missing values
data = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]
# NaN is a float, so an integer series cannot hold it. Build the series as
# float64 up front instead of relying on pandas to upcast the whole series
# during the assignment below (that implicit upcast is deprecated in recent
# pandas versions).
data_series = pd.Series(data, name='Sales', dtype='float64')
# add a missing value
data_series[0] = np.nan
data_series
# identify the missing values: True where a value is missing
data_series.isnull()
# fill the missing values with 0 (returns a new series)
data_series.fillna(0)
# numpy's NaN is itself the missing-value marker, so filling with it
# leaves the missing entries missing
data_series.fillna(np.nan)
# drop the missing values (returns a new series without them)
all_values_series = data_series.dropna()
all_values_series

# isna() is an alias of isnull() and identifies the missing values too
data_series.isna()

# Running your own function over every element of a series.
def _double(x):
    """Return twice the given value."""
    return x * 2


def _sqrt_above_ten(x):
    """Return the square root of values above 10; leave other values as is."""
    if x > 10:
        return np.sqrt(x)
    return x


data = list(range(0, 50, 5))
data_series = pd.Series(data=data, name='Sales')
# apply() calls the function once per element and collects the results.
mul2_series = data_series.apply(_double)
mul2_series
# The function may call numpy routines as well.
sqrt_series = data_series.apply(_sqrt_above_ten)
sqrt_series

# map() can be used in the same way to apply a custom function.
m = data_series.map(_double)
m

data = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]
data_series = pd.Series(data, name='Sales')
# Keep the even numbers; where() replaces every value that fails the
# condition with NaN. The result is assigned back instead of using
# inplace=True: an int64 series cannot hold NaN, so the in-place form has to
# upcast the series behind the scenes, which recent pandas versions deprecate.
data_series = data_series.where(data_series % 2 == 0)
# True marks the entries that were blanked out, i.e. the odd numbers.
data_series.isna()

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7     True
8    False
9     True
Name: Sales, dtype: bool