In [3]:
import pandas as pd
import numpy as np

Data structures are used for storing, organizing, and managing collections of data. They allow operations to be performed on groups of data.
Python has built-in support for data structures such as List, Dictionary, Tuple and Set, known as built-in data structures. There are also user-defined data structures such as Stack, Queue, Tree, Linked List, etc. Source for the picture below: https://www.edureka.co/blog/data-structures-in-python/

![Data_Structure.PNG](attachment:Data_Structure.PNG)

## Lists
Lists are mutable. Internally they are stored as dynamic arrays.

Operations available on a list - append, extend, insert, remove, pop, clear, index, sort, copy, reverse, count

Defined with square brackets []

In [59]:
# Clinical feature names used as the running example for the list operations below.
lst = [
    'Creatinine_Phosphokinase',
    'Anaemia',
    'Diabetes',
    'Ejection_fraction',
    'High_Blood_Pressure',
    'Platelets',
    'Serum_creatinine',
    'Serum_sodium',
]
lst

['Creatinine_Phosphokinase',
 'Anaemia',
 'Diabetes',
 'Ejection_fraction',
 'High_Blood_Pressure',
 'Platelets',
 'Serum_creatinine',
 'Serum_sodium']

In [3]:
# Slicing - get a sub-list by index range (start is inclusive, stop is exclusive).
# Time complexity is O(k), where k is the number of elements in the slice, because
# the selected elements are copied into a new list.
# The examples below copy 3, 3 and 4 elements respectively.
print(lst[2:5])
print(lst[:3])
print(lst[4:])

['Diabetes', 'Ejection_fraction', 'High_Blood_Pressure']
['Creatinine_Phosphokinase', 'Anaemia', 'Diabetes']
['High_Blood_Pressure', 'Platelets', 'Serum_creatinine', 'Serum_sodium']


In [4]:
# Appending an item to the end of the list
# Time complexity O(1) (amortized)
lst.append('Smoking')
# Equivalent to append: assign to the empty slice at the end of the list
lst[len(lst):] = ['Age']

In [5]:
# Display the list: 'Smoking' and 'Age' are now the last two items
lst

['Creatinine_Phosphokinase',
 'Anaemia',
 'Diabetes',
 'Ejection_fraction',
 'High_Blood_Pressure',
 'Platelets',
 'Serum_creatinine',
 'Serum_sodium',
 'Smoking',
 'Age']

In [6]:
# Index method - returns the index of the first occurrence of the item
# Time complexity 
#    Best case - O(1) (the item is the first element)
#    Worst case - O(n) (the item is the last element)

print(lst.index('Age'))

# Count method - returns the number of times an item appears in the list
# Time complexity - O(n)

print(lst.count('Age'))

# Pop method - removes an item from the list and returns it.
# An index can be passed: lst.pop(4) removes and returns the item at index 4
# Time complexity 
#    Best case - O(1) for pop() without an index. 
#    It removes the last element, so no other element has to move.
#    Worst case - O(n) - pop() given an index near the start of the list. 
#    The indexed item is removed and every following item is shifted one position left.

print(lst.pop()) 
print(lst)

9
1
Age
['Creatinine_Phosphokinase', 'Anaemia', 'Diabetes', 'Ejection_fraction', 'High_Blood_Pressure', 'Platelets', 'Serum_creatinine', 'Serum_sodium', 'Smoking']


In [7]:
# Remove - deletes the first occurrence of the item passed as argument
# Time complexity O(n) - search for the item, then shift the remaining items one position left.

lst.remove('Smoking')

In [8]:
# Reverse the items of the list in place
# Time complexity O(n) - it performs about n/2 swaps, and constant factors are dropped
# in Big O notation, so it is O(n).
lst.reverse()
lst
# Another way to reverse is lst[::-1]. The step of -1 walks the list backwards one element at a time,
# so it returns all elements in reverse order (as a new list; lst itself is not modified).
# Try lst[::-2] and see what you get: every second element, starting from the last one.

['Serum_sodium',
 'Serum_creatinine',
 'Platelets',
 'High_Blood_Pressure',
 'Ejection_fraction',
 'Diabetes',
 'Anaemia',
 'Creatinine_Phosphokinase']

In [9]:
# Sort the elements of the list in place (ascending order)
# Time complexity O(n log n). See sorting algorithms for the details.
lst.sort()
lst

['Anaemia',
 'Creatinine_Phosphokinase',
 'Diabetes',
 'Ejection_fraction',
 'High_Blood_Pressure',
 'Platelets',
 'Serum_creatinine',
 'Serum_sodium']

In [11]:
# Insert an item into the list before the given index. Both positive and negative indexes
# are accepted; negative indexes count from the end of the list.
# Time complexity
#    Best case O(1) if the element is inserted at the end
#    Worst case O(n) if the element is inserted at the beginning, as the rest of the elements need to move.
lst.insert(2, 'Smoking')
print(lst)
# insert(-1, ...) places the item before the last element, not after it
lst.insert(-1, 'Age')
print(lst)

['Anaemia', 'Creatinine_Phosphokinase', 'Smoking', 'Diabetes', 'Ejection_fraction', 'High_Blood_Pressure', 'Platelets', 'Serum_creatinine', 'Serum_sodium']
['Anaemia', 'Creatinine_Phosphokinase', 'Smoking', 'Diabetes', 'Ejection_fraction', 'High_Blood_Pressure', 'Platelets', 'Serum_creatinine', 'Age', 'Serum_sodium']


In [12]:
# Extend the list with the items of any other iterable object.
# Time complexity O(k) - k is the number of items in the iterable being appended
# (k = 2 in the example below).

lst1 = ['Time', 'Gender']
lst.extend(lst1)
print(lst)

['Anaemia', 'Creatinine_Phosphokinase', 'Smoking', 'Diabetes', 'Ejection_fraction', 'High_Blood_Pressure', 'Platelets', 'Serum_creatinine', 'Age', 'Serum_sodium', 'Time', 'Gender']


## Tuple
Tuples are immutable: once created they cannot be modified or updated. Tuples usually contain a heterogeneous sequence of elements that are accessed via unpacking or indexing.
Because tuples are immutable, they do not support operations such as insert, update, remove and sort. Accessing an element by index is O(1), but `count` and `index` still iterate through the tuple and are O(n), and slicing is O(k) for a slice of k elements.

In [6]:
# A tuple holding only strings
impacts = (
    'Anaemia',
    'Creatinine_Phosphokinase',
    'Smoking',
    'Diabetes',
    'Ejection_fraction',
    'High_Blood_Pressure',
    'Platelets',
    'Serum_creatinine',
    'Serum_sodium',
)
print(impacts)

# A heterogeneous tuple: an int, several strings and a nested list
impacts = (
    23,
    'Anaemia',
    'Creatinine_Phosphokinase',
    'Smoking',
    'Diabetes',
    'Ejection_fraction',
    'High_Blood_Pressure',
    'Platelets',
    ['Serum_creatinine', 'Serum_sodium'],
)
print(impacts)

('Anaemia', 'Creatinine_Phosphokinase', 'Smoking', 'Diabetes', 'Ejection_fraction', 'High_Blood_Pressure', 'Platelets', 'Serum_creatinine', 'Serum_sodium')
(23, 'Anaemia', 'Creatinine_Phosphokinase', 'Smoking', 'Diabetes', 'Ejection_fraction', 'High_Blood_Pressure', 'Platelets', ['Serum_creatinine', 'Serum_sodium'])


In [4]:
# Count - returns the number of occurrences of the item in the tuple.
impacts.count("Smoking")

1

In [9]:
# Slicing - returns a new tuple with the elements from index 4 to the end
impacts[4:]

('Diabetes',
 'Ejection_fraction',
 'High_Blood_Pressure',
 'Platelets',
 ['Serum_creatinine', 'Serum_sodium'])

In [13]:
# Unpacking - assign each element of the sliced tuple to its own variable.
# The number of names on the left must match the number of elements (5 here).
a,b,c,d,e=impacts[4:]
print(e)

['Serum_creatinine', 'Serum_sodium']


## Dictionary
Dictionaries store data as key-value pairs. Dictionaries are indexed by keys, so a key can be any immutable (hashable) object such as a number, string or tuple. A tuple that contains, directly or indirectly, a mutable item (for example a list) cannot be used as a key. Operations allowed on a dictionary include getting, setting and deleting items, and iterating through the dictionary.

In [26]:
# Initializing a dictionary
# 1. Empty dictionary literal
impact_dict = {}
# 2. dict() built from a list of (key, value) tuples; this replaces the empty dict above
impact_dict = dict([('Serum_Cretinine',1.7),('Age',45)])
# 3. Dictionary literal.
# NOTE(review): this name is spelled `imapct_dict`, so it creates a second, separate
# variable instead of re-assigning `impact_dict`; its first key is also spelled
# differently ('Serum_Creatinine' vs 'Serum_Cretinine') - confirm whether this is intended.
imapct_dict = {'Serum_Creatinine':1.7,'Age':45}

In [28]:
# Get the value for a key; get() returns None instead of raising KeyError when the key is missing
# Time complexity O(1) on average
# NOTE(review): this reads `imapct_dict` (sic), not `impact_dict` - confirm which one is meant.
imapct_dict.get('Age')

45

In [30]:
# Add an item to the dict by assigning to a key that does not exist yet
# Time complexity is O(1) on average
impact_dict['High_Blood_Pressure'] = 140
impact_dict

{'Serum_Cretinine': 1.7, 'Age': 45, 'High_Blood_Pressure': 140}

In [32]:
# Set or update an existing item: assigning to an existing key overwrites its value
# Time complexity O(1) on average
impact_dict['Age'] = 50
impact_dict

{'Serum_Cretinine': 1.7, 'Age': 50, 'High_Blood_Pressure': 140}

In [56]:
%%time
# Iterating through a dictionary with .items(), which yields (key, value) pairs
# Time complexity O(n)
for key, value in impact_dict.items():
    print(f'key {key} and value {value}')

key Serum_Cretinine and value 1.7
key Age and value 50
key High_Blood_Pressure and value 140
CPU times: user 1.06 ms, sys: 0 ns, total: 1.06 ms
Wall time: 2.46 ms


In [57]:
%%time
# Iterating over the keys with .keys() and looking up each value with get()
# Time complexity O(n)
for key in impact_dict.keys():
    value = impact_dict.get(key)
    print(f'Key is {key} and value is {value}')

Key is Serum_Cretinine and value is 1.7
Key is Age and value is 50
Key is High_Blood_Pressure and value is 140
CPU times: user 917 µs, sys: 0 ns, total: 917 µs
Wall time: 1.6 ms


In [58]:
# dir() - lists the names of the attributes and methods available on an object.

dir(impact_dict)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

## Sets
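A set is an unordered collection with no duplicate elements. Sets support mathematical operations such as union, intersection, difference, and symmetric difference. A set is created using braces `{...}` or `set()`. Note that empty braces `{}` create an empty dictionary, so use `set()` to create an empty set.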

In [8]:
# Initialisation of a set with a brace literal
impacts_set = {
    'Diabetes',
    'Ejection_fraction',
    'High_Blood_Pressure',
    'Platelets',
    'Diabetes',
}

# or with set() from a list

impacts_set = set([
    'Diabetes',
    'Ejection_fraction',
    'High_Blood_Pressure',
    'Platelets',
    'Diabetes',
])
impacts_set

# The duplicated 'Diabetes' entry is stored only once

{'Diabetes', 'Ejection_fraction', 'High_Blood_Pressure', 'Platelets'}

In [9]:
# Second set used for the set operations; it shares 'Ejection_fraction' with impacts_set
impacts_set1 = set(['Serum_creatinine', 'Serum_sodium', 'Ejection_fraction'])

# Union - get all unique items from both sets (returns a new set).
# Time complexity - O(len(set1) + len(set2))
impacts_set.union(impacts_set1)

{'Diabetes',
 'Ejection_fraction',
 'High_Blood_Pressure',
 'Platelets',
 'Serum_creatinine',
 'Serum_sodium'}

In [10]:
# Intersection - get the items common to both sets (returns a new set)
# Time complexity - O(min(len(set1), len(set2))) on average
impacts_set.intersection(impacts_set1)

{'Ejection_fraction'}

In [12]:
# difference - items of impacts_set that are not present in impacts_set1
print(impacts_set.difference(impacts_set1))

# difference - items of impacts_set1 that are not present in impacts_set
print(impacts_set1.difference(impacts_set))

{'Platelets', 'Diabetes', 'High_Blood_Pressure'}
{'Serum_sodium', 'Serum_creatinine'}


In [15]:
# symmetric_difference - items present in exactly one of the two sets
# (the two one-sided differences combined).

impacts_set.symmetric_difference(impacts_set1)

{'Diabetes',
 'High_Blood_Pressure',
 'Platelets',
 'Serum_creatinine',
 'Serum_sodium'}

## Numpy
NumPy is a general-purpose array-processing package. It provides a high-performance multidimensional array object and tools for working with these arrays.
NumPy can perform mathematical operations at the row and column level.

In [80]:
# Creation of a two-dimensional array (2 rows x 5 columns) from a nested list
d2arr = np.array([
    [23, 45, 67, 89, 243],
    [45, 67, 32, 34, 123],
])
print("Two dimensional array")
print(d2arr)

# Three-dimensional array: 2 blocks, each holding 2 rows of 4 values
print("Three dimensional array")
d3arr = np.array([
    [[33, 44, 55, 66], [44, 33, 11, 10]],
    [[11, 22, 33, 66], [77, 88, 99, 100]],
])
d3arr

Two dimensional array
[[ 23  45  67  89 243]
 [ 45  67  32  34 123]]
Three dimensional array


array([[[ 33,  44,  55,  66],
        [ 44,  33,  11,  10]],

       [[ 11,  22,  33,  66],
        [ 77,  88,  99, 100]]])

In [104]:
# shape - the number of elements along each dimension of the array
print(d2arr.shape)
print(d3arr.shape)

(2, 5)
(2, 2, 4)


In [86]:
# Get the sum of all elements
print(d2arr.sum())
print(d3arr.sum())
# Get the sum along axis 1: for the 2-D array this is one total per row;
# for the 3-D array axis 1 is collapsed, leaving a (2, 4) result.
print(d2arr.sum(axis=1))
print(d3arr.sum(axis=1))

768
792
[467 301]
[[ 77  77  66  76]
 [ 88 110 132 166]]


In [100]:
# Arithmetic with a scalar is applied to every element of the array
print(d2arr + 5)
print(d2arr - 5)

[[ 28  50  72  94 248]
 [ 50  72  37  39 128]]
[[ 18  40  62  84 238]
 [ 40  62  27  29 118]]


In [96]:
# Index of the minimum element in each row (axis=1)
print(d2arr.argmin(axis=1))

# Index of the maximum element in each column (axis=0)
print(d2arr.argmax(axis=0))

[0 2]
[1 1 0 0 0]


In [105]:
# Matrix multiplication - the @ operator denotes matrix multiplication.
# The plain Python list on the right-hand side is treated as a 1-D vector.
print(d2arr @ [2,2,2,2,2])

# For the 3-D array the product is taken over the last axis, giving a (2, 2) result.
print(d3arr @ [2,2,2,2])

[934 602]
[[396 196]
 [264 728]]


In [99]:
# A plain nested Python list does not support element-wise arithmetic like a NumPy array:
# uncommenting the print below raises a TypeError, because list + int is not defined.
arr = [[2,3,4],[4,5,6]]
#print(arr+5)

## Pandas

Pandas is an open-source library that provides easy-to-use data structures and data analysis tools. In this section of the notebook we will cover the basic functionality of pandas. Pandas is built mostly on NumPy.

Pandas deals with three types of datastructures:

    1. Series - 1D labeled homogeneous data, size immutable.
    
    2. DataFrame - General 2D labeled, size-mutable tabular structure with potentially heterogeneously typed columns. It is indexed both row-wise and column-wise. The default index is np.arange(n).
    
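    3. Panel - General 3D labeled array, size mutable. Note: Panel was deprecated in pandas 0.20 and removed in pandas 0.25; in current versions use a DataFrame with a MultiIndex instead.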

In [8]:
# Creating Series
# A scalar is repeated for every label of the given index
s = pd.Series(5, index=[0, 1, 2, 3, 4])
# Values paired with explicit string labels
s1 = pd.Series(data=[2, 3, 4, 5, 6], index=list('abcde'))
# From a plain list; pandas assigns the default integer index
arr = [1, 2, 3, 4, 5]
s2 = pd.Series(data=arr)

In [14]:
# Accessing values
# Full slice: every element of the Series
print(s1[:])
print("-----------------------")
# Label-based access: the value stored under label 'd'
print(s1.loc['d'])
print("-----------------------")
# Position-based slice: from position 3 up to (not including) the last element
s1.iloc[3:-1]

a    2
b    3
c    4
d    5
e    6
dtype: int64
-----------------------
5
-----------------------


d    5
dtype: int64

In [39]:
# Creation of DataFrames
# A DataFrame can be created from lists, dicts, Series, NumPy ndarrays or another DataFrame.
# Creating from a flat list: each item becomes one row of a single column labelled 0.
lst = [3, 2, 4, 5, 762, 6]
df_lst = pd.DataFrame(data=lst)
df_lst

Unnamed: 0,0
0,3
1,2
2,4
3,5
4,762
5,6


In [47]:
# Creation using dict
# A list of dicts builds the frame row by row: each dict is one row and its keys
# become the column labels.
# NOTE: the variable is deliberately not named `dict`. That would shadow the built-in
# dict() for the rest of the kernel session and break every later call to it
# (for example dict([...]) in the Dictionary section of this notebook).

impact_records = [{0:'Diabetes',1:'High_Blood_Pressure',2:'Platelets',3:'Serum_creatinine',4:'Serum_sodium'}]
# Alternatives to try:
# impact_records = [{0:'Diabetes',1:'High_Blood_Pressure',2:'Platelets',3:'Serum_creatinine',4:'Serum_sodium'},
#                   {0:'D',1:'P',2:'Q',3:'R',4:'G'}]
# impact_records = {'Impacts':['Diabetes','High_Blood_Pressure','Platelets','Serum_creatinine','Serum_sodium'],
#                   'Age':[23,34,45,56,78]}
# impact_records = [{'a':'Diabetes','b':'High_Blood_Pressure','c':'Platelets','d':'Serum_creatinine','e':'Serum_sodium'},
#                   {0:'Diabetes',1:'High_Blood_Pressure',2:'Platelets',3:'Serum_creatinine',4:'Serum_sodium'}]

# Optionally pass index=[...] with one label per row, e.g. index=['a'] here
# (or index=['a', 'b'] for the two-row alternatives above).
df_dict = pd.DataFrame(impact_records)

print(df_dict)

# We can define the columns of the data frame as well
df_dict = pd.DataFrame(impact_records, columns=[0,1,2,3,4,5])
print('---------')
# A column that is not present in the keys of the dictionary is added and filled with NaN.
print(df_dict)

          0                    1          2                 3             4
0  Diabetes  High_Blood_Pressure  Platelets  Serum_creatinine  Serum_sodium
---------
          0                    1          2                 3             4  \
0  Diabetes  High_Blood_Pressure  Platelets  Serum_creatinine  Serum_sodium   

    5  
0 NaN  


In [59]:
# Creating using a NumPy ndarray
# The nested list is converted to a 2-D ndarray first, so this cell really builds
# the DataFrame from an ndarray; each row of the array becomes one row of the frame.
narr = np.array([['Diabetes', 'D'],['High_Blood_Pressure', 'HBP'],['Plateletes', "P"], ['Serum_Creatinine', 'SC']])

# Without columns/index pandas labels both with 0..n-1:
#df_ndarr = pd.DataFrame(narr)
df_ndarr = pd.DataFrame(narr, columns=['Impacts', 'Abbrevation'], index=[10,20,30,40])
df_ndarr

Unnamed: 0,Impacts,Abbrevation
10,Diabetes,D
20,High_Blood_Pressure,HBP
30,Plateletes,P
40,Serum_Creatinine,SC


In [72]:
# Adding new columns
# From a plain list: the values are assigned to the rows by position
df_ndarr['Max_Limit'] = [180, 110, 85000.0, 9.4]
# From a Series: the values are aligned with the frame on the index labels
df_ndarr['Min_Limit'] = pd.Series(data=[50, 80, 20000, 2.0], index=[10, 20, 30, 40])
df_ndarr

Unnamed: 0,Impacts,Abbrevation,Max_Limit,Min_Limit
10,Diabetes,D,180.0,50.0
20,High_Blood_Pressure,HBP,110.0,80.0
30,Plateletes,P,85000.0,20000.0
40,Serum_Creatinine,SC,9.4,2.0


In [73]:
# Removing columns
# The del statement drops the column in place
del df_ndarr['Max_Limit']
print("After deleting column")
print(df_ndarr)
# pop() also drops the column in place and returns it as a Series (discarded here)
df_ndarr.pop('Min_Limit')
print("After pop column")
print(df_ndarr)

After deleting column
                Impacts Abbrevation  Min_Limit
10             Diabetes           D       50.0
20  High_Blood_Pressure         HBP       80.0
30           Plateletes           P    20000.0
40     Serum_Creatinine          SC        2.0
After pop column
                Impacts Abbrevation
10             Diabetes           D
20  High_Blood_Pressure         HBP
30           Plateletes           P
40     Serum_Creatinine          SC


In [106]:
# Dataframe statistics functions like sum, mean, std etc are built on numpy library.

### Other important libraries are SciPy, scikit-learn, Matplotlib, seaborn, PyTorch, Keras, Scrapy, BeautifulSoup, etc. We will use some of these libraries during the course.