In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

# Standard Python

In [2]:
# List comprehension

# The two expressions below will create the same list (of even numbers)
my_list = []
for i in range(0, 9):
    if i % 2 == 0:
        my_list.append(i)
print('Using a loop:')
print(my_list)

# Now using list comprehension..
my_list = [i for i in range(0, 9) if i % 2 == 0]
print('Using list comprehension:')
print(my_list)

Using a loop:
[0, 2, 4, 6, 8]
Using list comprehension:
[0, 2, 4, 6, 8]


# Pandas

In [3]:
df = pd.DataFrame([['M', 'London', 'Billy Bob', '01-02-1990', 60], 
                   ['F', 'London', 'Carol Chen', '13-06-1987', 45], 
                   ['M', 'London', 'Danny Dyer', '13-09-1989', 80],
                   ['M', 'Edinburgh', 'Ewan Eton', '13-09-1989', 70]], columns=['gender', 'city', 'name', 'dob', 'weight'])


In [4]:
df

Unnamed: 0,gender,city,name,dob,weight
0,M,London,Billy Bob,01-02-1990,60
1,F,London,Carol Chen,13-06-1987,45
2,M,London,Danny Dyer,13-09-1989,80
3,M,Edinburgh,Ewan Eton,13-09-1989,70


In [5]:
# Get the second row
df.iloc[1]

gender             F
city          London
name      Carol Chen
dob       13-06-1987
weight            45
Name: 1, dtype: object

In [6]:
# Select an item from this row
df.iloc[1]['name']

'Carol Chen'

In [7]:
# Select a column
df['name'] # or df.name

0     Billy Bob
1    Carol Chen
2    Danny Dyer
3     Ewan Eton
Name: name, dtype: object

In [8]:
# Convert to numpy array
df.values

array([['M', 'London', 'Billy Bob', '01-02-1990', 60],
       ['F', 'London', 'Carol Chen', '13-06-1987', 45],
       ['M', 'London', 'Danny Dyer', '13-09-1989', 80],
       ['M', 'Edinburgh', 'Ewan Eton', '13-09-1989', 70]], dtype=object)

In [9]:
# Groupby gender and get the mean weight
df.groupby('gender')['weight'].mean().reset_index()

Unnamed: 0,gender,weight
0,F,45
1,M,70


In [10]:
# Groupby gender and get the mean weight, the first name and the min dob
df.groupby('gender').agg({'weight': np.mean,
                         'name': 'first',
                         'dob': 'min'}).reset_index()

Unnamed: 0,gender,weight,name,dob
0,F,45,Carol Chen,13-06-1987
1,M,70,Billy Bob,01-02-1990


In [11]:
# Groupby gender AND city and get the mean weight
df.groupby(['gender', 'city'])['weight'].mean().reset_index()

Unnamed: 0,gender,city,weight
0,F,London,45
1,M,Edinburgh,70
2,M,London,70


In [12]:
# Loop over each row in a pandas dataframe
for index, row in df.iterrows():
    print('This row has the index: %d' % index)
    # Select a column from the row
    print(row['name'])
    
    # Note that I have used "reset_index" to change the pandas series back into a pandas dataframe
    display(row.reset_index())

This row has the index: 0
Billy Bob


Unnamed: 0,index,0
0,gender,M
1,city,London
2,name,Billy Bob
3,dob,01-02-1990
4,weight,60


This row has the index: 1
Carol Chen


Unnamed: 0,index,1
0,gender,F
1,city,London
2,name,Carol Chen
3,dob,13-06-1987
4,weight,45


This row has the index: 2
Danny Dyer


Unnamed: 0,index,2
0,gender,M
1,city,London
2,name,Danny Dyer
3,dob,13-09-1989
4,weight,80


This row has the index: 3
Ewan Eton


Unnamed: 0,index,3
0,gender,M
1,city,Edinburgh
2,name,Ewan Eton
3,dob,13-09-1989
4,weight,70


# Sparse

In [13]:
mat = csr_matrix([[1, 0, 0, 0], 
                  [0, 2, 0, 0],
                  [0, 3, 0, 0],
                  [0, 0, 0, 4]])


In [14]:
# Print the values
print(mat)

  (0, 0)	1
  (1, 1)	2
  (2, 1)	3
  (3, 3)	4


In [15]:
# Get the indices of all non zero elements
# Note how these the same numbers as are in the brackets above.
mat.nonzero()

(array([0, 1, 2, 3], dtype=int32), array([0, 1, 1, 3], dtype=int32))

In [16]:
# Get the value at these indices in the matrix
mat[2, 1]

3

In [17]:
# Update a value in the matrix
mat[0, 3] = 5

  self._set_intXint(row, col, x.flat[0])


In [18]:
print(mat)

  (0, 0)	1
  (0, 3)	5
  (1, 1)	2
  (2, 1)	3
  (3, 3)	4


In [19]:
# Look at the matrix as a numpy
mat.todense()

matrix([[1, 0, 0, 5],
        [0, 2, 0, 0],
        [0, 3, 0, 0],
        [0, 0, 0, 4]], dtype=int64)

In [27]:
# Looping over all the non_zero elements of a sparse matrix
row_indices = mat.nonzero()[0]
column_indices = mat.nonzero()[1]


print(row_indices)
print(column_indices)
print('------')

for i in range(0, len(row_indices)):
    row_index = row_indices[i]
    column_index = column_indices[i]
    
    print('(%d, %d)' % (row_index, column_index))
    print(mat[row_index, column_index])

[0 0 1 2 3]
[0 3 1 1 3]
------
(0, 0)
1
(0, 3)
5
(1, 1)
2
(2, 1)
3
(3, 3)
4


In [29]:
# Looping over ALL the elements of sparse matrix (inluding the zero elements)
for row_index in range(0, mat.shape[0]): # Loop over the rows
    for col_index in range(0, mat.shape[1]): # Loop over the columns
        print(mat[row_index, col_index])

1
0
0
5
0
2
0
0
0
3
0
0
0
0
0
4
