### Pandas basics
This notebook demonstrates basic pandas operations.

First, check that your virtual environment is in your `sys.path`; if it is not, add the virtual environment path to `PYTHONPATH`.

In [44]:
# Display the interpreter's module search path so it can be checked for the
# virtual environment's site-packages directory.
import sys
sys.path

['/home/yuan/git_repos/advanced_python/notebooks/pandas',
 '',
 '/home/yuan/Documents/py3_env/lib/python3.8/site-packages',
 '/usr/lib/python38.zip',
 '/usr/lib/python3.8',
 '/usr/lib/python3.8/lib-dynload',
 '/home/yuan/.local/lib/python3.8/site-packages',
 '/usr/local/lib/python3.8/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/home/yuan/Documents/py3_env/lib/python3.8/site-packages/IPython/extensions',
 '/home/yuan/.ipython']

In [45]:
import pandas as pd
import json
import numpy as np

#### create pandas dataframe
* from a list
* from a list of dictionaries
* convert pandas dataframe to json
* convert pandas dataframe to python list
* construct a dataframe from numpy arrays

In [46]:
# Build a DataFrame from a list of rows. The rows carry no labels, so the
# column names are passed separately through the `columns` argument.
a_list = [
    [1, "Joe"],
    [2, "Alice"],
    [3, "Irina"],
    [4, "Ben"],
]
list_df = pd.DataFrame(a_list, columns=['site_position', 'first_name'])
list_df

Unnamed: 0,site_position,first_name
0,1,Joe
1,2,Alice
2,3,Irina
3,4,Ben


In [47]:
# Build a DataFrame from a list of dictionaries: each dictionary is one row,
# and its keys supply the column names, so no `columns` argument is needed.
dict_list = [
    {'site_position': position, 'first_name': name}
    for position, name in [(1, 'Joe'), (2, 'Alice'), (3, 'Irina'), (4, 'Ben')]
]
dict_df = pd.DataFrame(data=dict_list)
dict_df

Unnamed: 0,site_position,first_name
0,1,Joe
1,2,Alice
2,3,Irina
3,4,Ben


In [48]:
# Serialise the DataFrame to a JSON string.
# With orient='records' the string is a JSON array holding one object per
# row, keyed by column name.
json_str = dict_df.to_json(
    orient='records',
)
json_str

'[{"site_position":1,"first_name":"Joe"},{"site_position":2,"first_name":"Alice"},{"site_position":3,"first_name":"Irina"},{"site_position":4,"first_name":"Ben"}]'

In [49]:
# Convert the DataFrame to a plain nested list (one inner list per row).
# Column names are not part of the result; to rebuild a DataFrame from this
# list, pass them again through the `columns` argument.
df_list = dict_df.to_numpy().tolist()
df_list

[[1, 'Joe'], [2, 'Alice'], [3, 'Irina'], [4, 'Ben']]

In [50]:
# Convert the DataFrame to a list of dictionaries (one per row) by
# round-tripping through JSON, so the column names are kept as keys.
json_list = json.loads(
    dict_df.to_json(orient='records')
)
json_list

[{'site_position': 1, 'first_name': 'Joe'},
 {'site_position': 2, 'first_name': 'Alice'},
 {'site_position': 3, 'first_name': 'Irina'},
 {'site_position': 4, 'first_name': 'Ben'}]

### Transformation
* add rows to a pandas dataframe
* select rows whose value in a column is neither null nor an empty string

In [51]:
# Three ways to add rows to a DataFrame.

# 1. Assign a new row in place with .loc, using the next integer label.
#    NOTE: this mutates dict_df, so re-running the cell adds another row.
dict_df.loc[len(dict_df.index)] = [5, "Alex"]

# 2. Append a record without modifying dict_df.
#    DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
#    always returned a new frame rather than changing the caller, so its
#    result has to be kept. pd.concat is the supported replacement.
appended_df = pd.concat(
    [dict_df, pd.DataFrame([{'site_position': 6, 'first_name': "Jennifer"}])],
    ignore_index=True,
)

# 3. Concatenate whole DataFrames.
#    The column data is passed inline instead of through a variable called
#    `dict`, which would shadow the builtin for the rest of the session.
df1 = pd.DataFrame({'name': ['Joe', 'Tim', 'Rob', 'Georgia'],
                    'site_location': [1, 2, 3, 4]})

df2 = pd.DataFrame({'name': ['Amy', 'Maddy'],
                    'site_location': [5, 6]})

# ignore_index=True renumbers the combined rows 0..n-1.
df3 = pd.concat([df1, df2], ignore_index=True)
df3

Unnamed: 0,name,site_location
0,Joe,1
1,Tim,2
2,Rob,3
3,Georgia,4
4,Amy,5
5,Maddy,6


#### Construct from multiple dimension np array using hstack

In [52]:
# Join two 2-D arrays side by side (column-wise) with hstack and wrap the
# combined array in a DataFrame.
a1 = np.array([[1, 2, 3],
               [1, 2, 3]])
a2 = np.array([[4, 5, 6],
               [4, 5, 6]])
print("a1 shape", a1.shape)
print("a2 shape", a2.shape)

# hstack keeps the row count and adds up the column counts.
array_df = pd.DataFrame(data=np.hstack((a1, a2)))
print("result df shape by hstack", array_df.shape)
array_df

a1 shape (2, 3)
a2 shape (2, 3)
result df shape by hstack (2, 6)


Unnamed: 0,0,1,2,3,4,5
0,1,2,3,4,5,6
1,1,2,3,4,5,6


#### Construct from one dimension np array using hstack

In [53]:
# Each 1-D array is first turned into a single column of shape (n, 1);
# stacking those columns horizontally makes every input array one
# DataFrame column.
a1 = np.array([1, 2, 3])
a2 = np.array([4, 5, 6])
array_df = pd.DataFrame(
    np.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
)
array_df

Unnamed: 0,0,1
0,1,4
1,2,5
2,3,6


In [54]:
# Prepare data for the row-selection example: extend df3 with two rows whose
# `name` is missing (None) and blank ("") respectively.
df_new = pd.DataFrame(
    data={'name': [None, ""], 'site_location': [7, 8]},
)
df4 = pd.concat(objs=[df3, df_new], ignore_index=True)
df4

Unnamed: 0,name,site_location
0,Joe,1
1,Tim,2
2,Rob,3
3,Georgia,4
4,Amy,5
5,Maddy,6
6,,7
7,,8


In [55]:
# Boolean mask: True where `name` is neither null nor an empty string.
df4['name'].notna() & df4['name'].ne("")

0     True
1     True
2     True
3     True
4     True
5     True
6    False
7    False
Name: name, dtype: bool