Open this notebook in binder
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/yungchidanielcho/python-datascience-ed/master?labpath=corning_merck_workshop%2Fnotebooks%2Fdata_manipulation.ipynb)

# Readability counts

- code is read more often than it is written
- we read code when we
    - fix bugs
    - add features
    - write tests
    - use other people's functions
- we write for
    - our future selves
    - our peers
    - our future peers
    - our users
    - our successors
- easy-to-read code contributes to Python's success
    - easier to learn
    - easier to add new features
    - easier to spot mistakes
    - easier for managers to hire for

Python's effort to improve readability

https://peps.python.org/pep-0008/

Tools to check and improve format
- checking tool: flake8
- formatting tool: black

Zen of Python

https://peps.python.org/pep-0020/


# Text file

In [None]:
# Every entry ends with its own "\n".
list_of_lines = [
    "hi\n",
    "how are you?\n",
]

In [None]:
# A single string can span several lines by embedding "\n".
block_of_text = (
    "I'm a paragraph\n"
    "Second paragraph."
)
print(block_of_text)

In [None]:
# Write the list of lines first, then the text block, into the same file.
with open("text.txt", "w") as out_file:
    out_file.writelines(list_of_lines)  # writelines() adds no separators itself
    out_file.write(block_of_text)

# Read the whole file back as one string.
with open("text.txt", "r") as in_file:
    read_text = in_file.read()

print(read_text)

In [None]:
# readlines() returns the file content as a list with one string per line.
with open("text.txt", "r") as in_file:
    read_text_in_list = in_file.readlines()
read_text_in_list

# Regular expression
Regular expressions capture patterns in strings.

For example:
- phone numbers
- address parsing
- words that contain capital letters

Online tester example: https://regexr.com/

In [None]:
import re

p = re.compile(r'\d+')  # one or more consecutive digits
p.findall(
    '12 drummers drumming, 11 pipers piping, 10 lords a-leaping'
)

## JSON

In [None]:
import json

# A small dict holding a bool and an int, used for the JSON round trip.
value = dict(a=True, b=3)
value

In [None]:
# Serialize the dict into a JSON-formatted string.
encoded = json.dumps(obj=value)
encoded

In [None]:
# Parse the JSON string back into Python objects.
decoded = json.loads(s=encoded)
decoded

In [None]:
# Compare the original dict with the result of the JSON round trip.
value == decoded

## NumPy
* Written in C, exposed through Python
* Extremely fast and memory efficient for numeric processing
* Primary feature is the ndarray - a machine-typed n-dimensional array

In [None]:
import numpy as np

# Build a one-dimensional ndarray from a Python list.
x = np.array(
    [1, 2, 3, 4]
)
x

In [None]:
# Aggregations reduce the array to a single value.
print(x.sum(), x.mean())

In [None]:
# Arithmetic and comparisons on an ndarray are applied element by element.
print(
    2 * x,
    2 + x,
    x > 2,
)

In [None]:
# Two arrays of equal length; "*" multiplies matching positions.
x, y = np.array([1, 2, 3]), np.array([4, 5, 6])

x * y

In [None]:
# Dot product of the two 1-D arrays (sum of the element-wise products).
x @ y

In [None]:
# Advanced indexing with integer or boolean lists / arrays.
# A boolean mask needs exactly one entry per element of x; x was rebound to a
# three-element array above, so the mask has three entries.
print(
    x[[1, 2]],
    x[[False, True, True]],
    x[x > 2]
)

In [None]:
# Multidimensional array: a list of rows becomes a 2-D ndarray.
mx = np.array([
    [0, 1, 2, 3, 4],
    [5, 6, 7, 8, 9],
])
mx.shape

In [None]:
# Array slicing works for each dimension; every result is printed on its own line.
print(
    mx[0, 0],     # single element: first row, first column
    mx[:, 0],     # first column of every row
    mx[:, 1:3],   # columns 1 and 2
    mx[:, 1:],    # every column from 1 onwards
    mx[:, :3],    # the first three columns
    sep="\n",
)

In [None]:
# np.arange() works like the Python builtin range(); reshape() then lays the
# ten values out as 2 rows by 5 columns.
np.arange(10).reshape(2, 5)

In [None]:
# A 2 x 5 array filled with zeros.
np.zeros(shape=(2, 5))

In [None]:
# A 2 x 5 array filled with ones.
np.ones(shape=(2, 5))

## Pandas
* Also written in C and exposed in Python
* Extends NumPy
* Data frames / series, transformations, and file handling

In [None]:
!pip install palmerpenguins pandas matplotlib seaborn

In [None]:
import pandas as pd

# Valid URL schemes include http, ftp, s3, gs, and file.
# NOTE(review): both paths below look like placeholders - point them at real
# files before running this cell.
# The second assignment rebinds df, so only the Excel result is kept.
df = pd.read_csv('file://localhost/home/name/work/penguins.csv')
df = pd.read_excel('file://localhost/home/name/work/penguins.xls')

In [None]:
# Base path without extension; 'path/to' is a placeholder directory that must
# exist before this cell is run.
path = 'path/to/file'

# Give each format its own file name: writing both to the same path would let
# the Excel output overwrite the CSV, and to_excel() needs a file extension
# to choose its writer engine.
df.to_csv(path + '.csv')
df.to_excel(path + '.xlsx')

In [None]:
from palmerpenguins import load_penguins
# Rebind df to the penguins dataset returned by load_penguins(); the cells
# below read columns such as 'species', 'bill_length_mm' and 'bill_depth_mm'.
df = load_penguins()
df

In [None]:
# A single column is returned as a Series, which behaves much like a NumPy array.
# DataFrame[...] with a column label selects along the columns.
df["bill_length_mm"]

In [None]:
# Summary statistics for one column.
df["bill_length_mm"].describe()

In [None]:
# Comparing a column with a scalar gives a boolean Series, one entry per row.
df["species"] == "Adelie"

In [None]:
# Use .loc to select rows and columns together: .loc[row_selector, column_selector].
is_adelie = df['species'] == 'Adelie'
df.loc[is_adelie, 'bill_length_mm'].describe()

In [None]:
# Add a derived column: bill length minus bill depth, computed row by row.
df["bill_diff"] = df["bill_length_mm"] - df["bill_depth_mm"]
df

In [None]:
# Mean bill length and depth per species.
df.groupby('species')[['bill_length_mm', 'bill_depth_mm']].mean()

##  matplotlib

* Python's dominant 2D and 3D visualization platform
* Flexible
* Fairly accessible to new users, extremely powerful for pro users

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Histogram of bill lengths over all penguins.
plt.hist(x=df["bill_length_mm"])
plt.show()

In [None]:
# The .cat accessor exposes the category labels (.categories) and the integer
# code of each row (.codes).
species = df['species'].astype('category').cat

# One histogram per species, each shown as its own figure.
for name in species.categories:
    in_species = df['species'] == name
    plt.title(name)
    plt.hist(df.loc[in_species, 'bill_length_mm'])
    plt.show()

In [None]:
# Bill length against bill depth for every penguin.
plt.scatter(
    x=df["bill_length_mm"],
    y=df["bill_depth_mm"],
)
plt.show()

In [None]:
plt.title('Bill Shape')
plt.xlabel('Bill length (mm)')
plt.ylabel('Bill depth (mm)')

# Colour each point by the integer category code of its species.
scatter = plt.scatter(
    df['bill_length_mm'],
    df['bill_depth_mm'],
    c=species.codes
)

# Empty line plots serve as legend entries: each one takes the colour the
# scatter's colormap assigns to category code i.
for i, label in enumerate(species.categories):
    plt.plot([], color=scatter.cmap(scatter.norm(i)), label=label)

plt.legend()
# Pass the on/off flag positionally: the `b` keyword was renamed to `visible`
# in Matplotlib 3.5 and is rejected by later releases.
plt.grid(True, which='both', color='grey', linestyle='--')
plt.axis('equal')

plt.show()
plt.close()

In [None]:
plt.title('Bill Shape')
plt.xlabel('Bill length (mm)')
plt.ylabel('Bill depth (mm)')

# One colour per species, matched by position in species.categories.
colors = ['b', 'g', 'r']

# Draw each species as its own scatter so the legend gets one entry per
# species. The notebook-level `species` accessor is only read here, never
# rebound, so earlier cells that use it can still be re-run.
for i, label in enumerate(species.categories):
    filtered_df = df[df['species'] == label]
    plt.scatter(
        filtered_df['bill_length_mm'],
        filtered_df['bill_depth_mm'],
        c=colors[i],
        label=label
    )

plt.legend()
# Pass the on/off flag positionally: the `b` keyword was renamed to `visible`
# in Matplotlib 3.5 and is rejected by later releases.
plt.grid(True, which='both', color='grey', linestyle='--')
plt.axis('equal')

plt.show()
plt.close()

In [None]:
# Seaborn

In [None]:
import seaborn as sns
# Seaborn histogram of the bill-length column, selected by name from the DataFrame.
sns.histplot(data=df, x='bill_length_mm')